diff -Nru x265-1.5/debian/changelog x265-1.6/debian/changelog --- x265-1.5/debian/changelog 2015-02-18 01:02:59.000000000 +0000 +++ x265-1.6/debian/changelog 2015-05-08 14:29:46.000000000 +0000 @@ -1,8 +1,26 @@ +x265 (1.6-1~ppa1) vivid; urgency=low + + * Backported from Debian unstable. + + -- Nate Muench Fri, 08 May 2015 09:29:27 -0500 + +x265 (1.6-1) unstable; urgency=medium + + * New upstream release. + * SONAME bump: libx265-43 -> libx265-51. + * debian/patches: + - compile-flags.patch: Refreshed. + - workaround-partial-SIMD-support.patch, hide-internal-symbol.patch: + Removed, no longer needed. + * debian/control: Add libnuma-dev to B-D to build with libnuma support. + + -- Sebastian Ramacher Tue, 07 Apr 2015 23:09:39 +0200 + x265 (1.5-1) unstable; urgency=medium * New upstream release. - Fix bit depth check in CLI. (Closes: #775181) - * SONAME ump: libx265-35 -> libx265-43. + * SONAME bump: libx265-35 -> libx265-43. * debian/patches: - atomic.patch: Removed, no longer needed. - compile-flags.patch, workaround-partial-SIMD-support.patch: Refreshed. diff -Nru x265-1.5/debian/control x265-1.6/debian/control --- x265-1.5/debian/control 2015-02-18 00:34:19.000000000 +0000 +++ x265-1.6/debian/control 2015-04-07 21:03:33.000000000 +0000 @@ -6,6 +6,7 @@ Build-Depends: debhelper (>= 9), cmake, + libnuma-dev [amd64 arm64 i386 mips mipsel powerpc ppc64el], yasm (>= 1.2) [any-amd64] Build-Depends-Indep: python-sphinx @@ -42,7 +43,7 @@ Multi-Arch: same Depends: ${misc:Depends}, - libx265-43 (= ${binary:Version}) + libx265-51 (= ${binary:Version}) Suggests: libx265-doc Description: H.265/HEVC video stream encoder (development files) @@ -52,7 +53,7 @@ This is the development package which contains headers and libraries for libx265. 
-Package: libx265-43 +Package: libx265-51 Section: libs Architecture: any Multi-Arch: same @@ -60,21 +61,21 @@ ${misc:Depends}, ${shlibs:Depends} Suggests: - libx265-43-dbg + libx265-51-dbg Description: H.265/HEVC video stream encoder (shared library) libx265 is an encoding library for creating H.265/High Efficency Video Coding (HEVC) video streams. . This package contains the shared library. -Package: libx265-43-dbg +Package: libx265-51-dbg Priority: extra Section: debug Architecture: any Multi-Arch: same Depends: ${misc:Depends}, - libx265-43 (= ${binary:Version}) + libx265-51 (= ${binary:Version}) Description: H.265/HEVC video stream encoder (debugging symbols) libx265 is an encoding library for creating H.265/High Efficency Video Coding (HEVC) video streams. diff -Nru x265-1.5/debian/libx265-43.install x265-1.6/debian/libx265-43.install --- x265-1.5/debian/libx265-43.install 2015-02-18 00:34:19.000000000 +0000 +++ x265-1.6/debian/libx265-43.install 1970-01-01 00:00:00.000000000 +0000 @@ -1,2 +0,0 @@ -usr/lib/*/libx265.so.* -usr/lib/*/x265-10bit/libx265.so.* diff -Nru x265-1.5/debian/libx265-43.symbols x265-1.6/debian/libx265-43.symbols --- x265-1.5/debian/libx265-43.symbols 2015-02-18 00:34:19.000000000 +0000 +++ x265-1.6/debian/libx265-43.symbols 1970-01-01 00:00:00.000000000 +0000 @@ -1,24 +0,0 @@ -libx265.so.43 libx265-43 #MINVER# - x265_setup_primitives@Base 1.4 - x265_param_alloc@Base 1.4 - x265_param_free@Base 1.4 - x265_param_default@Base 1.4 - x265_param_parse@Base 1.4 - x265_param_apply_profile@Base 1.4 - x265_param_default_preset@Base 1.4 - x265_picture_alloc@Base 1.4 - x265_picture_free@Base 1.4 - x265_picture_init@Base 1.4 - x265_max_bit_depth@Base 1.4 - x265_version_str@Base 1.4 - x265_build_info_str@Base 1.4 - x265_encoder_open_43@Base 1.5 - x265_encoder_parameters@Base 1.4 - x265_encoder_headers@Base 1.4 - x265_encoder_encode@Base 1.4 - x265_encoder_get_stats@Base 1.4 - x265_encoder_log@Base 1.4 - x265_encoder_close@Base 1.4 - x265_cleanup@Base 
1.4 - (regex|optional)"^x265_.*@Base$" 1.5 - (regex|c++|optional)"^.*@Base$" 1.5 diff -Nru x265-1.5/debian/libx265-51.install x265-1.6/debian/libx265-51.install --- x265-1.5/debian/libx265-51.install 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/debian/libx265-51.install 2015-02-18 00:34:19.000000000 +0000 @@ -0,0 +1,2 @@ +usr/lib/*/libx265.so.* +usr/lib/*/x265-10bit/libx265.so.* diff -Nru x265-1.5/debian/libx265-51.symbols x265-1.6/debian/libx265-51.symbols --- x265-1.5/debian/libx265-51.symbols 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/debian/libx265-51.symbols 2015-04-07 20:54:37.000000000 +0000 @@ -0,0 +1,25 @@ +libx265.so.51 libx265-51 #MINVER# +| libx265-51 (>= 1.6), libx265-51 (<< 1.7) + x265_api_get_51@Base 1.6 + x265_build_info_str@Base 1.4 + x265_cleanup@Base 1.4 + x265_encoder_close@Base 1.4 + x265_encoder_encode@Base 1.4 + x265_encoder_get_stats@Base 1.4 + x265_encoder_headers@Base 1.4 + x265_encoder_log@Base 1.4 + x265_encoder_open_51@Base 1.6 + x265_encoder_parameters@Base 1.4 + x265_max_bit_depth@Base 1.4 + x265_param_alloc@Base 1.4 + x265_param_apply_profile@Base 1.4 + x265_param_default@Base 1.4 + x265_param_default_preset@Base 1.4 + x265_param_free@Base 1.4 + x265_param_parse@Base 1.4 + x265_picture_alloc@Base 1.4 + x265_picture_free@Base 1.4 + x265_picture_init@Base 1.4 + x265_version_str@Base 1.4 + (regex|optional)"^x265_.*@Base$" 0 1 + (regex|c++|optional)"^.*@Base$" 0 1 diff -Nru x265-1.5/debian/patches/compile-flags.patch x265-1.6/debian/patches/compile-flags.patch --- x265-1.5/debian/patches/compile-flags.patch 2015-02-18 00:58:09.000000000 +0000 +++ x265-1.6/debian/patches/compile-flags.patch 2015-04-07 20:40:44.000000000 +0000 @@ -5,16 +5,15 @@ --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt -@@ -130,12 +130,6 @@ - if(ENABLE_PIC) - add_definitions(-fPIC) - endif(ENABLE_PIC) -- if(X86 AND NOT X64) +@@ -152,11 +152,6 @@ + else() + add_definitions(-march=native) + endif() +- elseif(X86 AND NOT X64) - 
add_definitions(-march=i686) - endif() - if(ARM) - add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) -- endif() - check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) - check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) - if (CC_HAS_NO_ARRAY_BOUNDS) + endif() + if(FPROFILE_GENERATE) + if(INTEL_CXX) diff -Nru x265-1.5/debian/patches/hide-internal-symbols.patch x265-1.6/debian/patches/hide-internal-symbols.patch --- x265-1.5/debian/patches/hide-internal-symbols.patch 2015-02-18 01:02:36.000000000 +0000 +++ x265-1.6/debian/patches/hide-internal-symbols.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,18 +0,0 @@ -Description: Hide internal symbols -Author: Sebastian Ramacher -Bug: https://bitbucket.org/multicoreware/x265/issue/105/defaultanalysisfilename-in-source-encoder -Last-Update: 2015-02-18 - -diff --git a/source/encoder/encoder.cpp b/source/encoder/encoder.cpp -index 45ad5b1..38a0c76 100644 ---- a/source/encoder/encoder.cpp -+++ b/source/encoder/encoder.cpp -@@ -51,7 +51,7 @@ static const char *summaryCSVHeader = - "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), " - "Version\n"; - --const char* defaultAnalysisFileName = "x265_analysis.dat"; -+static const char* defaultAnalysisFileName = "x265_analysis.dat"; - - using namespace x265; - diff -Nru x265-1.5/debian/patches/series x265-1.6/debian/patches/series --- x265-1.5/debian/patches/series 2015-02-18 01:02:36.000000000 +0000 +++ x265-1.6/debian/patches/series 2015-04-07 20:43:28.000000000 +0000 @@ -1,3 +1 @@ compile-flags.patch -workaround-partial-SIMD-support.patch -hide-internal-symbols.patch diff -Nru x265-1.5/debian/patches/workaround-partial-SIMD-support.patch x265-1.6/debian/patches/workaround-partial-SIMD-support.patch --- x265-1.5/debian/patches/workaround-partial-SIMD-support.patch 2015-02-18 01:02:36.000000000 +0000 +++ x265-1.6/debian/patches/workaround-partial-SIMD-support.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,23 +0,0 @@ -Description: 
remove buggy workarounds for partial SIMD support - In the past, there were a number of primitives written in SIMD intrinsics that - could work without compiling with YASM. Most of those are now gone, and we - generally require YASM for SIMD support. This commit remoes support for using - the few remaining SIMD intrinsics without having YASM to provide - implementations of x265_emms(), x265_cpu_cpuid(), etc. Fixing a bug in the - process. -Origin: upstream, - https://bitbucket.org/multicoreware/x265/commits/d7b5e73fc91ab20f86c36d5c1924632c1bcf3e68 -Bug: https://bitbucket.org/multicoreware/x265/issue/92/fails-to-build-on-x86-32-without-yasm -Last-Update: 2015-02-18 - ---- a/source/common/common.h -+++ b/source/common/common.h -@@ -74,7 +74,7 @@ - #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16))) - #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32))) - --#if X265_ARCH_X86 && !defined(X86_64) -+#if ENABLE_ASSEMBLY && X265_ARCH_X86 && !defined(X86_64) - extern "C" intptr_t x265_stack_align(void (*func)(), ...); - #define x265_stack_align(func, ...) 
x265_stack_align((void (*)())func, __VA_ARGS__) - #else diff -Nru x265-1.5/debian/rules x265-1.6/debian/rules --- x265-1.5/debian/rules 2015-02-18 01:02:36.000000000 +0000 +++ x265-1.6/debian/rules 2015-04-07 20:45:31.000000000 +0000 @@ -45,5 +45,5 @@ dh_sphinxdoc override_dh_strip: - dh_strip -plibx265-43 --dbg-package=libx265-43-dbg + dh_strip -plibx265-51 --dbg-package=libx265-51-dbg dh_strip --remaining-packages diff -Nru x265-1.5/debian/watch x265-1.6/debian/watch --- x265-1.5/debian/watch 2014-11-28 02:50:07.000000000 +0000 +++ x265-1.6/debian/watch 2015-04-07 20:30:34.000000000 +0000 @@ -1,4 +1,3 @@ version=3 -opts="filenamemangle=s/.*\/(\d.*)\.tar\.bz2/x265-$1.tar.bz2/" \ -https://bitbucket.org/multicoreware/x265/downloads \ - .*/get/(\d.*).tar.bz2 +http://ftp.videolan.org/pub/videolan/x265/ \ + x265_(\d.*).tar.gz diff -Nru x265-1.5/doc/reST/api.rst x265-1.6/doc/reST/api.rst --- x265-1.5/doc/reST/api.rst 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/doc/reST/api.rst 2015-04-02 16:46:36.000000000 +0000 @@ -72,11 +72,13 @@ process. All of the encoders must use the same maximum CTU size because many global variables are configured based on this size. Encoder allocation will fail if a mis-matched CTU size is attempted. + If no encoders are open, **x265_cleanup()** can be called to reset + the configured CTU size so a new size can be used. 
An encoder is allocated by calling **x265_encoder_open()**:: /* x265_encoder_open: - * create a new encoder handler, all parameters from x265_param are copied */ + * create a new encoder handler, all parameters from x265_param are copied */ x265_encoder* x265_encoder_open(x265_param *); The returned pointer is then passed to all of the functions pertaining @@ -337,10 +339,44 @@ void x265_encoder_close(x265_encoder *); When the application has completed all encodes, it should call -**x265_cleanup()** to free process global resources like the thread pool; -particularly if a memory-leak detection tool is being used:: +**x265_cleanup()** to free process global, particularly if a memory-leak +detection tool is being used. **x265_cleanup()** also resets the saved +CTU size so it will be possible to create a new encoder with a different +CTU size:: - /*** - * Release library static allocations - */ + /* x265_cleanup: + * release library static allocations, reset configured CTU size */ void x265_cleanup(void); + + +Multi-library Interface +======================= + +If your application might want to make a runtime selection between among +a number of libx265 libraries (perhaps 8bpp and 16bpp), then you will +want to use the multi-library interface. + +Instead of directly using all of the **x265_** methods documented +above, you query an x265_api structure from your libx265 and then use +the function pointers within that structure of the same name, but +without the **x265_** prefix. So **x265_param_default()** becomes +**api->param_default()**. The key method is x265_api_get():: + + /* x265_api_get: + * Retrieve the programming interface for a linked x265 library. + * May return NULL if no library is available that supports the + * requested bit depth. 
If bitDepth is 0, the function is guarunteed + * to return a non-NULL x265_api pointer from the system default + * libx265 */ + const x265_api* x265_api_get(int bitDepth); + +The general idea is to request the API for the bitDepth you would prefer +the encoder to use (8 or 10), and if that returns NULL you request the +API for bitDepth=0, which returns the system default libx265. + +Note that using this multi-library API in your application is only the +first step. Next your application must dynamically link to libx265 and +then you must build and install a multi-lib configuration of libx265, +which includes 8bpp and 16bpp builds of libx265 and a shim library which +forwards x265_api_get() calls to the appropriate library using dynamic +loading and binding. diff -Nru x265-1.5/doc/reST/cli.rst x265-1.6/doc/reST/cli.rst --- x265-1.5/doc/reST/cli.rst 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/doc/reST/cli.rst 2015-04-02 16:46:36.000000000 +0000 @@ -171,19 +171,54 @@ Over-allocation of frame threads will not improve performance, it will generally just increase memory use. -.. option:: --threads + **Values:** any value between 8 and 16. Default is 0, auto-detect - Number of threads to allocate for the worker thread pool This pool - is used for WPP and for distributed analysis and motion search: - :option:`--wpp` :option:`--pmode` and :option:`--pme` respectively. - - If :option:`--threads` 1 is specified, then no thread pool is - created. When no thread pool is created, all the thread pool - features are implicitly disabled. If all the pool features are - disabled by the user, then the pool is implicitly disabled. +.. option:: --pools , --numa-pools - Default 0, one thread is allocated per detected hardware thread - (logical CPU cores) + Comma seperated list of threads per NUMA node. If "none", then no worker + pools are created and only frame parallelism is possible. 
If NULL or "" + (default) x265 will use all available threads on each NUMA node:: + + '+' is a special value indicating all cores detected on the node + '*' is a special value indicating all cores detected on the node and all remaining nodes + '-' is a special value indicating no cores on the node, same as '0' + + example strings for a 4-node system:: + + "" - default, unspecified, all numa nodes are used for thread pools + "*" - same as default + "none" - no thread pools are created, only frame parallelism possible + "-" - same as "none" + "10" - allocate one pool, using up to 10 cores on node 0 + "-,+" - allocate one pool, using all cores on node 1 + "+,-,+" - allocate two pools, using all cores on nodes 0 and 2 + "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2 + "-,*" - allocate three pools, using all cores on nodes 1, 2 and 3 + "8,8,8,8" - allocate four pools with up to 8 threads in each pool + + The total number of threads will be determined by the number of threads + assigned to all nodes. The worker threads will each be given affinity for + their node, they will not be allowed to migrate between nodes, but they + will be allowed to move between CPU cores within their node. + + If the three pool features: :option:`--wpp` :option:`--pmode` and + :option:`--pme` are all disabled, then :option:`--pools` is ignored + and no thread pools are created. + + If "none" is specified, then all three of the thread pool features are + implicitly disabled. + + Multiple thread pools will be allocated for any NUMA node with more than + 64 logical CPU cores. But any given thread pool will always use at most + one NUMA node. + + Frame encoders are distributed between the available thread pools, + and the encoder will never generate more thread pools than + :option:`--frame-threads`. The pools are used for WPP and for + distributed analysis and motion search. 
+ + Default "", one thread is allocated per detected hardware thread + (logical CPU cores) and one thread pool per NUMA node. .. option:: --wpp, --no-wpp @@ -409,7 +444,30 @@ If :option:`--level-idc` has been specified, the option adds the intention to support the High tier of that level. If your specified level does not support a High tier, a warning is issued and this - modifier flag is ignored. + modifier flag is ignored. If :option:`--level-idc` has been specified, + but not --high-tier, then the encoder will attempt to encode at the + specified level, main tier first, turning on high tier only if + necessary and available at that level. + +.. option:: --ref <1..16> + + Max number of L0 references to be allowed. This number has a linear + multiplier effect on the amount of work performed in motion search, + but will generally have a beneficial affect on compression and + distortion. + + Note that x265 allows up to 16 L0 references but the HEVC + specification only allows a maximum of 8 total reference frames. So + if you have B frames enabled only 7 L0 refs are valid and if you + have :option:`--b-pyramid` enabled (which is enabled by default in + all presets), then only 6 L0 refs are the maximum allowed by the + HEVC specification. If x265 detects that the total reference count + is greater than 8, it will issue a warning that the resulting stream + is non-compliant and it signals the stream as profile NONE and level + NONE but still allows the encode to continue. Compliant HEVC + decoders may refuse to decode such streams. + + Default 3 .. 
note:: :option:`--profile`, :option:`--level-idc`, and @@ -444,7 +502,7 @@ +-------+---------------------------------------------------------------+ | 3 | RDO mode and split decisions, chroma residual used for sa8d | +-------+---------------------------------------------------------------+ - | 4 | Adds RDO Quant | + | 4 | Currently same as 3 | +-------+---------------------------------------------------------------+ | 5 | Adds RDO prediction decisions | +-------+---------------------------------------------------------------+ @@ -465,6 +523,23 @@ and less frame parallelism as well. Because of this the faster presets use a CU size of 32. Default: 64 +.. option:: --min-cu-size <64|32|16|8> + + Minimum CU size (width and height). By using 16 or 32 the encoder + will not analyze the cost of CUs below that minimum threshold, + saving considerable amounts of compute with a predictable increase + in bitrate. This setting has a large effect on performance on the + faster presets. + + Default: 8 (minimum 8x8 CU for HEVC, best compression efficiency) + +.. note:: + + All encoders within a single process must use the same settings for + the CU size range. :option:`--ctu` and :option:`--min-cu-size` must + be consistent for all of them since the encoder configures several + key global data structures based on this range. + .. option:: --rect, --no-rect Enable analysis of rectangular motion partitions Nx2N and 2NxN @@ -494,14 +569,6 @@ Measure full CU size (2Nx2N) merge candidates first; if no residual is found the analysis is short circuited. Default disabled -.. option:: --fast-cbf, --no-fast-cbf - - Short circuit analysis if a prediction is found that does not set - the coded block flag (aka: no residual was encoded). It prevents - the encoder from perhaps finding other predictions that also have no - residual but require less signaling bits or have less distortion. - Only applicable for RD levels 5 and 6. Default disabled - .. 
option:: --fast-intra, --no-fast-intra Perform an initial scan of every fifth intra angular mode, then @@ -526,14 +593,6 @@ Only effective at RD levels 3 and above, which perform RDO mode decisions. -.. option:: --tskip, --no-tskip - - Enable evaluation of transform skip (bypass DCT but still use - quantization) coding for 4x4 TU coded blocks. - - Only effective at RD levels 3 and above, which perform RDO mode - decisions. Default disabled - .. option:: --tskip-fast, --no-tskip-fast Only evaluate transform skip for NxN intra predictions (4x4 blocks). @@ -567,6 +626,30 @@ Options which affect the transform unit quad-tree, sometimes referred to as the residual quad-tree (RQT). +.. option:: --rdoq-level <0|1|2>, --no-rdoq-level + + Specify the amount of rate-distortion analysis to use within + quantization:: + + At level 0 rate-distortion cost is not considered in quant + + At level 1 rate-distortion cost is used to find optimal rounding + values for each level (and allows psy-rdoq to be effective). It + trades-off the signaling cost of the coefficient vs its post-inverse + quant distortion from the pre-quant coefficient. When + :option:`--psy-rdoq` is enabled, this formula is biased in favor of + more energy in the residual (larger coefficient absolute levels) + + At level 2 rate-distortion cost is used to make decimate decisions + on each 4x4 coding group, including the cost of signaling the group + within the group bitmap. If the total distortion of not signaling + the entire coding group is less than the rate cost, the block is + decimated. Next, it applies rate-distortion cost analysis to the + last non-zero coefficient, which can result in many (or all) of the + coding groups being decimated. Psy-rdoq is less effective at + preserving energy when RDOQ is at level 2, since it only has + influence over the level distortion costs. + .. 
option:: --tu-intra-depth <1..4> The transform unit (residual) quad-tree begins with the same depth @@ -593,9 +676,76 @@ partitions, in which case a TU split is implied and thus the residual quad-tree begins one layer below the CU quad-tree. +.. option:: --nr-intra , --nr-inter + + Noise reduction - an adaptive deadzone applied after DCT + (subtracting from DCT coefficients), before quantization. It does + no pixel-level filtering, doesn't cross DCT block boundaries, has no + overlap, The higher the strength value parameter, the more + aggressively it will reduce noise. + + Enabling noise reduction will make outputs diverge between different + numbers of frame threads. Outputs will be deterministic but the + outputs of -F2 will no longer match the outputs of -F3, etc. + + **Values:** any value in range of 0 to 2000. Default 0 (disabled). + +.. option:: --tskip, --no-tskip + + Enable evaluation of transform skip (bypass DCT but still use + quantization) coding for 4x4 TU coded blocks. + + Only effective at RD levels 3 and above, which perform RDO mode + decisions. Default disabled + +.. option:: --rdpenalty <0..2> + + When set to 1, transform units of size 32x32 are given a 4x bit cost + penalty compared to smaller transform units, in intra coded CUs in P + or B slices. + + When set to 2, transform units of size 32x32 are not even attempted, + unless otherwise required by the maximum recursion depth. For this + option to be effective with 32x32 intra CUs, + :option:`--tu-intra-depth` must be at least 2. For it to be + effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be + at least 3. + + Note that in HEVC an intra transform unit (a block of the residual + quad-tree) is also a prediction unit, meaning that the intra + prediction signal is generated for each TU block, the residual + subtracted and then coded. The coding unit simply provides the + prediction modes that will be used when predicting all of the + transform units within the CU. 
This means that when you prevent + 32x32 intra transform units, you are preventing 32x32 intra + predictions. + + Default 0, disabled. + + **Values:** 0:disabled 1:4x cost penalty 2:force splits + +.. option:: --max-tu-size <32|16|8|4> + + Maximum TU size (width and height). The residual can be more + efficiently compressed by the DCT transform when the max TU size + is larger, but at the expense of more computation. Transform unit + quad-tree begins at the same depth of the coded tree unit, but if the + maximum TU size is smaller than the CU size then transform QT begins + at the depth of the max-tu-size. Default: 32. + Temporal / motion search options ================================ +.. option:: --max-merge <1..5> + + Maximum number of neighbor (spatial and temporal) candidate blocks + that the encoder may consider for merging motion predictions. If a + merge candidate results in no residual, it is immediately selected + as a "skip". Otherwise the merge candidates are tested as part of + motion estimation when searching for the least cost inter option. + The max candidate number is encoded in the SPS and determines the + bit cost of signaling merge CUs. Default 2 + .. option:: --me Motion search method. Generally, the higher the number the harder @@ -658,16 +808,6 @@ **Range of values:** an integer from 0 to 32768 -.. option:: --max-merge <1..5> - - Maximum number of neighbor (spatial and temporal) candidate blocks - that the encoder may consider for merging motion predictions. If a - merge candidate results in no residual, it is immediately selected - as a "skip". Otherwise the merge candidates are tested as part of - motion estimation when searching for the least cost inter option. - The max candidate number is encoded in the SPS and determines the - bit cost of signaling merge CUs. Default 2 - .. option:: --temporal-mvp, --no-temporal-mvp Enable temporal motion vector predictors in P and B slices. 
@@ -704,32 +844,6 @@ propagation of reference errors that may have resulted from lossy signals. Default disabled -.. option:: --rdpenalty <0..2> - - When set to 1, transform units of size 32x32 are given a 4x bit cost - penalty compared to smaller transform units, in intra coded CUs in P - or B slices. - - When set to 2, transform units of size 32x32 are not even attempted, - unless otherwise required by the maximum recursion depth. For this - option to be effective with 32x32 intra CUs, - :option:`--tu-intra-depth` must be at least 2. For it to be - effective with 64x64 intra CUs, :option:`--tu-intra-depth` must be - at least 3. - - Note that in HEVC an intra transform unit (a block of the residual - quad-tree) is also a prediction unit, meaning that the intra - prediction signal is generated for each TU block, the residual - subtracted and then coded. The coding unit simply provides the - prediction modes that will be used when predicting all of the - transform units within the CU. This means that when you prevent - 32x32 intra transform units, you are preventing 32x32 intra - predictions. - - Default 0, disabled. - - **Values:** 0:disabled 1:4x cost penalty 2:force splits - Psycho-visual options ===================== @@ -752,8 +866,8 @@ inter prediction. :option:`--psy-rdoq` will adjust the distortion cost used in -rate-distortion optimized quantization (RDO quant), enabled in -:option:`--rd` 4 and above, favoring the preservation of energy in the +rate-distortion optimized quantization (RDO quant), enabled by +:option:`--rdoq-level` 1 or 2, favoring the preservation of energy in the reconstructed image. :option:`--psy-rdoq` prevents RDOQ from blurring all of the encoding options which psy-rd has to chose from. At low strength levels, psy-rdoq will influence the quantization level @@ -801,9 +915,8 @@ Influence rate distortion optimized quantization by favoring higher energy in the reconstructed image. 
This generally improves perceived visual quality at the cost of lower quality metric scores. It only - has effect on slower presets which use RDO Quantization - (:option:`--rd` 4, 5 and 6). 1.0 is a typical value. High values can - be beneficial in preserving high-frequency detail like film grain. + has effect when :option:`--rdoq-level` is 1 or 2. High values can + be beneficial in preserving high-frequency detail like film grain. Default: 1.0 **Range of values:** 0 .. 50.0 @@ -850,11 +963,36 @@ **Range of values:** Between the maximum consecutive bframe count (:option:`--bframes`) and 250 +.. option:: --lookahead-slices <0..16> + + Use multiple worker threads to measure the estimated cost of each + frame within the lookahead. When :option:`--b-adapt` is 2, most + frame cost estimates will be performed in batch mode, many cost + estimates at the same time, and lookahead-slices is ignored for + batched estimates. The effect on performance can be quite small. + The higher this parameter, the less accurate the frame costs will be + (since context is lost across slice boundaries) which will result in + less accurate B-frame and scene-cut decisions. + + The encoder may internally lower the number of slices to ensure + each slice codes at least 10 16x16 rows of lowres blocks. If slices + are used in lookahead, they are logged in the list of tools as + *lslices*. + + **Values:** 0 - disabled (default). 1 is the same as 0. Max 16 + .. option:: --b-adapt - Adaptive B frame scheduling. Default 2 + Set the level of effort in determining B frame placement. - **Values:** 0:none; 1:fast; 2:full(trellis) + With b-adapt 0, the GOP structure is fixed based on the values of + :option:`--keyint` and :option:`--bframes`. + + With b-adapt 1 a light lookahead is used to choose B frame placement. + + With b-adapt 2 (trellis) a viterbi B path selection is performed + + **Values:** 0:none; 1:fast; 2:full(trellis) **default** .. 
option:: --bframes, -b <0..16> @@ -874,13 +1012,6 @@ Use B-frames as references, when possible. Default enabled -.. option:: --ref <1..16> - - Max number of L0 references to be allowed. This number has a linear - multiplier effect on the amount of work performed in motion search, - but will generally have a beneficial affect on compression and - distortion. Default 3 - Quality, rate control and rate distortion options ================================================= @@ -990,20 +1121,6 @@ less bits. This tends to improve detail in the backgrounds of video with less detail in areas of high motion. Default enabled -.. option:: --nr-intra , --nr-inter - - Noise reduction - an adaptive deadzone applied after DCT - (subtracting from DCT coefficients), before quantization. It does - no pixel-level filtering, doesn't cross DCT block boundaries, has no - overlap, The higher the strength value parameter, the more - aggressively it will reduce noise. - - Enabling noise reduction will make outputs diverge between different - numbers of frame threads. Outputs will be deterministic but the - outputs of -F2 will no longer match the outputs of -F3, etc. - - **Values:** any value in range of 0 to 2000. Default 0 (disabled). - .. option:: --pass Enable multi-pass rate control mode. Input is encoded multiple times, @@ -1308,6 +1425,8 @@ 13. iec61966-2-1 14. bt2020-10 15. bt2020-12 + 16. smpte-st-2084 + 17. smpte-st-428 .. option:: --colormatrix @@ -1342,13 +1461,13 @@ to keep the stream headers for you and you want keyframes to be random access points. Default disabled -.. option:: --info, --no-info +.. option:: --aud, --no-aud - Emit an informational SEI with the stream headers which describes - the encoder version, build info, and encode parameters. This is very - helpful for debugging purposes but encoding version numbers and - build info could make your bitstreams diverge and interfere with - regression testing. 
Default enabled + Emit an access unit delimiter NAL at the start of each slice access + unit. If :option:`--repeat-headers` is not enabled (indicating the + user will be writing headers manually at the start of the stream) + the very first AUD will be skipped since it cannot be placed at the + start of the access unit, where it belongs. Default disabled .. option:: --hrd, --no-hrd @@ -1357,13 +1476,13 @@ Picture Timing SEI messages providing timing information to the decoder. Default disabled -.. option:: --aud, --no-aud +.. option:: --info, --no-info - Emit an access unit delimiter NAL at the start of each slice access - unit. If :option:`--repeat-headers` is not enabled (indicating the - user will be writing headers manually at the start of the stream) - the very first AUD will be skipped since it cannot be placed at the - start of the access unit, where it belongs. Default disabled + Emit an informational SEI with the stream headers which describes + the encoder version, build info, and encode parameters. This is very + helpful for debugging purposes but encoding version numbers and + build info could make your bitstreams diverge and interfere with + regression testing. Default enabled .. option:: --hash @@ -1375,6 +1494,18 @@ 2. CRC 3. Checksum +.. option:: --temporal-layers,--no-temporal-layers + + Enable a temporal sub layer. All referenced I/P/B frames are in the + base layer and all unreferenced B frames are placed in a temporal + sublayer. A decoder may chose to drop the sublayer and only decode + and display the base layer slices. + + If used with a fixed GOP (:option:`b-adapt` 0) and :option:`bframes` + 3 then the two layers evenly split the frame rate, with a cadence of + PbBbP. You probably also want :option:`--no-scenecut` and a keyframe + interval that is a multiple of 4. 
+ Debugging options ================= diff -Nru x265-1.5/doc/reST/presets.rst x265-1.6/doc/reST/presets.rst --- x265-1.5/doc/reST/presets.rst 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/doc/reST/presets.rst 2015-04-02 16:46:36.000000000 +0000 @@ -24,19 +24,21 @@ +==============+===========+===========+==========+========+======+========+======+========+==========+=========+ | ctu | 32 | 32 | 32 | 64 | 64 | 64 | 64 | 64 | 64 | 64 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| bframes | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 8 | 8 | 8 | +| min-cu-size | 16 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | 8 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| b-adapt | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 2 | 2 | 2 | +| bframes | 3 | 3 | 4 | 4 | 4 | 4 | 4 | 8 | 8 | 8 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| rc-lookahead | 10 | 10 | 15 | 15 | 15 | 20 | 25 | 30 | 40 | 60 | +| b-adapt | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 2 | 2 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| rc-lookahead | 5 | 10 | 15 | 15 | 15 | 20 | 25 | 30 | 40 | 60 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | scenecut | 0 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | 40 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| refs | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 | 5 | 5 | +| refs | 1 | 1 | 1 | 1 | 2 | 3 | 3 | 3 | 5 | 5 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | me | dia | hex | hex | hex | hex | hex | star | star | star | star | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| 
merange | 25 | 44 | 57 | 57 | 57 | 57 | 57 | 57 | 57 | 92 | +| merange | 57 | 57 | 57 | 57 | 57 | 57 | 57 | 57 | 57 | 92 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | subme | 0 | 1 | 1 | 2 | 2 | 2 | 3 | 3 | 4 | 5 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ @@ -60,12 +62,14 @@ +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | weightb | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ -| aq-mode | 0 | 0 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | +| aq-mode | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | cuTree | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | rdLevel | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 6 | 6 | 6 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ +| rdoq-level | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 2 | 2 | ++--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | tu-intra | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 | +--------------+-----------+-----------+----------+--------+------+--------+------+--------+----------+---------+ | tu-inter | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 3 | 4 | @@ -114,17 +118,12 @@ modes which preserve high frequency noise: * :option:`--psy-rd` 0.5 + * :option:`--rdoq-level` 1 * :option:`--psy-rdoq` 30 -.. Note:: - - --psy-rdoq is only effective when RDOQuant is enabled, which is at - RD levels 4, 5, and 6 (presets slow and below). 
- It lowers the strength of adaptive quantization, so residual energy can be more evenly distributed across the (noisy) picture: - * :option:`--aq-mode` 1 * :option:`--aq-strength` 0.3 And it similarly tunes rate control to prevent the slice QP from diff -Nru x265-1.5/doc/reST/threading.rst x265-1.6/doc/reST/threading.rst --- x265-1.5/doc/reST/threading.rst 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/doc/reST/threading.rst 2015-04-02 16:46:36.000000000 +0000 @@ -2,41 +2,34 @@ Threading ********* -Thread Pool -=========== +Thread Pools +============ -x265 creates a pool of worker threads and shares this thread pool -with all encoders within the same process (it is process global, aka a -singleton). The number of threads within the thread pool is determined -by the encoder which first allocates the pool, which by definition is -the first encoder created within each process. - -:option:`--threads` specifies the number of threads the encoder will -try to allocate for its thread pool. If the thread pool was already -allocated this parameter is ignored. By default x265 allocates one -thread per (hyperthreaded) CPU core in your system. - -Work distribution is job based. Idle worker threads ask their parent -pool object for jobs to perform. When no jobs are available, idle -worker threads block and consume no CPU cycles. +x265 creates one or more thread pools per encoder, one pool per NUMA +node (typically a CPU socket). :option:`--pools` specifies the number of +pools and the number of threads per pool the encoder will allocate. By +default x265 allocates one thread per (hyperthreaded) CPU core on each +NUMA node. + +If you are running multiple encoders on a system with multiple NUMA +nodes, it is recommended to isolate each of them to a single node in +order to avoid the NUMA overhead of remote memory access. + +Work distribution is job based. Idle worker threads scan the job +providers assigned to their thread pool for jobs to perform. 
When no +jobs are available, the idle worker threads block and consume no CPU +cycles. Objects which desire to distribute work to worker threads are known as -job providers (and they derive from the JobProvider class). When job -providers have work they enqueue themselves into the pool's provider -list (and dequeue themselves when they no longer have work). The thread +job providers (and they derive from the JobProvider class). The thread pool has a method to **poke** awake a blocked idle thread, and job providers are recommended to call this method when they make new jobs available. Worker jobs are not allowed to block except when abosultely necessary -for data locking. If a job becomes blocked, the worker thread is -expected to drop that job and go back to the pool and find more work. - -.. note:: - - x265_cleanup() frees the process-global thread pool, allowing - it to be reallocated if necessary, but only if no encoders are - allocated at the time it is called. +for data locking. If a job becomes blocked, the work function is +expected to drop that job so the worker thread may go back to the pool +and find more work. Wavefront Parallel Processing ============================= @@ -82,24 +75,35 @@ thread count to be higher than if WPP was enabled. The exact formulas are described in the next section. +Bonded Task Groups +================== + +If a worker thread job has work which can be performed in parallel by +many threads, it may allocate a bonded task group and enlist the help of +other idle worker threads in the same pool. Those threads will cooperate +to complete the work of the bonded task group and then return to their +idle states. The larger and more uniform those tasks are, the better the +bonded task group will perform. + Parallel Mode Analysis -====================== +~~~~~~~~~~~~~~~~~~~~~~ When :option:`--pmode` is enabled, each CU (at all depths from 64x64 to -8x8) will distribute its analysis work to the thread pool. 
Each analysis -job will measure the cost of one prediction for the CU: merge, skip, -intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At slower presets, the amount -of increased parallelism is often enough to be able to reduce frame -parallelism while achieving the same overall CPU utilization. Reducing -frame threads is often beneficial to ABR and VBV rate control. +8x8) will distribute its analysis work to the thread pool via a bonded +task group. Each analysis job will measure the cost of one prediction +for the CU: merge, skip, intra, inter (2Nx2N, Nx2N, 2NxN, and AMP). At +slower presets, the amount of increased parallelism is often enough to +be able to reduce frame parallelism while achieving the same overall CPU +utilization. Reducing frame threads is often beneficial to ABR and VBV +rate control. Parallel Motion Estimation -========================== +~~~~~~~~~~~~~~~~~~~~~~~~~~ When :option:`--pme` is enabled all of the analysis functions which perform motion searches to reference frames will distribute those motion -searches as jobs for worker threads (if more than two motion searches -are required). +searches as jobs for worker threads via a bonded task group (if more +than two motion searches are required). Frame Threading =============== @@ -125,16 +129,21 @@ for motion reference must be processed by the loop filters and the loop filters cannot run until a full row has been encoded, and it must run a full row behind the encode process so that the pixels below the row -being filtered are available. When you add up all the row lags each -frame ends up being 3 CTU rows behind its reference frames (the -equivalent of 12 macroblock rows for x264) +being filtered are available. On top of this, HEVC has two loop filters: +deblocking and SAO, which must be run in series with a row lag between +them. When you add up all the row lags each frame ends up being 3 CTU +rows behind its reference frames (the equivalent of 12 macroblock rows +for x264). 
And keep in mind the wave-front progression pattern; by the +time the reference frame finishes the third row of CTUs, nearly half of +the CTUs in the frame may be compressed (depending on the display aspect +ratio). The third extenuating circumstance is that when a frame being encoded becomes blocked by a reference frame row being available, that frame's wave-front becomes completely stalled and when the row becomes available again it can take quite some time for the wave to be restarted, if it -ever does. This makes WPP many times less effective when frame -parallelism is in use. +ever does. This makes WPP less effective when frame parallelism is in +use. :option:`--merange` can have a negative impact on frame parallelism. If the range is too large, more rows of CTU lag must be added to ensure @@ -213,13 +222,13 @@ The lookahead module of x265 (the lowres pre-encode which determines scene cuts and slice types) uses the thread pool to distribute the -lowres cost analysis to worker threads. It follows the same wave-front -pattern as the main encoder except it works in reverse-scan order. - -The function slicetypeDecide() itself may also be performed by a worker -thread if your system has enough CPU cores to make this a beneficial -trade-off, else it runs within the context of the thread which calls the -x265_encoder_encode(). +lowres cost analysis to worker threads. It will use bonded task groups +to perform batches of frame cost estimates, and it may optionally use +bonded task groups to measure single frame cost estimates using slices. + +The function slicetypeDecide() itself is also be performed by a worker +thread if your encoder has a thread pool, else it runs within the +context of the thread which calls the x265_encoder_encode(). 
SAO === diff -Nru x265-1.5/.hg_archival.txt x265-1.6/.hg_archival.txt --- x265-1.5/.hg_archival.txt 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/.hg_archival.txt 2015-04-02 16:46:36.000000000 +0000 @@ -1,4 +1,4 @@ repo: 09fe40627f03a0f9c3e6ac78b22ac93da23f9fdf -node: 9f0324125f53a12f766f6ed6f98f16e2f42337f4 +node: cbeb7d8a4880e4020c4545dd8e498432c3c6cad3 branch: stable -tag: 1.5 +tag: 1.6 diff -Nru x265-1.5/.hgtags x265-1.6/.hgtags --- x265-1.5/.hgtags 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/.hgtags 2015-04-02 16:46:36.000000000 +0000 @@ -13,3 +13,4 @@ d6257335c5370ee54317a0426a12c1f0724b18b9 1.2 c1e4fc0162c14fdb84f5c3bd404fb28cfe10a17f 1.3 5e604833c5aa605d0b6efbe5234492b5e7d8ac61 1.4 +9f0324125f53a12f766f6ed6f98f16e2f42337f4 1.5 diff -Nru x265-1.5/readme.rst x265-1.6/readme.rst --- x265-1.5/readme.rst 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/readme.rst 2015-04-02 16:46:36.000000000 +0000 @@ -0,0 +1,14 @@ +================= +x265 HEVC Encoder +================= + +| **Read:** | Online `documentation `_ | Developer `wiki `_ +| **Download:** | `releases `_ +| **Interact:** | #x265 on freenode.irc.net | `x265-devel@videolan.org `_ | `Report an issue `_ + +`x265 `_ is an open +source HEVC encoder. See the developer wiki for instructions for +downloading and building the source. + +x265 is free to use under the `GNU GPL `_ +and is also available under a commercial `license `_ diff -Nru x265-1.5/source/cmake/FindNuma.cmake x265-1.6/source/cmake/FindNuma.cmake --- x265-1.5/source/cmake/FindNuma.cmake 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/source/cmake/FindNuma.cmake 2015-04-02 16:46:36.000000000 +0000 @@ -0,0 +1,43 @@ +# Module for locating libnuma +# +# Read-only variables: +# NUMA_FOUND +# Indicates that the library has been found. +# +# NUMA_INCLUDE_DIR +# Points to the libnuma include directory. +# +# NUMA_LIBRARY_DIR +# Points to the directory that contains the libraries. 
+# The content of this variable can be passed to link_directories. +# +# NUMA_LIBRARY +# Points to the libnuma that can be passed to target_link_libararies. +# +# Copyright (c) 2015 Steve Borho + +include(FindPackageHandleStandardArgs) + +find_path(NUMA_ROOT_DIR + NAMES include/numa.h + PATHS ENV NUMA_ROOT + DOC "NUMA root directory") + +find_path(NUMA_INCLUDE_DIR + NAMES numa.h + HINTS ${NUMA_ROOT_DIR} + PATH_SUFFIXES include + DOC "NUMA include directory") + +find_library(NUMA_LIBRARY + NAMES numa + HINTS ${NUMA_ROOT_DIR} + DOC "NUMA library") + +if (NUMA_LIBRARY) + get_filename_component(NUMA_LIBRARY_DIR ${NUMA_LIBRARY} PATH) +endif() + +mark_as_advanced(NUMA_INCLUDE_DIR NUMA_LIBRARY_DIR NUMA_LIBRARY) + +find_package_handle_standard_args(NUMA REQUIRED_VARS NUMA_ROOT_DIR NUMA_INCLUDE_DIR NUMA_LIBRARY) diff -Nru x265-1.5/source/cmake/version.cmake x265-1.6/source/cmake/version.cmake --- x265-1.5/source/cmake/version.cmake 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/cmake/version.cmake 2015-04-02 16:46:36.000000000 +0000 @@ -10,9 +10,9 @@ set(X265_LATEST_TAG "0.0") set(X265_TAG_DISTANCE "0") -if(EXISTS ${CMAKE_SOURCE_DIR}/../.hg_archival.txt) +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.hg_archival.txt) # read the lines of the archive summary file to extract the version - file(READ ${CMAKE_SOURCE_DIR}/../.hg_archival.txt archive) + file(READ ${CMAKE_CURRENT_SOURCE_DIR}/../.hg_archival.txt archive) STRING(REGEX REPLACE "\n" ";" archive "${archive}") foreach(f ${archive}) string(FIND "${f}" ": " pos) @@ -29,7 +29,7 @@ string(SUBSTRING "${hg_node}" 0 16 hg_id) set(X265_VERSION "${hg_latesttag}+${hg_latesttagdistance}-${hg_id}") endif() -elseif(HG_EXECUTABLE AND EXISTS ${CMAKE_SOURCE_DIR}/../.hg) +elseif(HG_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.hg) if(EXISTS "${HG_EXECUTABLE}.bat") # mercurial source installs on Windows require .bat extension set(HG_EXECUTABLE "${HG_EXECUTABLE}.bat") @@ -38,14 +38,14 @@ execute_process(COMMAND ${HG_EXECUTABLE} 
log -r. --template "{latesttag}" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE X265_LATEST_TAG ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE ) execute_process(COMMAND ${HG_EXECUTABLE} log -r. --template "{latesttagdistance}" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE X265_TAG_DISTANCE ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE @@ -53,7 +53,7 @@ execute_process( COMMAND ${HG_EXECUTABLE} log -r. --template "{node|short}" - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE HG_REVISION_ID ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE @@ -67,11 +67,11 @@ else() set(X265_VERSION "${X265_LATEST_TAG}+${X265_TAG_DISTANCE}-${HG_REVISION_ID}") endif() -elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_SOURCE_DIR}/../.git) +elseif(GIT_EXECUTABLE AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/../.git) execute_process( COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE X265_LATEST_TAG ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE @@ -80,7 +80,7 @@ execute_process( COMMAND ${GIT_EXECUTABLE} describe --tags - WORKING_DIRECTORY ${PROJECT_SOURCE_DIR} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} OUTPUT_VARIABLE X265_VERSION ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE diff -Nru x265-1.5/source/CMakeLists.txt x265-1.6/source/CMakeLists.txt --- x265-1.5/source/CMakeLists.txt 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/CMakeLists.txt 2015-04-02 16:46:36.000000000 +0000 @@ -12,6 +12,9 @@ if(POLICY CMP0042) cmake_policy(SET CMP0042 NEW) # MACOSX_RPATH endif() +if(POLICY CMP0054) + cmake_policy(SET CMP0054 OLD) # Only interpret if() arguments as variables or keywords when unquoted +endif() project (x265) cmake_minimum_required (VERSION 2.8.8) # OBJECT libraries require 2.8.8 @@ -20,8 +23,14 @@ 
include(CheckSymbolExists) include(CheckCXXCompilerFlag) +option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF) +option(FPROFILE_USE "Compile executable using generated usage data" OFF) +option(NATIVE_BUILD "Target the build CPU" OFF) +option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) +mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD) + # X265_BUILD must be incremented each time the public API is changed -set(X265_BUILD 43) +set(X265_BUILD 51) configure_file("${PROJECT_SOURCE_DIR}/x265.def.in" "${PROJECT_BINARY_DIR}/x265.def") configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in" @@ -29,11 +38,6 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}") -option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF) -if(CHECKED_BUILD) - add_definitions(-DCHECKED_BUILD=1) -endif() - # System architecture detection string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC) set(X86_ALIASES x86 i386 i686 x86_64 amd64) @@ -61,6 +65,19 @@ if(LIBRT) list(APPEND PLATFORM_LIBS rt) endif() + find_package(Numa) + if(NUMA_FOUND) + list(APPEND CMAKE_REQUIRED_LIBRARIES ${NUMA_LIBRARY}) + check_symbol_exists(numa_node_of_cpu numa.h NUMA_V2) + if(NUMA_V2) + add_definitions(-DHAVE_LIBNUMA) + message(STATUS "libnuma found, building with support for NUMA nodes") + list(APPEND PLATFORM_LIBS ${NUMA_LIBRARY}) + link_directories(${NUMA_LIBRARY_DIR}) + include_directories(${NUMA_INCLUDE_DIR}) + endif() + endif() + mark_as_advanced(LIBRT NUMA_FOUND) endif(UNIX) if(X64 AND NOT WIN32) @@ -77,13 +94,13 @@ add_definitions(-DMACOS) endif() -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang") set(CLANG 1) endif() -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") set(INTEL_CXX 1) endif() -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU") set(GCC 1) endif() @@ -92,13 +109,12 @@ set(MSVC 1) 
endif() if(MSVC) - option(STATIC_LINK_CRT "Statically link C runtime for release builds" OFF) - if (STATIC_LINK_CRT) + if(STATIC_LINK_CRT) set(CompilerFlags CMAKE_CXX_FLAGS_RELEASE CMAKE_C_FLAGS_RELEASE) foreach(CompilerFlag ${CompilerFlags}) string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") endforeach() - endif (STATIC_LINK_CRT) + endif(STATIC_LINK_CRT) add_definitions(/W4) # Full warnings add_definitions(/Ob2) # always inline add_definitions(/MP) # multithreaded build @@ -130,12 +146,56 @@ if(ENABLE_PIC) add_definitions(-fPIC) endif(ENABLE_PIC) - if(X86 AND NOT X64) + if(NATIVE_BUILD) + if(INTEL_CXX) + add_definitions(-xhost) + else() + add_definitions(-march=native) + endif() + elseif(X86 AND NOT X64) add_definitions(-march=i686) endif() if(ARM) add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp) endif() + if(FPROFILE_GENERATE) + if(INTEL_CXX) + add_definitions(-prof-gen -prof-dir="${CMAKE_CURRENT_BINARY_DIR}") + list(APPEND LINKER_OPTIONS "-prof-gen") + else() + check_cxx_compiler_flag(-fprofile-generate CC_HAS_PROFILE_GENERATE) + if(CC_HAS_PROFILE_GENERATE) + add_definitions(-fprofile-generate) + list(APPEND LINKER_OPTIONS "-fprofile-generate") + endif(CC_HAS_PROFILE_GENERATE) + endif(INTEL_CXX) + endif(FPROFILE_GENERATE) + if(FPROFILE_USE) + if(INTEL_CXX) + add_definitions(-prof-use -prof-dir="${CMAKE_CURRENT_BINARY_DIR}") + list(APPEND LINKER_OPTIONS "-prof-use") + else() + check_cxx_compiler_flag(-fprofile-use CC_HAS_PROFILE_USE) + check_cxx_compiler_flag(-fprofile-correction CC_HAS_PROFILE_CORRECTION) + check_cxx_compiler_flag(-Wno-error=coverage-mismatch CC_HAS_COVMISMATCH) + if(CC_HAS_PROFILE_USE) + add_definitions(-fprofile-use) + list(APPEND LINKER_OPTIONS "-fprofile-use") + endif(CC_HAS_PROFILE_USE) + if(CC_HAS_PROFILE_CORRECTION) + # auto-correct corrupted counters (happens a lot with x265) + add_definitions(-fprofile-correction) + endif(CC_HAS_PROFILE_CORRECTION) + if(CC_HAS_COVMISMATCH) + # ignore coverage mismatches (also 
happens a lot) + add_definitions(-Wno-error=coverage-mismatch) + endif(CC_HAS_COVMISMATCH) + endif(INTEL_CXX) + endif(FPROFILE_USE) + if(STATIC_LINK_CRT) + add_definitions(-static) + list(APPEND LINKER_OPTIONS "-static") + endif(STATIC_LINK_CRT) check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) if (CC_HAS_NO_ARRAY_BOUNDS) @@ -154,6 +214,35 @@ if(CC_HAS_FNO_EXCEPTIONS_FLAG) add_definitions(-fno-exceptions) endif() + set(FSANITIZE "" CACHE STRING "-fsanitize options for GCC/clang") + if(FSANITIZE) + add_definitions(-fsanitize=${FSANITIZE}) + # clang and gcc need the sanitize options to be passed at link + # time so the appropriate ASAN/TSAN runtime libraries can be + # linked. + list(APPEND LINKER_OPTIONS "-fsanitize=${FSANITIZE}") + endif() + option(ENABLE_AGGRESSIVE_CHECKS "Enable stack protection and -ftrapv" OFF) + if(ENABLE_AGGRESSIVE_CHECKS) + # use with care, -ftrapv can cause testbench SIGILL exceptions + # since it is testing corner cases of signed integer math + add_definitions(-DUSING_FTRAPV=1) + check_cxx_compiler_flag(-fsanitize=undefined-trap CC_HAS_CATCH_UNDEFINED) # clang + check_cxx_compiler_flag(-ftrapv CC_HAS_FTRAPV) # gcc + check_cxx_compiler_flag(-fstack-protector-all CC_HAS_STACK_PROTECT) # gcc + if(CC_HAS_FTRAPV) + add_definitions(-ftrapv) + endif() + if(CC_HAS_CATCH_UNDEFINED) + add_definitions(-fsanitize=undefined-trap -fsanitize-undefined-trap-on-error) + endif() + if(CC_HAS_STACK_PROTECT) + add_definitions(-fstack-protector-all) + if(MINGW) + list(APPEND PLATFORM_LIBS ssp) + endif() + endif() + endif(ENABLE_AGGRESSIVE_CHECKS) execute_process(COMMAND ${CMAKE_CXX_COMPILER} -dumpversion OUTPUT_VARIABLE CC_VERSION) endif(GCC) @@ -168,6 +257,11 @@ endif() endif() +option(CHECKED_BUILD "Enable run-time sanity checks (debugging)" OFF) +if(CHECKED_BUILD) + add_definitions(-DCHECKED_BUILD=1) +endif() + # Build options set(LIB_INSTALL_DIR lib CACHE STRING "Install location of 
libraries") set(BIN_INSTALL_DIR bin CACHE STRING "Install location of executables") @@ -179,6 +273,7 @@ # can disable this if(X64) check if you desparately need a 32bit # build with 10bit/12bit support, but this violates the "shrink wrap # license" so to speak. If it breaks you get to keep both halves. + # You will likely need to compile without assembly option(HIGH_BIT_DEPTH "Store pixels as 16bit values" OFF) endif(X64) if(HIGH_BIT_DEPTH) @@ -240,6 +335,11 @@ add_subdirectory(profile/vtune) endif(ENABLE_VTUNE) +option(DETAILED_CU_STATS "Enable internal profiling of encoder work" OFF) +if(DETAILED_CU_STATS) + add_definitions(-DDETAILED_CU_STATS) +endif(DETAILED_CU_STATS) + add_subdirectory(encoder) add_subdirectory(common) @@ -321,6 +421,11 @@ ARCHIVE DESTINATION ${LIB_INSTALL_DIR} RUNTIME DESTINATION ${BIN_INSTALL_DIR}) endif() + if(LINKER_OPTIONS) + # set_target_properties can't do list expansion + string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}") + set_target_properties(x265-shared PROPERTIES LINK_FLAGS "${LINKER_OPTION_STR}") + endif() endif() if(X265_LATEST_TAG) @@ -375,10 +480,10 @@ if(XCODE) # Xcode seems unable to link the CLI with libs, so link as one targget - add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h + add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} x265.cpp x265.h x265cli.h $ $ ${YASM_OBJS} ${YASM_SRCS}) else() - add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE} x265.cpp x265.h) + add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE} x265.cpp x265.h x265cli.h) if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX) # The CLI cannot link to the shared library on Windows, it # requires internal APIs not exported from the DLL @@ -388,6 +493,11 @@ endif() endif() set_target_properties(cli PROPERTIES OUTPUT_NAME x265) + if(LINKER_OPTIONS) + # set_target_properties can't 
do list expansion + string(REPLACE ";" " " LINKER_OPTION_STR "${LINKER_OPTIONS}") + set_target_properties(cli PROPERTIES LINK_FLAGS "${LINKER_OPTION_STR}") + endif() install(TARGETS cli DESTINATION ${BIN_INSTALL_DIR}) endif(ENABLE_CLI) diff -Nru x265-1.5/source/common/bitstream.cpp x265-1.6/source/common/bitstream.cpp --- x265-1.5/source/common/bitstream.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/bitstream.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -27,7 +27,7 @@ uint8_t *temp = X265_MALLOC(uint8_t, m_byteAlloc * 2); if (temp) { - ::memcpy(temp, m_fifo, m_byteOccupancy); + memcpy(temp, m_fifo, m_byteOccupancy); X265_FREE(m_fifo); m_fifo = temp; m_byteAlloc *= 2; @@ -44,7 +44,7 @@ void Bitstream::write(uint32_t val, uint32_t numBits) { X265_CHECK(numBits <= 32, "numBits out of range\n"); - X265_CHECK(numBits == 32 || ((val & (~0 << numBits)) == 0), "numBits & val out of range\n"); + X265_CHECK(numBits == 32 || ((val & (~0u << numBits)) == 0), "numBits & val out of range\n"); uint32_t totalPartialBits = m_partialByteBits + numBits; uint32_t nextPartialBits = totalPartialBits & 7; @@ -55,7 +55,11 @@ { /* topword aligns m_partialByte with the msb of val */ uint32_t topword = (numBits - nextPartialBits) & ~7; +#if USING_FTRAPV + uint32_t write_bits = (topword < 32 ? 
m_partialByte << topword : 0) | (val >> nextPartialBits); +#else uint32_t write_bits = (m_partialByte << topword) | (val >> nextPartialBits); +#endif switch (writeBytes) { diff -Nru x265-1.5/source/common/CMakeLists.txt x265-1.6/source/common/CMakeLists.txt --- x265-1.5/source/common/CMakeLists.txt 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/CMakeLists.txt 2015-04-02 16:46:36.000000000 +0000 @@ -1,7 +1,7 @@ # vim: syntax=cmake if(ENABLE_ASSEMBLY) - set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) + set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) set(SSE3 vec/dct-sse3.cpp) set(SSSE3 vec/dct-ssse3.cpp) @@ -48,7 +48,7 @@ if(HIGH_BIT_DEPTH) set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm) else() - set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm loopfilter.asm) + set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm) endif() if(NOT X64) diff -Nru x265-1.5/source/common/common.cpp x265-1.6/source/common/common.cpp --- x265-1.5/source/common/common.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/common.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -33,6 +33,10 @@ #include #endif +#if CHECKED_BUILD || _DEBUG +int g_checkFailures; +#endif + int64_t x265_mdate(void) { #if _WIN32 diff -Nru x265-1.5/source/common/common.h x265-1.6/source/common/common.h --- x265-1.5/source/common/common.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/common.h 2015-04-02 16:46:36.000000000 +0000 @@ -74,13 +74,6 @@ #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16))) #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32))) -#if X265_ARCH_X86 && !defined(X86_64) -extern "C" intptr_t x265_stack_align(void (*func)(), ...); -#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__) -#else -#define x265_stack_align(func, ...) 
func(__VA_ARGS__) -#endif - #if defined(__MINGW32__) #define fseeko fseeko64 #endif @@ -90,7 +83,6 @@ #define ALIGN_VAR_8(T, var) __declspec(align(8)) T var #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var -#define x265_stack_align(func, ...) func(__VA_ARGS__) #define fseeko _fseeki64 #endif // if defined(__GNUC__) @@ -106,19 +98,20 @@ #if _DEBUG && defined(_MSC_VER) #define DEBUG_BREAK() __debugbreak() #elif __APPLE_CC__ -#define DEBUG_BREAK() __builtin_trap(); +#define DEBUG_BREAK() __builtin_trap() #else -#define DEBUG_BREAK() +#define DEBUG_BREAK() abort() #endif /* If compiled with CHECKED_BUILD perform run-time checks and log any that * fail, both to stderr and to a file */ #if CHECKED_BUILD || _DEBUG +extern int g_checkFailures; #define X265_CHECK(expr, ...) if (!(expr)) { \ x265_log(NULL, X265_LOG_ERROR, __VA_ARGS__); \ - DEBUG_BREAK(); \ FILE *fp = fopen("x265_check_failures.txt", "a"); \ if (fp) { fprintf(fp, "%s:%d\n", __FILE__, __LINE__); fprintf(fp, __VA_ARGS__); fclose(fp); } \ + g_checkFailures++; DEBUG_BREAK(); \ } #if _MSC_VER #pragma warning(disable: 4127) // some checks have constant conditions @@ -257,7 +250,7 @@ #define UNIT_SIZE (1 << LOG2_UNIT_SIZE) // unit size of CU partition #define MAX_NUM_PARTITIONS 256 -#define NUM_CU_PARTITIONS (1U << (g_maxFullDepth << 1)) +#define NUM_4x4_PARTITIONS (1U << (g_unitSizeDepth << 1)) // number of 4x4 units in max CU size #define MIN_PU_SIZE 4 #define MIN_TU_SIZE 4 @@ -376,6 +369,7 @@ int32_t* ref; uint8_t* depth; uint8_t* modes; + uint32_t* bestMergeCand; }; /* Stores intra analysis data for a single frame. 
This struct needs better packing */ @@ -384,6 +378,7 @@ uint8_t* depth; uint8_t* modes; char* partSizes; + uint8_t* chromaModes; }; enum TextType @@ -430,6 +425,8 @@ void x265_free(void *ptr); char* x265_slurp_file(const char *filename); +void x265_setup_primitives(x265_param* param, int cpu); /* primitives.cpp */ + #include "constants.h" #endif // ifndef X265_COMMON_H diff -Nru x265-1.5/source/common/constants.cpp x265-1.6/source/common/constants.cpp --- x265-1.5/source/common/constants.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/constants.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -119,9 +119,10 @@ 65535 }; +int g_ctuSizeConfigured = 0; uint32_t g_maxLog2CUSize = MAX_LOG2_CU_SIZE; uint32_t g_maxCUSize = MAX_CU_SIZE; -uint32_t g_maxFullDepth = NUM_FULL_DEPTH - 1; +uint32_t g_unitSizeDepth = NUM_CU_DEPTH; uint32_t g_maxCUDepth = NUM_CU_DEPTH - 1; uint32_t g_zscanToRaster[MAX_NUM_PARTITIONS] = { 0, }; uint32_t g_rasterToZscan[MAX_NUM_PARTITIONS] = { 0, }; diff -Nru x265-1.5/source/common/constants.h x265-1.6/source/common/constants.h --- x265-1.5/source/common/constants.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/constants.h 2015-04-02 16:46:36.000000000 +0000 @@ -29,6 +29,8 @@ namespace x265 { // private namespace +extern int g_ctuSizeConfigured; + void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx); void initRasterToZscan(uint32_t maxFullDepth); @@ -55,7 +57,7 @@ extern uint32_t g_maxLog2CUSize; extern uint32_t g_maxCUSize; extern uint32_t g_maxCUDepth; -extern uint32_t g_maxFullDepth; +extern uint32_t g_unitSizeDepth; // Depth at which 4x4 unit occurs from max CU size extern const int16_t g_t4[4][4]; extern const int16_t g_t8[8][8]; diff -Nru x265-1.5/source/common/cudata.cpp x265-1.6/source/common/cudata.cpp --- x265-1.5/source/common/cudata.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/cudata.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -38,7 +38,7 @@ 
void bcast1(uint8_t* dst, uint8_t val) { dst[0] = val; } void copy4(uint8_t* dst, uint8_t* src) { ((uint32_t*)dst)[0] = ((uint32_t*)src)[0]; } -void bcast4(uint8_t* dst, uint8_t val) { ((uint32_t*)dst)[0] = 0x01010101 * val; } +void bcast4(uint8_t* dst, uint8_t val) { ((uint32_t*)dst)[0] = 0x01010101u * val; } void copy16(uint8_t* dst, uint8_t* src) { ((uint64_t*)dst)[0] = ((uint64_t*)src)[0]; ((uint64_t*)dst)[1] = ((uint64_t*)src)[1]; } void bcast16(uint8_t* dst, uint8_t val) { uint64_t bval = 0x0101010101010101ULL * val; ((uint64_t*)dst)[0] = bval; ((uint64_t*)dst)[1] = bval; } @@ -159,11 +159,11 @@ m_chromaFormat = csp; m_hChromaShift = CHROMA_H_SHIFT(csp); m_vChromaShift = CHROMA_V_SHIFT(csp); - m_numPartitions = NUM_CU_PARTITIONS >> (depth * 2); + m_numPartitions = NUM_4x4_PARTITIONS >> (depth * 2); if (!s_partSet[0]) { - s_numPartInCUSize = 1 << g_maxFullDepth; + s_numPartInCUSize = 1 << g_unitSizeDepth; switch (g_maxLog2CUSize) { case 6: @@ -272,7 +272,7 @@ m_cuPelX = (cuAddr % m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize; m_cuPelY = (cuAddr / m_slice->m_sps->numCuInWidth) << g_maxLog2CUSize; m_absIdxInCTU = 0; - m_numPartitions = NUM_CU_PARTITIONS; + m_numPartitions = NUM_4x4_PARTITIONS; /* sequential memsets */ m_partSet((uint8_t*)m_qp, (uint8_t)qp); @@ -300,12 +300,12 @@ // initialize Sub partition void CUData::initSubCU(const CUData& ctu, const CUGeom& cuGeom) { - m_absIdxInCTU = cuGeom.encodeIdx; + m_absIdxInCTU = cuGeom.absPartIdx; m_encData = ctu.m_encData; m_slice = ctu.m_slice; m_cuAddr = ctu.m_cuAddr; - m_cuPelX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx]; - m_cuPelY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx]; + m_cuPelX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]; + m_cuPelY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]; m_cuLeft = ctu.m_cuLeft; m_cuAbove = ctu.m_cuAbove; m_cuAboveLeft = ctu.m_cuAboveLeft; @@ -392,7 +392,7 @@ m_cuAbove = cu.m_cuAbove; m_cuAboveLeft = cu.m_cuAboveLeft; m_cuAboveRight = cu.m_cuAboveRight; - 
m_absIdxInCTU = cuGeom.encodeIdx; + m_absIdxInCTU = cuGeom.absPartIdx; m_numPartitions = cuGeom.numPartitions; memcpy(m_qp, cu.m_qp, BytesPerPartition * m_numPartitions); memcpy(m_mv[0], cu.m_mv[0], m_numPartitions * sizeof(MV)); @@ -462,9 +462,9 @@ m_encData = ctu.m_encData; m_slice = ctu.m_slice; m_cuAddr = ctu.m_cuAddr; - m_cuPelX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.encodeIdx]; - m_cuPelY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.encodeIdx]; - m_absIdxInCTU = cuGeom.encodeIdx; + m_cuPelX = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]; + m_cuPelY = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]; + m_absIdxInCTU = cuGeom.absPartIdx; m_numPartitions = cuGeom.numPartitions; /* copy out all prediction info for this part */ @@ -559,7 +559,7 @@ return this; } - aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize]; + aPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize]; return m_cuAbove; } @@ -581,7 +581,7 @@ return this; } } - alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_CU_PARTITIONS - s_numPartInCUSize - 1]; + alPartUnitIdx = g_rasterToZscan[absPartIdx + NUM_4x4_PARTITIONS - s_numPartInCUSize - 1]; return m_cuAbove; } @@ -591,7 +591,7 @@ return m_cuLeft; } - alPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - 1]; + alPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - 1]; return m_cuAboveLeft; } @@ -620,14 +620,14 @@ } return NULL; } - arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + 1]; + arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_4x4_PARTITIONS - s_numPartInCUSize + 1]; return m_cuAbove; } if (!isZeroRow(absPartIdxRT, s_numPartInCUSize)) return NULL; - arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize]; + arPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - s_numPartInCUSize]; return m_cuAboveRight; } @@ -720,21 +720,21 @@ } return NULL; } - arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_CU_PARTITIONS - s_numPartInCUSize + 
partUnitOffset]; + arPartUnitIdx = g_rasterToZscan[absPartIdxRT + NUM_4x4_PARTITIONS - s_numPartInCUSize + partUnitOffset]; return m_cuAbove; } if (!isZeroRow(absPartIdxRT, s_numPartInCUSize)) return NULL; - arPartUnitIdx = g_rasterToZscan[NUM_CU_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1]; + arPartUnitIdx = g_rasterToZscan[NUM_4x4_PARTITIONS - s_numPartInCUSize + partUnitOffset - 1]; return m_cuAboveRight; } /* Get left QpMinCu */ const CUData* CUData::getQpMinCuLeft(uint32_t& lPartUnitIdx, uint32_t curAbsIdxInCTU) const { - uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2); + uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2); uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx]; // check for left CTU boundary @@ -751,7 +751,7 @@ /* Get above QpMinCu */ const CUData* CUData::getQpMinCuAbove(uint32_t& aPartUnitIdx, uint32_t curAbsIdxInCTU) const { - uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2); + uint32_t absZorderQpMinCUIdx = curAbsIdxInCTU & (0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2); uint32_t absRorderQpMinCUIdx = g_zscanToRaster[absZorderQpMinCUIdx]; // check for top CTU boundary @@ -790,7 +790,7 @@ int8_t CUData::getLastCodedQP(uint32_t absPartIdx) const { - uint32_t quPartIdxMask = 0xFF << (g_maxFullDepth - m_slice->m_pps->maxCuDQPDepth) * 2; + uint32_t quPartIdxMask = 0xFF << (g_unitSizeDepth - m_slice->m_pps->maxCuDQPDepth) * 2; int lastValidPartIdx = getLastValidPartIdx(absPartIdx & quPartIdxMask); if (lastValidPartIdx >= 0) @@ -800,7 +800,7 @@ if (m_absIdxInCTU) return m_encData->getPicCTU(m_cuAddr)->getLastCodedQP(m_absIdxInCTU); else if (m_cuAddr > 0 && !(m_slice->m_pps->bEntropyCodingSyncEnabled && !(m_cuAddr % m_slice->m_sps->numCuInWidth))) - return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_CU_PARTITIONS); + 
return m_encData->getPicCTU(m_cuAddr - 1)->getLastCodedQP(NUM_4x4_PARTITIONS); else return (int8_t)m_slice->m_sliceQp; } @@ -932,7 +932,7 @@ bool CUData::setQPSubCUs(int8_t qp, uint32_t absPartIdx, uint32_t depth) { - uint32_t curPartNumb = NUM_CU_PARTITIONS >> (depth << 1); + uint32_t curPartNumb = NUM_4x4_PARTITIONS >> (depth << 1); uint32_t curPartNumQ = curPartNumb >> 2; if (m_cuDepth[absPartIdx] > depth) @@ -1375,8 +1375,8 @@ return true; } -/* Construct list of merging candidates */ -uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const +/* Construct list of merging candidates, returns count */ +uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*candMvField)[2], uint8_t* candDir) const { uint32_t absPartAddr = m_absIdxInCTU + absPartIdx; const bool isInterB = m_slice->isInterB(); @@ -1385,10 +1385,10 @@ for (uint32_t i = 0; i < maxNumMergeCand; ++i) { - mvFieldNeighbours[i][0].mv = 0; - mvFieldNeighbours[i][1].mv = 0; - mvFieldNeighbours[i][0].refIdx = REF_NOT_VALID; - mvFieldNeighbours[i][1].refIdx = REF_NOT_VALID; + candMvField[i][0].mv = 0; + candMvField[i][1].mv = 0; + candMvField[i][0].refIdx = REF_NOT_VALID; + candMvField[i][1].refIdx = REF_NOT_VALID; } /* calculate the location of upper-left corner pixel and size of the current PU */ @@ -1420,15 +1420,13 @@ if (isAvailableA1) { // get Inter Dir - interDirNeighbours[count] = cuLeft->m_interDir[leftPartIdx]; + candDir[count] = cuLeft->m_interDir[leftPartIdx]; // get Mv from Left - cuLeft->getMvField(cuLeft, leftPartIdx, 0, mvFieldNeighbours[count][0]); + cuLeft->getMvField(cuLeft, leftPartIdx, 0, candMvField[count][0]); if (isInterB) - cuLeft->getMvField(cuLeft, leftPartIdx, 1, mvFieldNeighbours[count][1]); + cuLeft->getMvField(cuLeft, leftPartIdx, 1, candMvField[count][1]); - count++; - - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } 
@@ -1444,15 +1442,13 @@ if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx))) { // get Inter Dir - interDirNeighbours[count] = cuAbove->m_interDir[abovePartIdx]; + candDir[count] = cuAbove->m_interDir[abovePartIdx]; // get Mv from Left - cuAbove->getMvField(cuAbove, abovePartIdx, 0, mvFieldNeighbours[count][0]); + cuAbove->getMvField(cuAbove, abovePartIdx, 0, candMvField[count][0]); if (isInterB) - cuAbove->getMvField(cuAbove, abovePartIdx, 1, mvFieldNeighbours[count][1]); + cuAbove->getMvField(cuAbove, abovePartIdx, 1, candMvField[count][1]); - count++; - - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } @@ -1465,15 +1461,13 @@ if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx))) { // get Inter Dir - interDirNeighbours[count] = cuAboveRight->m_interDir[aboveRightPartIdx]; + candDir[count] = cuAboveRight->m_interDir[aboveRightPartIdx]; // get Mv from Left - cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, mvFieldNeighbours[count][0]); + cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, candMvField[count][0]); if (isInterB) - cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, mvFieldNeighbours[count][1]); - - count++; + cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, candMvField[count][1]); - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } @@ -1486,15 +1480,13 @@ if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx))) { // get Inter Dir - interDirNeighbours[count] = cuLeftBottom->m_interDir[leftBottomPartIdx]; + candDir[count] = cuLeftBottom->m_interDir[leftBottomPartIdx]; // get Mv from Left - cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, mvFieldNeighbours[count][0]); + cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, candMvField[count][0]); if 
(isInterB) - cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, mvFieldNeighbours[count][1]); + cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, candMvField[count][1]); - count++; - - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } @@ -1510,15 +1502,13 @@ && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx))) { // get Inter Dir - interDirNeighbours[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx]; + candDir[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx]; // get Mv from Left - cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, mvFieldNeighbours[count][0]); + cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, candMvField[count][0]); if (isInterB) - cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, mvFieldNeighbours[count][1]); - - count++; + cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, candMvField[count][1]); - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } } @@ -1553,41 +1543,29 @@ absPartAddr = 0; } - int refIdx = 0; - uint32_t partIdxCenter = deriveCenterIdx(puIdx); - uint32_t curCTUIdx = m_cuAddr; - int dir = 0; - bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, 0, ctuIdx, absPartAddr); - if (!bExistMV) - bExistMV = getColMVP(colmv, refIdx, 0, curCTUIdx, partIdxCenter); - if (bExistMV) - { - dir |= 1; - mvFieldNeighbours[count][0].mv = colmv; - mvFieldNeighbours[count][0].refIdx = refIdx; - } - - if (isInterB) + int maxList = isInterB ? 
2 : 1; + int dir = 0, refIdx = 0; + for (int list = 0; list < maxList; list++) { - bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, 1, ctuIdx, absPartAddr); + bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, list, ctuIdx, absPartAddr); if (!bExistMV) - bExistMV = getColMVP(colmv, refIdx, 1, curCTUIdx, partIdxCenter); - + { + uint32_t partIdxCenter = deriveCenterIdx(puIdx); + bExistMV = getColMVP(colmv, refIdx, list, m_cuAddr, partIdxCenter); + } if (bExistMV) { - dir |= 2; - mvFieldNeighbours[count][1].mv = colmv; - mvFieldNeighbours[count][1].refIdx = refIdx; + dir |= (1 << list); + candMvField[count][list].mv = colmv; + candMvField[count][list].refIdx = refIdx; } } if (dir != 0) { - interDirNeighbours[count] = (uint8_t)dir; + candDir[count] = (uint8_t)dir; - count++; - - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } } @@ -1598,31 +1576,27 @@ uint32_t priorityList0 = 0xEDC984; // { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 } uint32_t priorityList1 = 0xB73621; // { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 } - for (uint32_t idx = 0; idx < cutoff; idx++) + for (uint32_t idx = 0; idx < cutoff; idx++, priorityList0 >>= 2, priorityList1 >>= 2) { int i = priorityList0 & 3; int j = priorityList1 & 3; - priorityList0 >>= 2; - priorityList1 >>= 2; - if ((interDirNeighbours[i] & 0x1) && (interDirNeighbours[j] & 0x2)) + if ((candDir[i] & 0x1) && (candDir[j] & 0x2)) { // get Mv from cand[i] and cand[j] - int refIdxL0 = mvFieldNeighbours[i][0].refIdx; - int refIdxL1 = mvFieldNeighbours[j][1].refIdx; + int refIdxL0 = candMvField[i][0].refIdx; + int refIdxL1 = candMvField[j][1].refIdx; int refPOCL0 = m_slice->m_refPOCList[0][refIdxL0]; int refPOCL1 = m_slice->m_refPOCList[1][refIdxL1]; - if (!(refPOCL0 == refPOCL1 && mvFieldNeighbours[i][0].mv == mvFieldNeighbours[j][1].mv)) + if (!(refPOCL0 == refPOCL1 && candMvField[i][0].mv == candMvField[j][1].mv)) { - mvFieldNeighbours[count][0].mv = mvFieldNeighbours[i][0].mv; - 
mvFieldNeighbours[count][0].refIdx = refIdxL0; - mvFieldNeighbours[count][1].mv = mvFieldNeighbours[j][1].mv; - mvFieldNeighbours[count][1].refIdx = refIdxL1; - interDirNeighbours[count] = 3; - - count++; + candMvField[count][0].mv = candMvField[i][0].mv; + candMvField[count][0].refIdx = refIdxL0; + candMvField[count][1].mv = candMvField[j][1].mv; + candMvField[count][1].refIdx = refIdxL1; + candDir[count] = 3; - if (count == maxNumMergeCand) + if (++count == maxNumMergeCand) return maxNumMergeCand; } } @@ -1633,15 +1607,15 @@ int refcnt = 0; while (count < maxNumMergeCand) { - interDirNeighbours[count] = 1; - mvFieldNeighbours[count][0].mv = 0; - mvFieldNeighbours[count][0].refIdx = r; + candDir[count] = 1; + candMvField[count][0].mv.word = 0; + candMvField[count][0].refIdx = r; if (isInterB) { - interDirNeighbours[count] = 3; - mvFieldNeighbours[count][1].mv.word = 0; - mvFieldNeighbours[count][1].refIdx = r; + candDir[count] = 3; + candMvField[count][1].mv.word = 0; + candMvField[count][1].refIdx = r; } count++; @@ -1658,99 +1632,122 @@ return count; } -/* Check whether the current PU and a spatial neighboring PU are in a same ME region */ -bool CUData::isDiffMER(int xN, int yN, int xP, int yP) const +// Create the PMV list. Called for each reference index. +int CUData::getPMV(InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx, MV* amvpCand, MV* pmv) const { - uint32_t plevel = 2; + MV directMV[MD_ABOVE_LEFT + 1]; + MV indirectMV[MD_ABOVE_LEFT + 1]; + bool validDirect[MD_ABOVE_LEFT + 1]; + bool validIndirect[MD_ABOVE_LEFT + 1]; + + // Left candidate. + validDirect[MD_BELOW_LEFT] = getDirectPMV(directMV[MD_BELOW_LEFT], neighbours + MD_BELOW_LEFT, picList, refIdx); + validDirect[MD_LEFT] = getDirectPMV(directMV[MD_LEFT], neighbours + MD_LEFT, picList, refIdx); + // Top candidate. 
+ validDirect[MD_ABOVE_RIGHT] = getDirectPMV(directMV[MD_ABOVE_RIGHT], neighbours + MD_ABOVE_RIGHT, picList, refIdx); + validDirect[MD_ABOVE] = getDirectPMV(directMV[MD_ABOVE], neighbours + MD_ABOVE, picList, refIdx); + validDirect[MD_ABOVE_LEFT] = getDirectPMV(directMV[MD_ABOVE_LEFT], neighbours + MD_ABOVE_LEFT, picList, refIdx); + + // Left candidate. + validIndirect[MD_BELOW_LEFT] = getIndirectPMV(indirectMV[MD_BELOW_LEFT], neighbours + MD_BELOW_LEFT, picList, refIdx); + validIndirect[MD_LEFT] = getIndirectPMV(indirectMV[MD_LEFT], neighbours + MD_LEFT, picList, refIdx); + // Top candidate. + validIndirect[MD_ABOVE_RIGHT] = getIndirectPMV(indirectMV[MD_ABOVE_RIGHT], neighbours + MD_ABOVE_RIGHT, picList, refIdx); + validIndirect[MD_ABOVE] = getIndirectPMV(indirectMV[MD_ABOVE], neighbours + MD_ABOVE, picList, refIdx); + validIndirect[MD_ABOVE_LEFT] = getIndirectPMV(indirectMV[MD_ABOVE_LEFT], neighbours + MD_ABOVE_LEFT, picList, refIdx); - if ((xN >> plevel) != (xP >> plevel)) - return true; - if ((yN >> plevel) != (yP >> plevel)) - return true; - return false; -} - -/* Constructs a list of candidates for AMVP, and a larger list of motion candidates */ -int CUData::fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const -{ int num = 0; - - // spatial MV - uint32_t partIdxLT, partIdxRT, partIdxLB = deriveLeftBottomIdx(puIdx); - - deriveLeftRightTopIdx(puIdx, partIdxLT, partIdxRT); - - MV mv[MD_ABOVE_LEFT + 1]; - MV mvOrder[MD_ABOVE_LEFT + 1]; - bool valid[MD_ABOVE_LEFT + 1]; - bool validOrder[MD_ABOVE_LEFT + 1]; - - valid[MD_BELOW_LEFT] = addMVPCand(mv[MD_BELOW_LEFT], picList, refIdx, partIdxLB, MD_BELOW_LEFT); - valid[MD_LEFT] = addMVPCand(mv[MD_LEFT], picList, refIdx, partIdxLB, MD_LEFT); - valid[MD_ABOVE_RIGHT] = addMVPCand(mv[MD_ABOVE_RIGHT], picList, refIdx, partIdxRT, MD_ABOVE_RIGHT); - valid[MD_ABOVE] = addMVPCand(mv[MD_ABOVE], picList, refIdx, partIdxRT, MD_ABOVE); - valid[MD_ABOVE_LEFT] = 
addMVPCand(mv[MD_ABOVE_LEFT], picList, refIdx, partIdxLT, MD_ABOVE_LEFT); - - validOrder[MD_BELOW_LEFT] = addMVPCandOrder(mvOrder[MD_BELOW_LEFT], picList, refIdx, partIdxLB, MD_BELOW_LEFT); - validOrder[MD_LEFT] = addMVPCandOrder(mvOrder[MD_LEFT], picList, refIdx, partIdxLB, MD_LEFT); - validOrder[MD_ABOVE_RIGHT] = addMVPCandOrder(mvOrder[MD_ABOVE_RIGHT], picList, refIdx, partIdxRT, MD_ABOVE_RIGHT); - validOrder[MD_ABOVE] = addMVPCandOrder(mvOrder[MD_ABOVE], picList, refIdx, partIdxRT, MD_ABOVE); - validOrder[MD_ABOVE_LEFT] = addMVPCandOrder(mvOrder[MD_ABOVE_LEFT], picList, refIdx, partIdxLT, MD_ABOVE_LEFT); - // Left predictor search - if (valid[MD_BELOW_LEFT]) - amvpCand[num++] = mv[MD_BELOW_LEFT]; - else if (valid[MD_LEFT]) - amvpCand[num++] = mv[MD_LEFT]; - else if (validOrder[MD_BELOW_LEFT]) - amvpCand[num++] = mvOrder[MD_BELOW_LEFT]; - else if (validOrder[MD_LEFT]) - amvpCand[num++] = mvOrder[MD_LEFT]; + if (validDirect[MD_BELOW_LEFT]) + amvpCand[num++] = directMV[MD_BELOW_LEFT]; + else if (validDirect[MD_LEFT]) + amvpCand[num++] = directMV[MD_LEFT]; + else if (validIndirect[MD_BELOW_LEFT]) + amvpCand[num++] = indirectMV[MD_BELOW_LEFT]; + else if (validIndirect[MD_LEFT]) + amvpCand[num++] = indirectMV[MD_LEFT]; bool bAddedSmvp = num > 0; // Above predictor search - if (valid[MD_ABOVE_RIGHT]) - amvpCand[num++] = mv[MD_ABOVE_RIGHT]; - else if (valid[MD_ABOVE]) - amvpCand[num++] = mv[MD_ABOVE]; - else if (valid[MD_ABOVE_LEFT]) - amvpCand[num++] = mv[MD_ABOVE_LEFT]; + if (validDirect[MD_ABOVE_RIGHT]) + amvpCand[num++] = directMV[MD_ABOVE_RIGHT]; + else if (validDirect[MD_ABOVE]) + amvpCand[num++] = directMV[MD_ABOVE]; + else if (validDirect[MD_ABOVE_LEFT]) + amvpCand[num++] = directMV[MD_ABOVE_LEFT]; if (!bAddedSmvp) { - if (validOrder[MD_ABOVE_RIGHT]) - amvpCand[num++] = mvOrder[MD_ABOVE_RIGHT]; - else if (validOrder[MD_ABOVE]) - amvpCand[num++] = mvOrder[MD_ABOVE]; - else if (validOrder[MD_ABOVE_LEFT]) - amvpCand[num++] = mvOrder[MD_ABOVE_LEFT]; + if 
(validIndirect[MD_ABOVE_RIGHT]) + amvpCand[num++] = indirectMV[MD_ABOVE_RIGHT]; + else if (validIndirect[MD_ABOVE]) + amvpCand[num++] = indirectMV[MD_ABOVE]; + else if (validIndirect[MD_ABOVE_LEFT]) + amvpCand[num++] = indirectMV[MD_ABOVE_LEFT]; } int numMvc = 0; for (int dir = MD_LEFT; dir <= MD_ABOVE_LEFT; dir++) { - if (valid[dir] && mv[dir].notZero()) - mvc[numMvc++] = mv[dir]; + if (validDirect[dir] && directMV[dir].notZero()) + pmv[numMvc++] = directMV[dir]; - if (validOrder[dir] && mvOrder[dir].notZero()) - mvc[numMvc++] = mvOrder[dir]; + if (validIndirect[dir] && indirectMV[dir].notZero()) + pmv[numMvc++] = indirectMV[dir]; } if (num == 2) + num -= amvpCand[0] == amvpCand[1]; + + // Get the collocated candidate. At this step, either the first candidate + // was found or its value is 0. + if (m_slice->m_sps->bTemporalMVPEnabled && num < 2) { - if (amvpCand[0] == amvpCand[1]) - num = 1; - else - /* AMVP_NUM_CANDS = 2 */ - return numMvc; + int tempRefIdx = neighbours[MD_COLLOCATED].refIdx[picList]; + if (tempRefIdx != -1) + { + uint32_t cuAddr = neighbours[MD_COLLOCATED].cuAddr[picList]; + const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx]; + const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr); + + // Scale the vector + int colRefPOC = colCU->m_slice->m_refPOCList[tempRefIdx >> 4][tempRefIdx & 0xf]; + int colPOC = colCU->m_slice->m_poc; + + int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; + int curPOC = m_slice->m_poc; + + pmv[numMvc++] = amvpCand[num++] = scaleMvByPOCDist(neighbours[MD_COLLOCATED].mv[picList], curPOC, curRefPOC, colPOC, colRefPOC); + } } + while (num < AMVP_NUM_CANDS) + amvpCand[num++] = 0; + + return numMvc; +} + +/* Constructs a list of candidates for AMVP, and a larger list of motion candidates */ +void CUData::getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const +{ + // Set the temporal neighbour to unavailable by default. 
+ neighbours[MD_COLLOCATED].unifiedRef = -1; + + uint32_t partIdxLT, partIdxRT, partIdxLB = deriveLeftBottomIdx(puIdx); + deriveLeftRightTopIdx(puIdx, partIdxLT, partIdxRT); + + // Load the spatial MVs. + getInterNeighbourMV(neighbours + MD_BELOW_LEFT, partIdxLB, MD_BELOW_LEFT); + getInterNeighbourMV(neighbours + MD_LEFT, partIdxLB, MD_LEFT); + getInterNeighbourMV(neighbours + MD_ABOVE_RIGHT,partIdxRT, MD_ABOVE_RIGHT); + getInterNeighbourMV(neighbours + MD_ABOVE, partIdxRT, MD_ABOVE); + getInterNeighbourMV(neighbours + MD_ABOVE_LEFT, partIdxLT, MD_ABOVE_LEFT); + if (m_slice->m_sps->bTemporalMVPEnabled) { uint32_t absPartAddr = m_absIdxInCTU + absPartIdx; uint32_t partIdxRB = deriveRightBottomIdx(puIdx); - MV colmv; // co-located RightBottom temporal predictor (H) int ctuIdx = -1; @@ -1779,44 +1776,17 @@ else // is the right bottom corner of CTU absPartAddr = 0; } - if (ctuIdx >= 0 && getColMVP(colmv, refIdx, picList, ctuIdx, absPartAddr)) - { - amvpCand[num++] = colmv; - mvc[numMvc++] = colmv; - } - else + + if (!(ctuIdx >= 0 && getCollocatedMV(ctuIdx, absPartAddr, neighbours + MD_COLLOCATED))) { uint32_t partIdxCenter = deriveCenterIdx(puIdx); uint32_t curCTUIdx = m_cuAddr; - if (getColMVP(colmv, refIdx, picList, curCTUIdx, partIdxCenter)) - { - amvpCand[num++] = colmv; - mvc[numMvc++] = colmv; - } + getCollocatedMV(curCTUIdx, partIdxCenter, neighbours + MD_COLLOCATED); } } - - while (num < AMVP_NUM_CANDS) - amvpCand[num++] = 0; - - return numMvc; -} - -void CUData::clipMv(MV& outMV) const -{ - int mvshift = 2; - int offset = 8; - int xmax = (m_slice->m_sps->picWidthInLumaSamples + offset - m_cuPelX - 1) << mvshift; - int xmin = (-(int)g_maxCUSize - offset - (int)m_cuPelX + 1) << mvshift; - - int ymax = (m_slice->m_sps->picHeightInLumaSamples + offset - m_cuPelY - 1) << mvshift; - int ymin = (-(int)g_maxCUSize - offset - (int)m_cuPelY + 1) << mvshift; - - outMV.x = (int16_t)X265_MIN(xmax, X265_MAX(xmin, (int)outMV.x)); - outMV.y = (int16_t)X265_MIN(ymax, 
X265_MAX(ymin, (int)outMV.y)); } -bool CUData::addMVPCand(MV& mvp, int picList, int refIdx, uint32_t partUnitIdx, MVP_DIR dir) const +void CUData::getInterNeighbourMV(InterNeighbourMV *neighbour, uint32_t partUnitIdx, MVP_DIR dir) const { const CUData* tmpCU = NULL; uint32_t idx = 0; @@ -1839,103 +1809,77 @@ tmpCU = getPUAboveLeft(idx, partUnitIdx); break; default: - return false; + break; } if (!tmpCU) - return false; + { + // Mark the PMV as unavailable. + for (int i = 0; i < 2; i++) + neighbour->refIdx[i] = -1; + return; + } + + for (int i = 0; i < 2; i++) + { + // Get the MV. + neighbour->mv[i] = tmpCU->m_mv[i][idx]; - int refPOC = m_slice->m_refPOCList[picList][refIdx]; - int partRefIdx = tmpCU->m_refIdx[picList][idx]; - if (partRefIdx >= 0 && refPOC == tmpCU->m_slice->m_refPOCList[picList][partRefIdx]) - { - mvp = tmpCU->m_mv[picList][idx]; - return true; + // Get the reference idx. + neighbour->refIdx[i] = tmpCU->m_refIdx[i][idx]; } +} - int refPicList2nd = 0; - if (picList == 0) - refPicList2nd = 1; - else if (picList == 1) - refPicList2nd = 0; +void CUData::clipMv(MV& outMV) const +{ + const uint32_t mvshift = 2; + uint32_t offset = 8; - int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; - int neibRefPOC; + int16_t xmax = (int16_t)((m_slice->m_sps->picWidthInLumaSamples + offset - m_cuPelX - 1) << mvshift); + int16_t xmin = -(int16_t)((g_maxCUSize + offset + m_cuPelX - 1) << mvshift); - partRefIdx = tmpCU->m_refIdx[refPicList2nd][idx]; - if (partRefIdx >= 0) + int16_t ymax = (int16_t)((m_slice->m_sps->picHeightInLumaSamples + offset - m_cuPelY - 1) << mvshift); + int16_t ymin = -(int16_t)((g_maxCUSize + offset + m_cuPelY - 1) << mvshift); + + outMV.x = X265_MIN(xmax, X265_MAX(xmin, outMV.x)); + outMV.y = X265_MIN(ymax, X265_MAX(ymin, outMV.y)); +} + +// Load direct spatial MV if available. 
+bool CUData::getDirectPMV(MV& pmv, InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx) const +{ + int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; + for (int i = 0; i < 2; i++, picList = !picList) { - neibRefPOC = tmpCU->m_slice->m_refPOCList[refPicList2nd][partRefIdx]; - if (neibRefPOC == curRefPOC) + int partRefIdx = neighbours->refIdx[picList]; + if (partRefIdx >= 0 && curRefPOC == m_slice->m_refPOCList[picList][partRefIdx]) { - // Same reference frame but different list - mvp = tmpCU->m_mv[refPicList2nd][idx]; + pmv = neighbours->mv[picList]; return true; } } return false; } -bool CUData::addMVPCandOrder(MV& outMV, int picList, int refIdx, uint32_t partUnitIdx, MVP_DIR dir) const +// Load indirect spatial MV if available. An indirect MV has to be scaled. +bool CUData::getIndirectPMV(MV& outMV, InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx) const { - const CUData* tmpCU = NULL; - uint32_t idx = 0; - - switch (dir) - { - case MD_LEFT: - tmpCU = getPULeft(idx, partUnitIdx); - break; - case MD_ABOVE: - tmpCU = getPUAbove(idx, partUnitIdx); - break; - case MD_ABOVE_RIGHT: - tmpCU = getPUAboveRight(idx, partUnitIdx); - break; - case MD_BELOW_LEFT: - tmpCU = getPUBelowLeft(idx, partUnitIdx); - break; - case MD_ABOVE_LEFT: - tmpCU = getPUAboveLeft(idx, partUnitIdx); - break; - default: - return false; - } - - if (!tmpCU) - return false; - - int refPicList2nd = 0; - if (picList == 0) - refPicList2nd = 1; - else if (picList == 1) - refPicList2nd = 0; - int curPOC = m_slice->m_poc; - int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; int neibPOC = curPOC; - int neibRefPOC; - - int partRefIdx = tmpCU->m_refIdx[picList][idx]; - if (partRefIdx >= 0) - { - neibRefPOC = tmpCU->m_slice->m_refPOCList[picList][partRefIdx]; - MV mvp = tmpCU->m_mv[picList][idx]; - - scaleMvByPOCDist(outMV, mvp, curPOC, curRefPOC, neibPOC, neibRefPOC); - return true; - } + int curRefPOC = m_slice->m_refPOCList[picList][refIdx]; - partRefIdx = 
tmpCU->m_refIdx[refPicList2nd][idx]; - if (partRefIdx >= 0) + for (int i = 0; i < 2; i++, picList = !picList) { - neibRefPOC = tmpCU->m_slice->m_refPOCList[refPicList2nd][partRefIdx]; - MV mvp = tmpCU->m_mv[refPicList2nd][idx]; + int partRefIdx = neighbours->refIdx[picList]; + if (partRefIdx >= 0) + { + int neibRefPOC = m_slice->m_refPOCList[picList][partRefIdx]; + MV mvp = neighbours->mv[picList]; - scaleMvByPOCDist(outMV, mvp, curPOC, curRefPOC, neibPOC, neibRefPOC); - return true; + outMV = scaleMvByPOCDist(mvp, curPOC, curRefPOC, neibPOC, neibRefPOC); + return true; + } } - return false; } @@ -1944,12 +1888,8 @@ const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx]; const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr); - if (colCU->m_predMode[partUnitIdx] == MODE_NONE) - return false; - uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK; - - if (colCU->isIntra(absPartAddr)) + if (colCU->m_predMode[partUnitIdx] == MODE_NONE || colCU->isIntra(absPartAddr)) return false; int colRefPicList = m_slice->m_bCheckLDC ? picList : m_slice->m_colFromL0Flag; @@ -1958,7 +1898,7 @@ if (colRefIdx < 0) { - colRefPicList = 1 - colRefPicList; + colRefPicList = !colRefPicList; colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr]; if (colRefIdx < 0) @@ -1973,24 +1913,52 @@ int curRefPOC = m_slice->m_refPOCList[picList][outRefIdx]; int curPOC = m_slice->m_poc; - scaleMvByPOCDist(outMV, colmv, curPOC, curRefPOC, colPOC, colRefPOC); + outMV = scaleMvByPOCDist(colmv, curPOC, curRefPOC, colPOC, colRefPOC); return true; } -void CUData::scaleMvByPOCDist(MV& outMV, const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const +// Cache the collocated MV. 
+bool CUData::getCollocatedMV(int cuAddr, int partUnitIdx, InterNeighbourMV *neighbour) const +{ + const Frame* colPic = m_slice->m_refPicList[m_slice->isInterB() && !m_slice->m_colFromL0Flag][m_slice->m_colRefIdx]; + const CUData* colCU = colPic->m_encData->getPicCTU(cuAddr); + + uint32_t absPartAddr = partUnitIdx & TMVP_UNIT_MASK; + if (colCU->m_predMode[partUnitIdx] == MODE_NONE || colCU->isIntra(absPartAddr)) + return false; + + for (int list = 0; list < 2; list++) + { + neighbour->cuAddr[list] = cuAddr; + int colRefPicList = m_slice->m_bCheckLDC ? list : m_slice->m_colFromL0Flag; + int colRefIdx = colCU->m_refIdx[colRefPicList][absPartAddr]; + + if (colRefIdx < 0) + colRefPicList = !colRefPicList; + + neighbour->refIdx[list] = colCU->m_refIdx[colRefPicList][absPartAddr]; + neighbour->refIdx[list] |= colRefPicList << 4; + + neighbour->mv[list] = colCU->m_mv[colRefPicList][absPartAddr]; + } + + return neighbour->unifiedRef != -1; +} + +MV CUData::scaleMvByPOCDist(const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const { int diffPocD = colPOC - colRefPOC; int diffPocB = curPOC - curRefPOC; if (diffPocD == diffPocB) - outMV = inMV; + return inMV; else { int tdb = x265_clip3(-128, 127, diffPocB); int tdd = x265_clip3(-128, 127, diffPocD); int x = (0x4000 + abs(tdd / 2)) / tdd; int scale = x265_clip3(-4096, 4095, (tdb * x + 32) >> 6); - outMV = scaleMv(inMV, scale); + return scaleMv(inMV, scale); } } @@ -2006,74 +1974,59 @@ + (puWidth >> (LOG2_UNIT_SIZE + 1))]; } -ScanType CUData::getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const +void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const { - uint32_t dirMode; + bool bIsIntra = isIntra(absPartIdx); - if (!bIsIntra) - return SCAN_DIAG; - - // check that MDCS can be used for this TU - if (bIsLuma) - { - if (log2TrSize > MDCS_LOG2_MAX_SIZE) - return SCAN_DIAG; + // set the group 
layout + result.log2TrSizeCG = log2TrSize - 2; - dirMode = m_lumaIntraDir[absPartIdx]; - } - else + // set the scan orders + if (bIsIntra) { - if (log2TrSize > (uint32_t)(MDCS_LOG2_MAX_SIZE - m_hChromaShift)) - return SCAN_DIAG; + uint32_t dirMode; - dirMode = m_chromaIntraDir[absPartIdx]; - if (dirMode == DM_CHROMA_IDX) + if (bIsLuma) + dirMode = m_lumaIntraDir[absPartIdx]; + else { - dirMode = m_lumaIntraDir[(m_chromaFormat == X265_CSP_I444) ? absPartIdx : absPartIdx & 0xFC]; - dirMode = (m_chromaFormat == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[dirMode] : dirMode; + dirMode = m_chromaIntraDir[absPartIdx]; + if (dirMode == DM_CHROMA_IDX) + { + dirMode = m_lumaIntraDir[(m_chromaFormat == X265_CSP_I444) ? absPartIdx : absPartIdx & 0xFC]; + dirMode = (m_chromaFormat == X265_CSP_I422) ? g_chroma422IntraAngleMappingTable[dirMode] : dirMode; + } } - } - if (abs((int)dirMode - VER_IDX) <= MDCS_ANGLE_LIMIT) - return SCAN_HOR; - else if (abs((int)dirMode - HOR_IDX) <= MDCS_ANGLE_LIMIT) - return SCAN_VER; + if (log2TrSize <= (MDCS_LOG2_MAX_SIZE - m_hChromaShift) || (bIsLuma && log2TrSize == MDCS_LOG2_MAX_SIZE)) + result.scanType = dirMode >= 22 && dirMode <= 30 ? SCAN_HOR : dirMode >= 6 && dirMode <= 14 ? 
SCAN_VER : SCAN_DIAG; + else + result.scanType = SCAN_DIAG; + } else - return SCAN_DIAG; -} - -void CUData::getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const -{ - // set the group layout - result.log2TrSizeCG = log2TrSize - 2; + result.scanType = SCAN_DIAG; - // set the scan orders - result.scanType = getCoefScanIdx(absPartIdx, log2TrSize, bIsLuma, isIntra(absPartIdx)); result.scan = g_scanOrder[result.scanType][log2TrSize - 2]; result.scanCG = g_scanOrderCG[result.scanType][result.log2TrSizeCG]; if (log2TrSize == 2) result.firstSignificanceMapContext = 0; else if (log2TrSize == 3) - { - result.firstSignificanceMapContext = 9; - if (result.scanType != SCAN_DIAG && bIsLuma) - result.firstSignificanceMapContext += 6; - } + result.firstSignificanceMapContext = (result.scanType != SCAN_DIAG && bIsLuma) ? 15 : 9; else result.firstSignificanceMapContext = bIsLuma ? 21 : 12; } #define CU_SET_FLAG(bitfield, flag, value) (bitfield) = ((bitfield) & (~(flag))) | ((~((value) - 1)) & (flag)) -void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) +void CUData::calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]) { // Initialize the coding blocks inside the CTB - for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= MIN_LOG2_CU_SIZE; log2CUSize--) + for (uint32_t log2CUSize = g_log2Size[maxCUSize], rangeCUIdx = 0; log2CUSize >= g_log2Size[minCUSize]; log2CUSize--) { uint32_t blockSize = 1 << log2CUSize; uint32_t sbWidth = 1 << (g_log2Size[maxCUSize] - log2CUSize); - int32_t lastLevelFlag = log2CUSize == MIN_LOG2_CU_SIZE; + int32_t lastLevelFlag = log2CUSize == g_log2Size[minCUSize]; for (uint32_t sbY = 0; sbY < sbWidth; sbY++) { for (uint32_t sbX = 0; sbX < sbWidth; sbX++) @@ -2094,8 +2047,8 @@ CUGeom *cu = cuDataArray + cuIdx; 
cu->log2CUSize = log2CUSize; cu->childOffset = childIdx - cuIdx; - cu->encodeIdx = g_depthScanIdx[yOffset][xOffset] * 4; - cu->numPartitions = (NUM_CU_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2)); + cu->absPartIdx = g_depthScanIdx[yOffset][xOffset] * 4; + cu->numPartitions = (NUM_4x4_PARTITIONS >> ((g_maxLog2CUSize - cu->log2CUSize) * 2)); cu->depth = g_log2Size[maxCUSize] - log2CUSize; cu->flags = 0; diff -Nru x265-1.5/source/common/cudata.h x265-1.6/source/common/cudata.h --- x265-1.5/source/common/cudata.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/cudata.h 2015-04-02 16:46:36.000000000 +0000 @@ -64,7 +64,8 @@ MD_ABOVE, // MVP of above block MD_ABOVE_RIGHT, // MVP of above right block MD_BELOW_LEFT, // MVP of below left block - MD_ABOVE_LEFT // MVP of above left block + MD_ABOVE_LEFT, // MVP of above left block + MD_COLLOCATED // MVP of temporal neighbour }; struct CUGeom @@ -82,7 +83,7 @@ uint32_t log2CUSize; // Log of the CU size. uint32_t childOffset; // offset of the first child CU from current CU - uint32_t encodeIdx; // Encoding index of this CU in terms of 4x4 blocks. + uint32_t absPartIdx; // Part index of this CU in terms of 4x4 blocks. uint32_t numPartitions; // Number of 4x4 blocks in the CU uint32_t depth; // depth of this CU relative from CTU uint32_t flags; // CU flags. @@ -94,6 +95,26 @@ int refIdx; }; +// Structure that keeps the neighbour's MV information. +struct InterNeighbourMV +{ + // Neighbour MV. The index represents the list. + MV mv[2]; + + // Collocated right bottom CU addr. + uint32_t cuAddr[2]; + + // For spatial prediction, this field contains the reference index + // in each list (-1 if not available). + // + // For temporal prediction, the first value is used for the + // prediction with list 0. The second value is used for the prediction + // with list 1. For each value, the first four bits are the reference index + // associated to the PMV, and the fifth bit is the list associated to the PMV. 
+ // if both reference indices are -1, then unifiedRef is also -1 + union { int16_t refIdx[2]; int32_t unifiedRef; }; +}; + typedef void(*cucopy_t)(uint8_t* dst, uint8_t* src); // dst and src are aligned to MIN(size, 32) typedef void(*cubcast_t)(uint8_t* dst, uint8_t val); // dst is aligned to MIN(size, 32) @@ -122,9 +143,9 @@ uint32_t m_cuPelY; // CU position within the picture, in pixels (Y) uint32_t m_numPartitions; // maximum number of 4x4 partitions within this CU - int m_chromaFormat; - int m_hChromaShift; - int m_vChromaShift; + uint32_t m_chromaFormat; + uint32_t m_hChromaShift; + uint32_t m_vChromaShift; /* Per-part data, stored contiguously */ int8_t* m_qp; // array of QP values @@ -158,7 +179,7 @@ CUData(); void initialize(const CUDataMemPool& dataPool, uint32_t depth, int csp, int instance); - static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]); + static void calcCTUGeoms(uint32_t ctuWidth, uint32_t ctuHeight, uint32_t maxCUSize, uint32_t minCUSize, CUGeom cuDataArray[CUGeom::MAX_GEOMS]); void initCTU(const Frame& frame, uint32_t cuAddr, int qp); void initSubCU(const CUData& ctu, const CUGeom& cuGeom); @@ -195,9 +216,10 @@ uint8_t getCbf(uint32_t absPartIdx, TextType ttype, uint32_t tuDepth) const { return (m_cbf[ttype][absPartIdx] >> tuDepth) & 0x1; } uint8_t getQtRootCbf(uint32_t absPartIdx) const { return m_cbf[0][absPartIdx] || m_cbf[1][absPartIdx] || m_cbf[2][absPartIdx]; } int8_t getRefQP(uint32_t currAbsIdxInCTU) const; - uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*mvFieldNeighbours)[2], uint8_t* interDirNeighbours) const; + uint32_t getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField (*candMvField)[2], uint8_t* candDir) const; void clipMv(MV& outMV) const; - int fillMvpCand(uint32_t puIdx, uint32_t absPartIdx, int picList, int refIdx, MV* amvpCand, MV* mvc) const; + int getPMV(InterNeighbourMV *neighbours, uint32_t 
reference_list, uint32_t refIdx, MV* amvpCand, MV* pmv) const; + void getNeighbourMV(uint32_t puIdx, uint32_t absPartIdx, InterNeighbourMV* neighbours) const; void getIntraTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const; void getInterTUQtDepthRange(uint32_t tuDepthRange[2], uint32_t absPartIdx) const; @@ -213,10 +235,9 @@ void getAllowedChromaDir(uint32_t absPartIdx, uint32_t* modeList) const; int getIntraDirLumaPredictor(uint32_t absPartIdx, uint32_t* intraDirPred) const; - uint32_t getSCUAddr() const { return (m_cuAddr << g_maxFullDepth * 2) + m_absIdxInCTU; } + uint32_t getSCUAddr() const { return (m_cuAddr << g_unitSizeDepth * 2) + m_absIdxInCTU; } uint32_t getCtxSplitFlag(uint32_t absPartIdx, uint32_t depth) const; uint32_t getCtxSkipFlag(uint32_t absPartIdx) const; - ScanType getCoefScanIdx(uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma, bool bIsIntra) const; void getTUEntropyCodingParameters(TUEntropyCodingParameters &result, uint32_t absPartIdx, uint32_t log2TrSize, bool bIsLuma) const; const CUData* getPULeft(uint32_t& lPartUnitIdx, uint32_t curPartUnitIdx) const; @@ -241,15 +262,18 @@ bool hasEqualMotion(uint32_t absPartIdx, const CUData& candCU, uint32_t candAbsPartIdx) const; - bool isDiffMER(int xN, int yN, int xP, int yP) const; + /* Check whether the current PU and a spatial neighboring PU are in same merge region */ + bool isDiffMER(int xN, int yN, int xP, int yP) const { return ((xN >> 2) != (xP >> 2)) || ((yN >> 2) != (yP >> 2)); } // add possible motion vector predictor candidates - bool addMVPCand(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const; - bool addMVPCandOrder(MV& mvp, int picList, int refIdx, uint32_t absPartIdx, MVP_DIR dir) const; + bool getDirectPMV(MV& pmv, InterNeighbourMV *neighbours, uint32_t picList, uint32_t refIdx) const; + bool getIndirectPMV(MV& outMV, InterNeighbourMV *neighbours, uint32_t reference_list, uint32_t refIdx) const; + void 
getInterNeighbourMV(InterNeighbourMV *neighbour, uint32_t partUnitIdx, MVP_DIR dir) const; bool getColMVP(MV& outMV, int& outRefIdx, int picList, int cuAddr, int absPartIdx) const; + bool getCollocatedMV(int cuAddr, int partUnitIdx, InterNeighbourMV *neighbour) const; - void scaleMvByPOCDist(MV& outMV, const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const; + MV scaleMvByPOCDist(const MV& inMV, int curPOC, int curRefPOC, int colPOC, int colRefPOC) const; void deriveLeftRightTopIdx(uint32_t puIdx, uint32_t& partIdxLT, uint32_t& partIdxRT) const; @@ -278,7 +302,7 @@ bool create(uint32_t depth, uint32_t csp, uint32_t numInstances) { - uint32_t numPartition = NUM_CU_PARTITIONS >> (depth * 2); + uint32_t numPartition = NUM_4x4_PARTITIONS >> (depth * 2); uint32_t cuSize = g_maxCUSize >> depth; uint32_t sizeL = cuSize * cuSize; uint32_t sizeC = sizeL >> (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp)); diff -Nru x265-1.5/source/common/dct.cpp x265-1.6/source/common/dct.cpp --- x265-1.5/source/common/dct.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/dct.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -709,14 +709,12 @@ return numSig; } - -int count_nonzero_c(const int16_t* quantCoeff, int numCoeff) +template<int trSize> +int count_nonzero_c(const int16_t* quantCoeff) { X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n"); - X265_CHECK(numCoeff > 0 && (numCoeff & 15) == 0, "numCoeff invalid %d\n", numCoeff); - int count = 0; - + int numCoeff = trSize * trSize; for (int i = 0; i < numCoeff; i++) { count += quantCoeff[i] != 0; @@ -754,6 +752,39 @@ } } +int findPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig) +{ + memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum)); + memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag)); + memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign)); + + int scanPosLast = 0; + do + { + const uint32_t cgIdx =
(uint32_t)scanPosLast >> MLS_CG_SIZE; + + const uint32_t posLast = scan[scanPosLast++]; + + const int curCoeff = coeff[posLast]; + const uint32_t isNZCoeff = (curCoeff != 0); + // get L1 sig map + // NOTE: the new algorithm is complicated, so I keep reference code here + //uint32_t posy = posLast >> log2TrSize; + //uint32_t posx = posLast - (posy << log2TrSize); + //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE); + //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY); + //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx); + numSig -= isNZCoeff; + + // TODO: optimize by instruction BTS + coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]); + coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff; + coeffNum[cgIdx] += (uint8_t)isNZCoeff; + } + while (numSig > 0); + return scanPosLast - 1; +} + } // closing - anonymous file-static namespace namespace x265 { @@ -775,12 +806,17 @@ p.cu[BLOCK_8x8].idct = idct8_c; p.cu[BLOCK_16x16].idct = idct16_c; p.cu[BLOCK_32x32].idct = idct32_c; - p.count_nonzero = count_nonzero_c; p.denoiseDct = denoiseDct_c; + p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>; + p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>; + p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>; + p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>; p.cu[BLOCK_4x4].copy_cnt = copy_count<4>; p.cu[BLOCK_8x8].copy_cnt = copy_count<8>; p.cu[BLOCK_16x16].copy_cnt = copy_count<16>; p.cu[BLOCK_32x32].copy_cnt = copy_count<32>; + + p.findPosLast = findPosLast_c; } } diff -Nru x265-1.5/source/common/deblock.cpp x265-1.6/source/common/deblock.cpp --- x265-1.5/source/common/deblock.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/deblock.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -70,7 +70,7 @@ * param Edge the direction of the edge in block boundary (horizonta/vertical), which 
is added newly */ void Deblock::deblockCU(const CUData* cu, const CUGeom& cuGeom, const int32_t dir, uint8_t blockStrength[]) { - uint32_t absPartIdx = cuGeom.encodeIdx; + uint32_t absPartIdx = cuGeom.absPartIdx; uint32_t depth = cuGeom.depth; if (cu->m_predMode[absPartIdx] == MODE_NONE) return; @@ -358,7 +358,7 @@ int16_t m5 = (int16_t)src[offset]; int16_t m2 = (int16_t)src[-offset * 2]; - int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) << 2) + m2 - m5 + 4) >> 3)); + int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3)); src[-offset] = x265_clip(m3 + (delta & maskP)); src[0] = x265_clip(m4 - (delta & maskQ)); } diff -Nru x265-1.5/source/common/framedata.h x265-1.6/source/common/framedata.h --- x265-1.5/source/common/framedata.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/framedata.h 2015-04-02 16:46:36.000000000 +0000 @@ -32,6 +32,7 @@ // private namespace class PicYuv; +class JobProvider; /* Per-frame data that is used during encodes and referenced while the picture * is available for reference. 
A FrameData instance is attached to a Frame as it @@ -52,6 +53,7 @@ PicYuv* m_reconPic; bool m_bHasReferences; /* used during DPB/RPS updates */ int m_frameEncoderID; /* the ID of the FrameEncoder encoding this frame */ + JobProvider* m_jobProvider; CUDataMemPool m_cuMemPool; CUData* m_picCTU; diff -Nru x265-1.5/source/common/intrapred.cpp x265-1.6/source/common/intrapred.cpp --- x265-1.5/source/common/intrapred.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/intrapred.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -27,6 +27,29 @@ using namespace x265; namespace { + +template<int tuSize> +void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */ +{ + const int tuSize2 = tuSize << 1; + + pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2]; + + // filtering top + for (int i = 1; i < tuSize2; i++) + filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2; + filtered[tuSize2] = topLast; + + // filtering top-left + filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2; + + // filtering left + filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2; + for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++) + filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2; + filtered[tuSize2 + tuSize2] = leftLast; +} + void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size) { // boundary pixels processing @@ -216,6 +239,11 @@ void setupIntraPrimitives_c(EncoderPrimitives& p) { + p.cu[BLOCK_4x4].intra_filter = intraFilter<4>; + p.cu[BLOCK_8x8].intra_filter = intraFilter<8>; + p.cu[BLOCK_16x16].intra_filter = intraFilter<16>; + p.cu[BLOCK_32x32].intra_filter = intraFilter<32>; + p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = planar_pred_c<2>; p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = planar_pred_c<3>; p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] =
planar_pred_c<4>; diff -Nru x265-1.5/source/common/ipfilter.cpp x265-1.6/source/common/ipfilter.cpp --- x265-1.5/source/common/ipfilter.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/ipfilter.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -34,8 +34,27 @@ #endif namespace { +template<int dstStride, int width, int height> +void pixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst) +{ + int shift = IF_INTERNAL_PREC - X265_DEPTH; + int row, col; + + for (row = 0; row < height; row++) + { + for (col = 0; col < width; col++) + { + int16_t val = src[col] << shift; + dst[col] = val - (int16_t)IF_INTERNAL_OFFS; + } + + src += srcStride; + dst += dstStride; + } +} + template<int dstStride> -void filterConvertPelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height) +void filterPixelToShort_c(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height) { int shift = IF_INTERNAL_PREC - X265_DEPTH; int row, col; @@ -65,8 +84,8 @@ } #else - ::memset(txt - marginX, txt[0], marginX); - ::memset(txt + width, txt[width - 1], marginX); + memset(txt - marginX, txt[0], marginX); + memset(txt + width, txt[width - 1], marginX); #endif txt += stride; @@ -378,7 +397,8 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \ - p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \ + p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].chroma_p2s = pixelToShort_c; #define CHROMA_422(W, H) \ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \ @@ -386,7 +406,8 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \ - p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \ + p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].chroma_p2s = pixelToShort_c; #define CHROMA_444(W, H) \ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = interp_horiz_pp_c<4, W, H>; \ @@ -394,7 +415,8 @@ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vpp = interp_vert_pp_c<4, W, H>; \ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vps = interp_vert_ps_c<4, W, H>; \ p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vsp = interp_vert_sp_c<4, W, H>; \ - p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_vss = interp_vert_ss_c<4, W, H>; \ + p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].chroma_p2s = pixelToShort_c; #define LUMA(W, H) \ p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp_horiz_pp_c<8, W, H>; \ @@ -403,7 +425,8 @@ p.pu[LUMA_ ## W ## x ## H].luma_vps = interp_vert_ps_c<8, W, H>; \ p.pu[LUMA_ ## W ## x ## H].luma_vsp = interp_vert_sp_c<8, W, H>; \ p.pu[LUMA_ ## W ## x ## H].luma_vss = interp_vert_ss_c<8, W, H>; \ - p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; + p.pu[LUMA_ ## W ## x ## H].luma_hvpp = interp_hv_pp_c<8, W, H>; \ + p.pu[LUMA_ ## W ## x ## H].filter_p2s = pixelToShort_c void setupFilterPrimitives_c(EncoderPrimitives& p) { @@ -507,11 +530,11 @@ CHROMA_444(48, 64); CHROMA_444(64, 16); CHROMA_444(16, 64); - p.luma_p2s = filterConvertPelToShort_c; + p.luma_p2s = filterPixelToShort_c; - p.chroma[X265_CSP_I444].p2s = filterConvertPelToShort_c; - p.chroma[X265_CSP_I420].p2s = filterConvertPelToShort_c; - 
p.chroma[X265_CSP_I422].p2s = filterConvertPelToShort_c; + p.chroma[X265_CSP_I444].p2s = filterPixelToShort_c; + p.chroma[X265_CSP_I420].p2s = filterPixelToShort_c; + p.chroma[X265_CSP_I422].p2s = filterPixelToShort_c; p.extendRowBorder = extendCURowColBorder; } diff -Nru x265-1.5/source/common/lowres.cpp x265-1.6/source/common/lowres.cpp --- x265-1.5/source/common/lowres.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/lowres.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -56,12 +56,11 @@ CHECKED_MALLOC(propagateCost, uint16_t, cuCount); /* allocate lowres buffers */ - for (int i = 0; i < 4; i++) - { - CHECKED_MALLOC(buffer[i], pixel, planesize); - /* initialize the whole buffer to prevent valgrind warnings on right edge */ - memset(buffer[i], 0, sizeof(pixel) * planesize); - } + CHECKED_MALLOC_ZERO(buffer[0], pixel, 4 * planesize); + + buffer[1] = buffer[0] + planesize; + buffer[2] = buffer[1] + planesize; + buffer[3] = buffer[2] + planesize; lowresPlane[0] = buffer[0] + padoffset; lowresPlane[1] = buffer[1] + padoffset; @@ -96,9 +95,7 @@ void Lowres::destroy() { - for (int i = 0; i < 4; i++) - X265_FREE(buffer[i]); - + X265_FREE(buffer[0]); X265_FREE(intraCost); X265_FREE(intraMode); @@ -126,13 +123,11 @@ } // (re) initialize lowres state -void Lowres::init(PicYuv *origPic, int poc, int type) +void Lowres::init(PicYuv *origPic, int poc) { - bIntraCalculated = false; bLastMiniGopBFrame = false; bScenecut = true; // could be a scene-cut, until ruled out by flash detection bKeyframe = false; // Not a keyframe unless identified by lookahead - sliceType = type; frameNum = poc; leadingBframes = 0; indB = 0; @@ -158,8 +153,8 @@ /* downscale and generate 4 hpel planes for lookahead */ primitives.frameInitLowres(origPic->m_picOrg[0], - lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3], - origPic->m_stride, lumaStride, width, lines); + lowresPlane[0], lowresPlane[1], lowresPlane[2], lowresPlane[3], + origPic->m_stride, lumaStride, width, lines); 
/* extend hpel planes for motion search */ extendPicBorder(lowresPlane[0], lumaStride, width, lines, origPic->m_lumaMarginX, origPic->m_lumaMarginY); diff -Nru x265-1.5/source/common/lowres.h x265-1.6/source/common/lowres.h --- x265-1.5/source/common/lowres.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/lowres.h 2015-04-02 16:46:36.000000000 +0000 @@ -114,7 +114,6 @@ int lines; // height of lowres frame in pixel lines int leadingBframes; // number of leading B frames for P or I - bool bIntraCalculated; bool bScenecut; // Set to false if the frame cannot possibly be part of a real scenecut. bool bKeyframe; bool bLastMiniGopBFrame; @@ -151,7 +150,7 @@ bool create(PicYuv *origPic, int _bframes, bool bAqEnabled); void destroy(); - void init(PicYuv *origPic, int poc, int sliceType); + void init(PicYuv *origPic, int poc); }; } diff -Nru x265-1.5/source/common/mv.h x265-1.6/source/common/mv.h --- x265-1.5/source/common/mv.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/mv.h 2015-04-02 16:46:36.000000000 +0000 @@ -56,12 +56,17 @@ MV& operator >>=(int i) { x >>= i; y >>= i; return *this; } +#if USING_FTRAPV + /* avoid signed left-shifts when -ftrapv is enabled */ + MV& operator <<=(int i) { x *= (1 << i); y *= (1 << i); return *this; } + MV operator <<(int i) const { return MV(x * (1 << i), y * (1 << i)); } +#else MV& operator <<=(int i) { x <<= i; y <<= i; return *this; } + MV operator <<(int i) const { return MV(x << i, y << i); } +#endif MV operator >>(int i) const { return MV(x >> i, y >> i); } - MV operator <<(int i) const { return MV(x << i, y << i); } - MV operator *(int16_t i) const { return MV(x * i, y * i); } MV operator -(const MV& other) const { return MV(x - other.x, y - other.y); } diff -Nru x265-1.5/source/common/param.cpp x265-1.6/source/common/param.cpp --- x265-1.5/source/common/param.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/param.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -52,9 +52,7 @@ */ 
#undef strtok_r -char* strtok_r(char * str, - const char *delim, - char ** nextp) +char* strtok_r(char* str, const char* delim, char** nextp) { if (!str) str = *nextp; @@ -87,20 +85,19 @@ } extern "C" -void x265_param_free(x265_param *p) +void x265_param_free(x265_param* p) { return x265_free(p); } extern "C" -void x265_param_default(x265_param *param) +void x265_param_default(x265_param* param) { memset(param, 0, sizeof(x265_param)); /* Applying default values to all elements in the param structure */ param->cpuid = x265::cpu_detect(); param->bEnableWavefront = 1; - param->poolNumThreads = 0; param->frameNumThreads = 0; param->logLevel = X265_LOG_INFO; @@ -127,8 +124,10 @@ /* CU definitions */ param->maxCUSize = 64; + param->minCUSize = 8; param->tuQTMaxInterDepth = 1; param->tuQTMaxIntraDepth = 1; + param->maxTUSize = 32; /* Coding Structure */ param->keyframeMin = 0; @@ -139,6 +138,7 @@ param->bFrameAdaptive = X265_B_ADAPT_TRELLIS; param->bBPyramid = 1; param->scenecutThreshold = 40; /* Magic number pulled in from x264 */ + param->lookaheadSlices = 0; /* Intra Coding Tools */ param->bEnableConstrainedIntra = 0; @@ -153,10 +153,10 @@ param->bEnableWeightedPred = 1; param->bEnableWeightedBiPred = 0; param->bEnableEarlySkip = 0; - param->bEnableCbfFastMode = 0; param->bEnableAMP = 0; param->bEnableRectInter = 0; param->rdLevel = 3; + param->rdoqLevel = 0; param->bEnableSignHiding = 1; param->bEnableTransformSkip = 0; param->bEnableTSkipFast = 0; @@ -175,12 +175,13 @@ param->crQpOffset = 0; param->rdPenalty = 0; param->psyRd = 0.3; - param->psyRdoq = 1.0; + param->psyRdoq = 0.0; param->analysisMode = 0; param->analysisFileName = NULL; param->bIntraInBFrames = 0; param->bLossless = 0; param->bCULossless = 0; + param->bEnableTemporalSubLayers = 0; /* Rate control options */ param->rc.vbvMaxBitrate = 0; @@ -232,7 +233,7 @@ } extern "C" -int x265_param_default_preset(x265_param *param, const char *preset, const char *tune) +int x265_param_default_preset(x265_param* 
param, const char* preset, const char* tune) { x265_param_default(param); @@ -245,10 +246,11 @@ if (!strcmp(preset, "ultrafast")) { - param->lookaheadDepth = 10; + param->lookaheadDepth = 5; param->scenecutThreshold = 0; // disable lookahead param->maxCUSize = 32; - param->searchRange = 25; + param->minCUSize = 16; + param->bframes = 3; param->bFrameAdaptive = 0; param->subpelRefine = 0; param->searchMethod = X265_DIA_SEARCH; @@ -267,7 +269,7 @@ { param->lookaheadDepth = 10; param->maxCUSize = 32; - param->searchRange = 44; + param->bframes = 3; param->bFrameAdaptive = 0; param->subpelRefine = 1; param->bEnableEarlySkip = 1; @@ -319,6 +321,8 @@ param->bEnableRectInter = 1; param->lookaheadDepth = 25; param->rdLevel = 4; + param->rdoqLevel = 2; + param->psyRdoq = 1.0; param->subpelRefine = 3; param->maxNumMergeCand = 3; param->searchMethod = X265_STAR_SEARCH; @@ -333,6 +337,8 @@ param->tuQTMaxInterDepth = 2; param->tuQTMaxIntraDepth = 2; param->rdLevel = 6; + param->rdoqLevel = 2; + param->psyRdoq = 1.0; param->subpelRefine = 3; param->maxNumMergeCand = 3; param->searchMethod = X265_STAR_SEARCH; @@ -348,6 +354,8 @@ param->tuQTMaxInterDepth = 3; param->tuQTMaxIntraDepth = 3; param->rdLevel = 6; + param->rdoqLevel = 2; + param->psyRdoq = 1.0; param->subpelRefine = 4; param->maxNumMergeCand = 4; param->searchMethod = X265_STAR_SEARCH; @@ -365,6 +373,8 @@ param->tuQTMaxInterDepth = 4; param->tuQTMaxIntraDepth = 4; param->rdLevel = 6; + param->rdoqLevel = 2; + param->psyRdoq = 1.0; param->subpelRefine = 5; param->maxNumMergeCand = 5; param->searchMethod = X265_STAR_SEARCH; @@ -415,11 +425,11 @@ param->deblockingFilterBetaOffset = -2; param->deblockingFilterTCOffset = -2; param->bIntraInBFrames = 0; + param->rdoqLevel = 1; param->psyRdoq = 30; param->psyRd = 0.5; param->rc.ipFactor = 1.1; param->rc.pbFactor = 1.1; - param->rc.aqMode = X265_AQ_VARIANCE; param->rc.aqStrength = 0.3; param->rc.qCompress = 0.8; } @@ -430,7 +440,7 @@ return 0; } -static int x265_atobool(const 
char *str, bool& bError) +static int x265_atobool(const char* str, bool& bError) { if (!strcmp(str, "1") || !strcmp(str, "true") || @@ -444,7 +454,7 @@ return 0; } -static double x265_atof(const char *str, bool& bError) +static double x265_atof(const char* str, bool& bError) { char *end; double v = strtod(str, &end); @@ -454,7 +464,7 @@ return v; } -static int parseName(const char *arg, const char * const * names, bool& bError) +static int parseName(const char* arg, const char* const* names, bool& bError) { for (int i = 0; names[i]; i++) if (!strcmp(arg, names[i])) @@ -471,7 +481,7 @@ #define atobool(str) (bNameWasBool = true, x265_atobool(str, bError)) extern "C" -int x265_param_parse(x265_param *p, const char *name, const char *value) +int x265_param_parse(x265_param* p, const char* name, const char* value) { bool bError = false; bool bNameWasBool = false; @@ -543,7 +553,6 @@ } } } - OPT("threads") p->poolNumThreads = atoi(value); OPT("frame-threads") p->frameNumThreads = atoi(value); OPT("pmode") p->bDistributeModeAnalysis = atobool(value); OPT("pme") p->bDistributeMotionEstimation = atobool(value); @@ -569,8 +578,10 @@ OPT("repeat-headers") p->bRepeatHeaders = atobool(value); OPT("wpp") p->bEnableWavefront = atobool(value); OPT("ctu") p->maxCUSize = (uint32_t)atoi(value); + OPT("min-cu-size") p->minCUSize = (uint32_t)atoi(value); OPT("tu-intra-depth") p->tuQTMaxIntraDepth = (uint32_t)atoi(value); OPT("tu-inter-depth") p->tuQTMaxInterDepth = (uint32_t)atoi(value); + OPT("max-tu-size") p->maxTUSize = (uint32_t)atoi(value); OPT("subme") p->subpelRefine = atoi(value); OPT("merange") p->searchRange = atoi(value); OPT("rect") p->bEnableRectInter = atobool(value); @@ -578,7 +589,6 @@ OPT("max-merge") p->maxNumMergeCand = (uint32_t)atoi(value); OPT("temporal-mvp") p->bEnableTemporalMvp = atobool(value); OPT("early-skip") p->bEnableEarlySkip = atobool(value); - OPT("fast-cbf") p->bEnableCbfFastMode = atobool(value); OPT("rdpenalty") p->rdPenalty = atoi(value); 
OPT("tskip") p->bEnableTransformSkip = atobool(value); OPT("no-tskip-fast") p->bEnableTSkipFast = atobool(value); @@ -586,9 +596,10 @@ OPT("strong-intra-smoothing") p->bEnableStrongIntraSmoothing = atobool(value); OPT("lossless") p->bLossless = atobool(value); OPT("cu-lossless") p->bCULossless = atobool(value); - OPT("constrained-intra") p->bEnableConstrainedIntra = atobool(value); + OPT2("constrained-intra", "cip") p->bEnableConstrainedIntra = atobool(value); OPT("fast-intra") p->bEnableFastIntra = atobool(value); OPT("open-gop") p->bOpenGOP = atobool(value); + OPT("lookahead-slices") p->lookaheadSlices = atoi(value); OPT("scenecut") { p->scenecutThreshold = atobool(value); @@ -598,6 +609,7 @@ p->scenecutThreshold = atoi(value); } } + OPT("temporal-layers") p->bEnableTemporalSubLayers = atobool(value); OPT("keyint") p->keyframeMax = atoi(value); OPT("min-keyint") p->keyframeMin = atoi(value); OPT("rc-lookahead") p->lookaheadDepth = atoi(value); @@ -627,6 +639,17 @@ OPT("cbqpoffs") p->cbQpOffset = atoi(value); OPT("crqpoffs") p->crQpOffset = atoi(value); OPT("rd") p->rdLevel = atoi(value); + OPT2("rdoq", "rdoq-level") + { + int bval = atobool(value); + if (bError || bval) + { + bError = false; + p->rdoqLevel = atoi(value); + } + else + p->rdoqLevel = 0; + } OPT("psy-rd") { int bval = atobool(value); @@ -817,6 +840,7 @@ OPT("stats") p->rc.statFileName = strdup(value); OPT("csv") p->csvfn = strdup(value); OPT("scaling-list") p->scalingLists = strdup(value); + OPT2("pools", "numa-pools") p->numaPools = strdup(value); OPT("lambda-file") p->rc.lambdaFileName = strdup(value); OPT("analysis-file") p->analysisFileName = strdup(value); else @@ -833,7 +857,7 @@ namespace x265 { // internal encoder functions -int x265_atoi(const char *str, bool& bError) +int x265_atoi(const char* str, bool& bError) { char *end; int v = strtol(str, &end, 0); @@ -848,7 +872,7 @@ * false || no - disabled * integer bitmap value * comma separated list of SIMD names, eg: SSE4.1,XOP */ -int 
parseCpuName(const char *value, bool& bError) +int parseCpuName(const char* value, bool& bError) { if (!value) { @@ -907,7 +931,7 @@ { 2, 1 }, }; -void setParamAspectRatio(x265_param *p, int width, int height) +void setParamAspectRatio(x265_param* p, int width, int height) { p->vui.aspectRatioIdc = X265_EXTENDED_SAR; p->vui.sarWidth = width; @@ -922,12 +946,10 @@ } } -void getParamAspectRatio(x265_param *p, int& width, int& height) +void getParamAspectRatio(x265_param* p, int& width, int& height) { if (!p->vui.aspectRatioIdc) - { width = height = 0; - } else if ((size_t)p->vui.aspectRatioIdc <= sizeof(fixedRatios) / sizeof(fixedRatios[0])) { width = fixedRatios[p->vui.aspectRatioIdc - 1][0]; @@ -939,12 +961,10 @@ height = p->vui.sarHeight; } else - { width = height = 0; - } } -static inline int _confirm(x265_param *param, bool bflag, const char* message) +static inline int _confirm(x265_param* param, bool bflag, const char* message) { if (!bflag) return 0; @@ -953,13 +973,13 @@ return 1; } -int x265_check_params(x265_param *param) +int x265_check_params(x265_param* param) { #define CHECK(expr, msg) check_failed |= _confirm(param, expr, msg) int check_failed = 0; /* abort if there is a fatal configuration problem */ CHECK(param->maxCUSize != 64 && param->maxCUSize != 32 && param->maxCUSize != 16, - "max ctu size must be 16, 32, or 64"); + "max cu size must be 16, 32, or 64"); if (check_failed == 1) return check_failed; @@ -976,6 +996,10 @@ "x265 was compiled for 8bit encodes, only 8bit internal depth supported"); #endif + CHECK(param->minCUSize != 64 && param->minCUSize != 32 && param->minCUSize != 16 && param->minCUSize != 8, + "minimim CU size must be 8, 16, 32, or 64"); + CHECK(param->minCUSize > param->maxCUSize, + "min CU size must be less than or equal to max CU size"); CHECK(param->rc.qp < -6 * (param->internalBitDepth - 8) || param->rc.qp > QP_MAX_SPEC, "QP exceeds supported range (-QpBDOffsety to 51)"); CHECK(param->fpsNum == 0 || param->fpsDenom == 0, @@ 
-992,8 +1016,8 @@ "subme must be less than or equal to X265_MAX_SUBPEL_LEVEL (7)"); CHECK(param->subpelRefine < 0, "subme must be greater than or equal to 0"); - CHECK(param->frameNumThreads < 0, - "frameNumThreads (--frame-threads) must be 0 or higher"); + CHECK(param->frameNumThreads < 0 || param->frameNumThreads > X265_MAX_FRAME_THREADS, + "frameNumThreads (--frame-threads) must be [0 .. X265_MAX_FRAME_THREADS)"); CHECK(param->cbQpOffset < -12, "Min. Chroma Cb QP Offset is -12"); CHECK(param->cbQpOffset > 12, "Max. Chroma Cb QP Offset is 12"); CHECK(param->crQpOffset < -12, "Min. Chroma Cr QP Offset is -12"); @@ -1010,7 +1034,8 @@ "QuadtreeTUMaxDepthIntra must be greater 0 and less than 5"); CHECK(maxLog2CUSize < tuQTMinLog2Size + param->tuQTMaxIntraDepth - 1, "QuadtreeTUMaxDepthInter must be less than or equal to the difference between log2(maxCUSize) and QuadtreeTULog2MinSize plus 1"); - + CHECK((param->maxTUSize != 32 && param->maxTUSize != 16 && param->maxTUSize != 8 && param->maxTUSize != 4), + "max TU size must be 4, 8, 16, or 32"); CHECK(param->maxNumMergeCand < 1, "MaxNumMergeCand must be 1 or greater."); CHECK(param->maxNumMergeCand > 5, "MaxNumMergeCand must be 5 or smaller."); @@ -1030,7 +1055,9 @@ "Rate control mode is out of range"); CHECK(param->rdLevel < 0 || param->rdLevel > 6, "RD Level is out of range"); - CHECK(param->bframes > param->lookaheadDepth && !param->rc.bStatRead, + CHECK(param->rdoqLevel < 0 || param->rdoqLevel > 2, + "RDOQ Level is out of range"); + CHECK(param->bframes && param->bframes >= param->lookaheadDepth && !param->rc.bStatRead, "Lookahead depth must be greater than the max consecutive bframe count"); CHECK(param->bframes < 0, "bframe count should be greater than zero"); @@ -1038,6 +1065,8 @@ "max consecutive bframe count must be 16 or smaller"); CHECK(param->lookaheadDepth > X265_LOOKAHEAD_MAX, "Lookahead depth must be less than 256"); + CHECK(param->lookaheadSlices > 16 || param->lookaheadSlices < 0, + "Lookahead slices 
must between 0 and 16"); CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE < param->rc.aqMode, "Aq-Mode is out of range"); CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3, @@ -1067,11 +1096,11 @@ "Color Primaries must be undef, bt709, bt470m," " bt470bg, smpte170m, smpte240m, film or bt2020"); CHECK(param->vui.transferCharacteristics < 0 - || param->vui.transferCharacteristics > 15 + || param->vui.transferCharacteristics > 17 || param->vui.transferCharacteristics == 3, "Transfer Characteristics must be undef, bt709, bt470m, bt470bg," " smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, bt1361e," - " iec61966-2-1, bt2020-10 or bt2020-12"); + " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084 or smpte-st-428"); CHECK(param->vui.matrixCoeffs < 0 || param->vui.matrixCoeffs > 10 || param->vui.matrixCoeffs == 3, @@ -1149,39 +1178,59 @@ } } -int x265_set_globals(x265_param *param) +int x265_set_globals(x265_param* param) { - static int once /* = 0 */; + uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize]; + uint32_t minLog2CUSize = (uint32_t)g_log2Size[param->minCUSize]; - if (ATOMIC_INC(&once) > 1) + if (g_ctuSizeConfigured || ATOMIC_INC(&g_ctuSizeConfigured) > 1) { - if (param->maxCUSize != g_maxCUSize) + if (g_maxCUSize != param->maxCUSize) { x265_log(param, X265_LOG_ERROR, "maxCUSize must be the same for all encoders in a single process"); return -1; } + if (g_maxCUDepth != maxLog2CUSize - minLog2CUSize) + { + x265_log(param, X265_LOG_ERROR, "maxCUDepth must be the same for all encoders in a single process"); + return -1; + } } else { - uint32_t maxLog2CUSize = (uint32_t)g_log2Size[param->maxCUSize]; - // set max CU width & height g_maxCUSize = param->maxCUSize; g_maxLog2CUSize = maxLog2CUSize; // compute actual CU depth with respect to config depth and max transform size - g_maxCUDepth = maxLog2CUSize - MIN_LOG2_CU_SIZE; - g_maxFullDepth = maxLog2CUSize - LOG2_UNIT_SIZE; + g_maxCUDepth = maxLog2CUSize - minLog2CUSize; + 
g_unitSizeDepth = maxLog2CUSize - LOG2_UNIT_SIZE; // initialize partition order uint32_t* tmp = &g_zscanToRaster[0]; - initZscanToRaster(g_maxFullDepth, 1, 0, tmp); - initRasterToZscan(g_maxFullDepth); + initZscanToRaster(g_unitSizeDepth, 1, 0, tmp); + initRasterToZscan(g_unitSizeDepth); } return 0; } -void x265_print_params(x265_param *param) +static void appendtool(x265_param* param, char* buf, size_t size, const char* toolstr) +{ + static const int overhead = (int)strlen("x265 [info]: tools: "); + + if (strlen(buf) + strlen(toolstr) + overhead >= size) + { + x265_log(param, X265_LOG_INFO, "tools:%s\n", buf); + sprintf(buf, " %s", toolstr); + } + else + { + strcat(buf, " "); + strcat(buf, toolstr); + } +} + +void x265_print_params(x265_param* param) { if (param->logLevel < X265_LOG_INFO) return; @@ -1192,8 +1241,10 @@ if (param->interlaceMode) x265_log(param, X265_LOG_INFO, "Interlaced field inputs : %s\n", x265_interlace_names[param->interlaceMode]); - x265_log(param, X265_LOG_INFO, "CTU size / RQT depth inter / intra : %d / %d / %d\n", - param->maxCUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth); + x265_log(param, X265_LOG_INFO, "Coding QT: max CU size, min CU size : %d / %d\n", param->maxCUSize, param->minCUSize); + + x265_log(param, X265_LOG_INFO, "Residual QT: max TU size, max depth : %d / %d inter / %d intra\n", + param->maxTUSize, param->tuQTMaxInterDepth, param->tuQTMaxIntraDepth); x265_log(param, X265_LOG_INFO, "ME / range / subpel / merge : %s / %d / %d / %d\n", x265_motion_est_names[param->searchMethod], param->searchRange, param->subpelRefine, param->maxNumMergeCand); @@ -1235,45 +1286,48 @@ x265_log(param, X265_LOG_INFO, "VBV/HRD buffer / max-rate / init : %d / %d / %.3f\n", param->rc.vbvBufferSize, param->rc.vbvMaxBitrate, param->rc.vbvBufferInit); - x265_log(param, X265_LOG_INFO, "tools: "); -#define TOOLOPT(FLAG, STR) if (FLAG) fprintf(stderr, "%s ", STR) + char buf[80] = { 0 }; + char tmp[40]; +#define TOOLOPT(FLAG, STR) if (FLAG) 
appendtool(param, buf, sizeof(buf), STR); +#define TOOLVAL(VAL, STR) if (VAL) { sprintf(tmp, STR, VAL); appendtool(param, buf, sizeof(buf), tmp); } TOOLOPT(param->bEnableRectInter, "rect"); TOOLOPT(param->bEnableAMP, "amp"); - fprintf(stderr, "rd=%d ", param->rdLevel); - if (param->psyRd > 0.) - fprintf(stderr, "psy-rd=%.2lf ", param->psyRd); - if (param->psyRdoq > 0.) - fprintf(stderr, "psy-rdoq=%.2lf ", param->psyRdoq); + TOOLVAL(param->rdLevel, "rd=%d"); + TOOLVAL(param->psyRd, "psy-rd=%.2lf"); + TOOLVAL(param->rdoqLevel, "rdoq=%d"); + TOOLVAL(param->psyRdoq, "psy-rdoq=%.2lf"); TOOLOPT(param->bEnableEarlySkip, "early-skip"); - TOOLOPT(param->bEnableCbfFastMode, "fast-cbf"); - if (param->noiseReductionIntra) - fprintf(stderr, "nr-intra=%d ", param->noiseReductionIntra); - if (param->noiseReductionInter) - fprintf(stderr, "nr-inter=%d ", param->noiseReductionInter); + TOOLVAL(param->noiseReductionIntra, "nr-intra=%d"); + TOOLVAL(param->noiseReductionInter, "nr-inter=%d"); + TOOLOPT(param->bEnableTSkipFast, "tskip-fast"); + TOOLOPT(!param->bEnableTSkipFast && param->bEnableTransformSkip, "tskip"); + TOOLOPT(param->bCULossless, "cu-lossless"); + TOOLOPT(param->bEnableSignHiding, "signhide"); + TOOLOPT(param->bEnableTemporalMvp, "tmvp"); + TOOLOPT(param->bEnableConstrainedIntra, "cip"); + TOOLOPT(param->bIntraInBFrames, "b-intra"); + TOOLOPT(param->bEnableFastIntra, "fast-intra"); + TOOLOPT(param->bEnableStrongIntraSmoothing, "strong-intra-smoothing"); + TOOLVAL(param->lookaheadSlices, "lslices=%d"); if (param->bEnableLoopFilter) { if (param->deblockingFilterBetaOffset || param->deblockingFilterTCOffset) - fprintf(stderr, "deblock(tC=%d:B=%d) ", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset); + { + sprintf(tmp, "deblock(tC=%d:B=%d)", param->deblockingFilterTCOffset, param->deblockingFilterBetaOffset); + appendtool(param, buf, sizeof(buf), tmp); + } else TOOLOPT(param->bEnableLoopFilter, "deblock"); } - if (param->bEnableSAO) - fprintf(stderr, 
"sao%s ", param->bSaoNonDeblocked ? "-non-deblock" : ""); - TOOLOPT(param->bEnableSignHiding, "signhide"); - TOOLOPT(param->bEnableConstrainedIntra, "cip"); - TOOLOPT(param->bIntraInBFrames, "b-intra"); - TOOLOPT(param->bEnableFastIntra, "fast-intra"); - TOOLOPT(param->bEnableTemporalMvp, "tmvp"); - if (param->bEnableTransformSkip) - fprintf(stderr, "tskip%s ", param->bEnableTSkipFast ? "-fast" : ""); - TOOLOPT(param->bCULossless, "cu-lossless"); + TOOLOPT(param->bSaoNonDeblocked, "sao-non-deblock"); + TOOLOPT(!param->bSaoNonDeblocked && param->bEnableSAO, "sao"); TOOLOPT(param->rc.bStatWrite, "stats-write"); TOOLOPT(param->rc.bStatRead, "stats-read"); - fprintf(stderr, "\n"); + x265_log(param, X265_LOG_INFO, "tools:%s\n", buf); fflush(stderr); } -char *x265_param2string(x265_param *p) +char *x265_param2string(x265_param* p) { char *buf, *s; @@ -1289,6 +1343,8 @@ s += sprintf(s, " bitdepth=%d", p->internalBitDepth); BOOL(p->bEnableWavefront, "wpp"); s += sprintf(s, " ctu=%d", p->maxCUSize); + s += sprintf(s, " min-cu-size=%d", p->minCUSize); + s += sprintf(s, " max-tu-size=%d", p->maxTUSize); s += sprintf(s, " tu-intra-depth=%d", p->tuQTMaxIntraDepth); s += sprintf(s, " tu-inter-depth=%d", p->tuQTMaxInterDepth); s += sprintf(s, " me=%d", p->searchMethod); @@ -1299,7 +1355,6 @@ s += sprintf(s, " max-merge=%d", p->maxNumMergeCand); BOOL(p->bEnableTemporalMvp, "temporal-mvp"); BOOL(p->bEnableEarlySkip, "early-skip"); - BOOL(p->bEnableCbfFastMode, "fast-cbf"); s += sprintf(s, " rdpenalty=%d", p->rdPenalty); BOOL(p->bEnableTransformSkip, "tskip"); BOOL(p->bEnableTSkipFast, "tskip-fast"); @@ -1309,11 +1364,13 @@ BOOL(p->bEnableConstrainedIntra, "constrained-intra"); BOOL(p->bEnableFastIntra, "fast-intra"); BOOL(p->bOpenGOP, "open-gop"); + BOOL(p->bEnableTemporalSubLayers, "temporal-layers"); s += sprintf(s, " interlace=%d", p->interlaceMode); s += sprintf(s, " keyint=%d", p->keyframeMax); s += sprintf(s, " min-keyint=%d", p->keyframeMin); s += sprintf(s, " scenecut=%d", 
p->scenecutThreshold); s += sprintf(s, " rc-lookahead=%d", p->lookaheadDepth); + s += sprintf(s, " lookahead-slices=%d", p->lookaheadSlices); s += sprintf(s, " bframes=%d", p->bframes); s += sprintf(s, " bframe-bias=%d", p->bFrameBias); s += sprintf(s, " b-adapt=%d", p->bFrameAdaptive); @@ -1326,9 +1383,12 @@ s += sprintf(s, " crqpoffs=%d", p->crQpOffset); s += sprintf(s, " rd=%d", p->rdLevel); s += sprintf(s, " psy-rd=%.2f", p->psyRd); + s += sprintf(s, " rdoq-level=%d", p->rdoqLevel); s += sprintf(s, " psy-rdoq=%.2f", p->psyRdoq); BOOL(p->bEnableSignHiding, "signhide"); - BOOL(p->bEnableLoopFilter, "lft"); + BOOL(p->bEnableLoopFilter, "deblock"); + if (p->bEnableLoopFilter && (p->deblockingFilterBetaOffset || p->deblockingFilterTCOffset)) + s += sprintf(s, "=%d:%d", p->deblockingFilterTCOffset, p->deblockingFilterBetaOffset); BOOL(p->bEnableSAO, "sao"); BOOL(p->bSaoNonDeblocked, "sao-non-deblock"); BOOL(p->bBPyramid, "b-pyramid"); @@ -1367,7 +1427,7 @@ return buf; } -bool parseLambdaFile(x265_param *param) +bool parseLambdaFile(x265_param* param) { if (!param->rc.lambdaFileName) return false; diff -Nru x265-1.5/source/common/picyuv.cpp x265-1.6/source/common/picyuv.cpp --- x265-1.5/source/common/picyuv.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/picyuv.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -84,7 +84,7 @@ * allocated by the same encoder. 
*/ bool PicYuv::createOffsets(const SPS& sps) { - uint32_t numPartitions = 1 << (g_maxFullDepth * 2); + uint32_t numPartitions = 1 << (g_unitSizeDepth * 2); CHECKED_MALLOC(m_cuOffsetY, intptr_t, sps.numCuInWidth * sps.numCuInHeight); CHECKED_MALLOC(m_cuOffsetC, intptr_t, sps.numCuInWidth * sps.numCuInHeight); for (uint32_t cuRow = 0; cuRow < sps.numCuInHeight; cuRow++) @@ -176,9 +176,7 @@ for (int r = 0; r < height; r++) { for (int c = 0; c < width; c++) - { yPixel[c] = (pixel)yChar[c]; - } yPixel += m_stride; yChar += pic.stride[0] / sizeof(*yChar); @@ -229,9 +227,7 @@ for (int r = 0; r < height; r++) { for (int x = 0; x < padx; x++) - { Y[width + x] = Y[width - 1]; - } Y += m_stride; } @@ -257,9 +253,7 @@ pixel *V = m_picOrg[2] + ((height >> m_vChromaShift) - 1) * m_strideC; for (int i = 1; i <= pady; i++) - { memcpy(Y + i * m_stride, Y, (width + padx) * sizeof(pixel)); - } for (int j = 1; j <= pady >> m_vChromaShift; j++) { diff -Nru x265-1.5/source/common/pixel.cpp x265-1.6/source/common/pixel.cpp --- x265-1.5/source/common/pixel.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/pixel.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -428,7 +428,7 @@ void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) { X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n"); - X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n"); + X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n"); X265_CHECK(shift >= 0, "invalid shift\n"); for (int i = 0; i < size; i++) @@ -445,7 +445,7 @@ void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift) { X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n"); - X265_CHECK((((intptr_t)src | srcStride) & 15) == 0 || size == 4, "src alignment error\n"); + X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n"); X265_CHECK(shift 
> 0, "invalid shift\n"); int16_t round = 1 << (shift - 1); @@ -462,7 +462,7 @@ template void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) { - X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n"); + X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n"); X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n"); X265_CHECK(shift >= 0, "invalid shift\n"); @@ -479,7 +479,7 @@ template void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) { - X265_CHECK((((intptr_t)dst | dstStride) & 15) == 0 || size == 4, "dst alignment error\n"); + X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n"); X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n"); X265_CHECK(shift > 0, "invalid shift\n"); @@ -522,12 +522,10 @@ #if CHECKED_BUILD || _DEBUG const int correction = (IF_INTERNAL_PREC - X265_DEPTH); -#endif - X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n"); X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n"); X265_CHECK((shift >= correction), "shift must be include factor correction, please update ASM ABI\n"); - X265_CHECK(!(round & ((1 << correction) - 1)), "round must be include factor correction, please update ASM ABI\n"); +#endif for (y = 0; y <= height - 1; y++) { diff -Nru x265-1.5/source/common/predict.cpp x265-1.6/source/common/predict.cpp --- x265-1.5/source/common/predict.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/predict.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -34,11 +34,23 @@ #pragma warning(disable: 4127) // conditional expression is constant #endif +PredictionUnit::PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx) +{ + /* address of CTU */ + ctuAddr = cu.m_cuAddr; + + /* offset of CU */ + cuAbsPartIdx = cuGeom.absPartIdx; + + /* 
offset and dimensions of PU */ + cu.getPartIndexAndSize(puIdx, puAbsPartIdx, width, height); +} + namespace { inline pixel weightBidir(int w0, int16_t P0, int w1, int16_t P1, int round, int shift, int offset) { - return x265_clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset << (shift - 1))) >> shift); + return x265_clip((w0 * (P0 + IF_INTERNAL_OFFS) + w1 * (P1 + IF_INTERNAL_OFFS) + round + (offset * (1 << (shift - 1)))) >> shift); } } @@ -67,82 +79,24 @@ return false; } -void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize) +void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma) { - int sizeIdx = log2TrSize - 2; - int tuSize = 1 << log2TrSize; - int filter = !!(g_intraFilterFlags[dirMode] & tuSize); - X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n"); - - bool bFilter = log2TrSize <= 4; - primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, intraNeighbourBuf[filter], dirMode, bFilter); -} + int refIdx0 = cu.m_refIdx[0][pu.puAbsPartIdx]; + int refIdx1 = cu.m_refIdx[1][pu.puAbsPartIdx]; -void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC, int chFmt) -{ - int tuSize = 1 << log2TrSizeC; - int tuSize2 = tuSize << 1; - - pixel* srcBuf = intraNeighbourBuf[0]; - - if (chFmt == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize)) - { - pixel* fltBuf = intraNeighbourBuf[1]; - pixel topLeft = srcBuf[0], topLast = srcBuf[tuSize2], leftLast = srcBuf[tuSize2 + tuSize2]; - - // filtering top - for (int i = 1; i < tuSize2; i++) - fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2; - fltBuf[tuSize2] = topLast; - - // filtering top-left - fltBuf[0] = ((srcBuf[0] << 1) + srcBuf[1] + srcBuf[tuSize2 + 1] + 2) >> 2; - - //filtering left - fltBuf[tuSize2 + 1] = ((srcBuf[tuSize2 + 1] << 1) + topLeft + srcBuf[tuSize2 + 2] + 2) >> 2; - for (int i = 
tuSize2 + 2; i < tuSize2 + tuSize2; i++) - fltBuf[i] = ((srcBuf[i] << 1) + srcBuf[i - 1] + srcBuf[i + 1] + 2) >> 2; - fltBuf[tuSize2 + tuSize2] = leftLast; - - srcBuf = intraNeighbourBuf[1]; - } - - int sizeIdx = log2TrSizeC - 2; - X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n"); - primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, srcBuf, dirMode, 0); -} - -void Predict::initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx) -{ - m_predSlice = cu.m_slice; - cu.getPartIndexAndSize(partIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); - m_ctuAddr = cu.m_cuAddr; - m_cuAbsPartIdx = cuGeom.encodeIdx; -} - -void Predict::prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx) -{ - initMotionCompensation(cu, cuGeom, partIdx); - - m_refIdx0 = cu.m_refIdx[0][m_puAbsPartIdx]; - m_clippedMv[0] = cu.m_mv[0][m_puAbsPartIdx]; - m_refIdx1 = cu.m_refIdx[1][m_puAbsPartIdx]; - m_clippedMv[1] = cu.m_mv[1][m_puAbsPartIdx]; - cu.clipMv(m_clippedMv[0]); - cu.clipMv(m_clippedMv[1]); -} - -void Predict::motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma) -{ - if (m_predSlice->isInterP()) + if (cu.m_slice->isInterP()) { /* P Slice */ WeightValues wv0[3]; - X265_CHECK(m_refIdx0 >= 0, "invalid P refidx\n"); - X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "P refidx out of range\n"); - const WeightParam *wp0 = m_predSlice->m_weightPredTable[0][m_refIdx0]; - if (m_predSlice->m_pps->bUseWeightPred && wp0->bPresentFlag) + X265_CHECK(refIdx0 >= 0, "invalid P refidx\n"); + X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "P refidx out of range\n"); + const WeightParam *wp0 = cu.m_slice->m_weightPredTable[0][refIdx0]; + + MV mv0 = cu.m_mv[0][pu.puAbsPartIdx]; + cu.clipMv(mv0); + + if (cu.m_slice->m_pps->bUseWeightPred && wp0->bPresentFlag) { for (int plane = 0; plane < 3; plane++) { @@ -155,18 +109,18 @@ ShortYuv& shortYuv = m_predShortYuv[0]; if (bLuma) - predInterLumaShort(shortYuv, 
*m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); if (bChroma) - predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); - addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); + addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma); } else { if (bLuma) - predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); if (bChroma) - predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); } } else @@ -176,10 +130,13 @@ WeightValues wv0[3], wv1[3]; const WeightParam *pwp0, *pwp1; - if (m_predSlice->m_pps->bUseWeightedBiPred) + X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "bidir refidx0 out of range\n"); + X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "bidir refidx1 out of range\n"); + + if (cu.m_slice->m_pps->bUseWeightedBiPred) { - pwp0 = m_refIdx0 >= 0 ? m_predSlice->m_weightPredTable[0][m_refIdx0] : NULL; - pwp1 = m_refIdx1 >= 0 ? m_predSlice->m_weightPredTable[1][m_refIdx1] : NULL; + pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL; + pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL; if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag)) { @@ -200,7 +157,7 @@ else { /* uniprediction weighting, always outputs to wv0 */ - const WeightParam* pwp = (m_refIdx0 >= 0) ? pwp0 : pwp1; + const WeightParam* pwp = (refIdx0 >= 0) ? 
pwp0 : pwp1; for (int plane = 0; plane < 3; plane++) { wv0[plane].w = pwp[plane].inputWeight; @@ -213,89 +170,92 @@ else pwp0 = pwp1 = NULL; - if (m_refIdx0 >= 0 && m_refIdx1 >= 0) + if (refIdx0 >= 0 && refIdx1 >= 0) { - /* Biprediction */ - X265_CHECK(m_refIdx0 < m_predSlice->m_numRefIdx[0], "bidir refidx0 out of range\n"); - X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "bidir refidx1 out of range\n"); + MV mv0 = cu.m_mv[0][pu.puAbsPartIdx]; + MV mv1 = cu.m_mv[1][pu.puAbsPartIdx]; + cu.clipMv(mv0); + cu.clipMv(mv1); if (bLuma) { - predInterLumaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); - predInterLumaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); + predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); + predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1); } if (bChroma) { - predInterChromaShort(m_predShortYuv[0], *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); - predInterChromaShort(m_predShortYuv[1], *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); + predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); + predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1); } if (pwp0 && pwp1 && (pwp0->bPresentFlag || pwp1->bPresentFlag)) - addWeightBi(predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma); + addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma); else - predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], m_puAbsPartIdx, m_puWidth, m_puHeight, bLuma, bChroma); + predYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma); } - else if (m_refIdx0 >= 0) + else if (refIdx0 >= 0) { - /* uniprediction to L0 */ - X265_CHECK(m_refIdx0 < 
m_predSlice->m_numRefIdx[0], "unidir refidx0 out of range\n"); + MV mv0 = cu.m_mv[0][pu.puAbsPartIdx]; + cu.clipMv(mv0); if (pwp0 && pwp0->bPresentFlag) { ShortYuv& shortYuv = m_predShortYuv[0]; if (bLuma) - predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); if (bChroma) - predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); - addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); + addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma); } else { if (bLuma) - predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); if (bChroma) - predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[0][m_refIdx0]->m_reconPic, m_clippedMv[0]); + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[0][refIdx0]->m_reconPic, mv0); } } else { + MV mv1 = cu.m_mv[1][pu.puAbsPartIdx]; + cu.clipMv(mv1); + /* uniprediction to L1 */ - X265_CHECK(m_refIdx1 >= 0, "refidx1 was not positive\n"); - X265_CHECK(m_refIdx1 < m_predSlice->m_numRefIdx[1], "unidir refidx1 out of range\n"); + X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n"); if (pwp1 && pwp1->bPresentFlag) { ShortYuv& shortYuv = m_predShortYuv[0]; if (bLuma) - predInterLumaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); + predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1); if (bChroma) - predInterChromaShort(shortYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); + predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1); - addWeightUni(predYuv, shortYuv, wv0, bLuma, bChroma); + 
addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma); } else { if (bLuma) - predInterLumaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); + predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1); if (bChroma) - predInterChromaPixel(predYuv, *m_predSlice->m_refPicList[1][m_refIdx1]->m_reconPic, m_clippedMv[1]); + predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refPicList[1][refIdx1]->m_reconPic, mv1); } } } } -void Predict::predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const +void Predict::predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const { - pixel* dst = dstYuv.getLumaAddr(m_puAbsPartIdx); + pixel* dst = dstYuv.getLumaAddr(pu.puAbsPartIdx); intptr_t dstStride = dstYuv.m_size; intptr_t srcStride = refPic.m_stride; intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride; - int partEnum = partitionFromSizes(m_puWidth, m_puHeight); - const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; + int partEnum = partitionFromSizes(pu.width, pu.height); + const pixel* src = refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset; int xFrac = mv.x & 0x3; int yFrac = mv.y & 0x3; @@ -310,32 +270,32 @@ primitives.pu[partEnum].luma_hvpp(src, srcStride, dst, dstStride, xFrac, yFrac); } -void Predict::predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const +void Predict::predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const { - int16_t* dst = dstSYuv.getLumaAddr(m_puAbsPartIdx); + int16_t* dst = dstSYuv.getLumaAddr(pu.puAbsPartIdx); int dstStride = dstSYuv.m_size; intptr_t srcStride = refPic.m_stride; intptr_t srcOffset = (mv.x >> 2) + (mv.y >> 2) * srcStride; - const pixel* src = refPic.getLumaAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + srcOffset; + const pixel* src = 
refPic.getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + srcOffset; int xFrac = mv.x & 0x3; int yFrac = mv.y & 0x3; - int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + int partEnum = partitionFromSizes(pu.width, pu.height); - X265_CHECK((m_puWidth % 4) + (m_puHeight % 4) == 0, "width or height not divisible by 4\n"); + X265_CHECK((pu.width % 4) + (pu.height % 4) == 0, "width or height not divisible by 4\n"); X265_CHECK(dstStride == MAX_CU_SIZE, "stride expected to be max cu size\n"); if (!(yFrac | xFrac)) - primitives.luma_p2s(src, srcStride, dst, m_puWidth, m_puHeight); + primitives.luma_p2s(src, srcStride, dst, pu.width, pu.height); else if (!yFrac) primitives.pu[partEnum].luma_hps(src, srcStride, dst, dstStride, xFrac, 0); else if (!xFrac) primitives.pu[partEnum].luma_vps(src, srcStride, dst, dstStride, yFrac); else { - int tmpStride = m_puWidth; + int tmpStride = pu.width; int filterSize = NTAPS_LUMA; int halfFilterSize = (filterSize >> 1); primitives.pu[partEnum].luma_hps(src, srcStride, m_immedVals, tmpStride, xFrac, 1); @@ -343,7 +303,7 @@ } } -void Predict::predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const +void Predict::predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const { intptr_t dstStride = dstYuv.m_csize; intptr_t refStride = refPic.m_strideC; @@ -353,16 +313,16 @@ intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride; - const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; - const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset; + const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset; - pixel* dstCb = dstYuv.getCbAddr(m_puAbsPartIdx); - pixel* dstCr = dstYuv.getCrAddr(m_puAbsPartIdx); + pixel* dstCb = 
dstYuv.getCbAddr(pu.puAbsPartIdx); + pixel* dstCr = dstYuv.getCrAddr(pu.puAbsPartIdx); int xFrac = mv.x & ((1 << shiftHor) - 1); int yFrac = mv.y & ((1 << shiftVer) - 1); - int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + int partEnum = partitionFromSizes(pu.width, pu.height); if (!(yFrac | xFrac)) { @@ -381,7 +341,7 @@ } else { - int extStride = m_puWidth >> m_hChromaShift; + int extStride = pu.width >> m_hChromaShift; int filterSize = NTAPS_CHROMA; int halfFilterSize = (filterSize >> 1); @@ -393,7 +353,7 @@ } } -void Predict::predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const +void Predict::predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const { intptr_t refStride = refPic.m_strideC; intptr_t dstStride = dstSYuv.m_csize; @@ -403,19 +363,19 @@ intptr_t refOffset = (mv.x >> shiftHor) + (mv.y >> shiftVer) * refStride; - const pixel* refCb = refPic.getCbAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; - const pixel* refCr = refPic.getCrAddr(m_ctuAddr, m_cuAbsPartIdx + m_puAbsPartIdx) + refOffset; + const pixel* refCb = refPic.getCbAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset; + const pixel* refCr = refPic.getCrAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) + refOffset; - int16_t* dstCb = dstSYuv.getCbAddr(m_puAbsPartIdx); - int16_t* dstCr = dstSYuv.getCrAddr(m_puAbsPartIdx); + int16_t* dstCb = dstSYuv.getCbAddr(pu.puAbsPartIdx); + int16_t* dstCr = dstSYuv.getCrAddr(pu.puAbsPartIdx); int xFrac = mv.x & ((1 << shiftHor) - 1); int yFrac = mv.y & ((1 << shiftVer) - 1); - int partEnum = partitionFromSizes(m_puWidth, m_puHeight); + int partEnum = partitionFromSizes(pu.width, pu.height); - uint32_t cxWidth = m_puWidth >> m_hChromaShift; - uint32_t cxHeight = m_puHeight >> m_vChromaShift; + uint32_t cxWidth = pu.width >> m_hChromaShift; + uint32_t cxHeight = pu.height >> m_vChromaShift; X265_CHECK(((cxWidth | cxHeight) % 2) == 0, "chroma block 
size expected to be multiple of 2\n"); @@ -447,7 +407,7 @@ } /* weighted averaging for bi-pred */ -void Predict::addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const +void Predict::addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const { int x, y; @@ -456,9 +416,9 @@ if (bLuma) { - pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); - const int16_t* srcY0 = srcYuv0.getLumaAddr(m_puAbsPartIdx); - const int16_t* srcY1 = srcYuv1.getLumaAddr(m_puAbsPartIdx); + pixel* dstY = predYuv.getLumaAddr(pu.puAbsPartIdx); + const int16_t* srcY0 = srcYuv0.getLumaAddr(pu.puAbsPartIdx); + const int16_t* srcY1 = srcYuv1.getLumaAddr(pu.puAbsPartIdx); // Luma w0 = wp0[0].w; @@ -473,9 +433,9 @@ dststride = predYuv.m_size; // TODO: can we use weight_sp here? - for (y = m_puHeight - 1; y >= 0; y--) + for (y = pu.height - 1; y >= 0; y--) { - for (x = m_puWidth - 1; x >= 0; ) + for (x = pu.width - 1; x >= 0; ) { // note: luma min width is 4 dstY[x] = weightBidir(w0, srcY0[x], w1, srcY1[x], round, shift, offset); @@ -496,12 +456,12 @@ if (bChroma) { - pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); - pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); - const int16_t* srcU0 = srcYuv0.getCbAddr(m_puAbsPartIdx); - const int16_t* srcV0 = srcYuv0.getCrAddr(m_puAbsPartIdx); - const int16_t* srcU1 = srcYuv1.getCbAddr(m_puAbsPartIdx); - const int16_t* srcV1 = srcYuv1.getCrAddr(m_puAbsPartIdx); + pixel* dstU = predYuv.getCbAddr(pu.puAbsPartIdx); + pixel* dstV = predYuv.getCrAddr(pu.puAbsPartIdx); + const int16_t* srcU0 = srcYuv0.getCbAddr(pu.puAbsPartIdx); + const int16_t* srcV0 = srcYuv0.getCrAddr(pu.puAbsPartIdx); + const int16_t* srcU1 = srcYuv1.getCbAddr(pu.puAbsPartIdx); + const int16_t* srcV1 = srcYuv1.getCrAddr(pu.puAbsPartIdx); // Chroma U w0 = wp0[1].w; @@ 
-515,8 +475,8 @@ src1Stride = srcYuv1.m_csize; dststride = predYuv.m_csize; - uint32_t cwidth = m_puWidth >> srcYuv0.m_hChromaShift; - uint32_t cheight = m_puHeight >> srcYuv0.m_vChromaShift; + uint32_t cwidth = pu.width >> srcYuv0.m_hChromaShift; + uint32_t cheight = pu.height >> srcYuv0.m_vChromaShift; // TODO: can we use weight_sp here? for (y = cheight - 1; y >= 0; y--) @@ -561,15 +521,15 @@ } /* weighted averaging for uni-pred */ -void Predict::addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const +void Predict::addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const { int w0, offset, shiftNum, shift, round; uint32_t srcStride, dstStride; if (bLuma) { - pixel* dstY = predYuv.getLumaAddr(m_puAbsPartIdx); - const int16_t* srcY0 = srcYuv.getLumaAddr(m_puAbsPartIdx); + pixel* dstY = predYuv.getLumaAddr(pu.puAbsPartIdx); + const int16_t* srcY0 = srcYuv.getLumaAddr(pu.puAbsPartIdx); // Luma w0 = wp[0].w; @@ -580,15 +540,15 @@ srcStride = srcYuv.m_size; dstStride = predYuv.m_size; - primitives.weight_sp(srcY0, dstY, srcStride, dstStride, m_puWidth, m_puHeight, w0, round, shift, offset); + primitives.weight_sp(srcY0, dstY, srcStride, dstStride, pu.width, pu.height, w0, round, shift, offset); } if (bChroma) { - pixel* dstU = predYuv.getCbAddr(m_puAbsPartIdx); - pixel* dstV = predYuv.getCrAddr(m_puAbsPartIdx); - const int16_t* srcU0 = srcYuv.getCbAddr(m_puAbsPartIdx); - const int16_t* srcV0 = srcYuv.getCrAddr(m_puAbsPartIdx); + pixel* dstU = predYuv.getCbAddr(pu.puAbsPartIdx); + pixel* dstV = predYuv.getCrAddr(pu.puAbsPartIdx); + const int16_t* srcU0 = srcYuv.getCbAddr(pu.puAbsPartIdx); + const int16_t* srcV0 = srcYuv.getCrAddr(pu.puAbsPartIdx); // Chroma U w0 = wp[1].w; @@ -600,8 +560,8 @@ srcStride = srcYuv.m_csize; dstStride = predYuv.m_csize; - uint32_t cwidth = m_puWidth >> srcYuv.m_hChromaShift; - uint32_t cheight = m_puHeight >> 
srcYuv.m_vChromaShift; + uint32_t cwidth = pu.width >> srcYuv.m_hChromaShift; + uint32_t cheight = pu.height >> srcYuv.m_vChromaShift; primitives.weight_sp(srcU0, dstU, srcStride, dstStride, cwidth, cheight, w0, round, shift, offset); @@ -615,12 +575,33 @@ } } -void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode) +void Predict::predIntraLumaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSize) +{ + int tuSize = 1 << log2TrSize; + int sizeIdx = log2TrSize - 2; + X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n"); + + int filter = !!(g_intraFilterFlags[dirMode] & tuSize); + bool bFilter = log2TrSize <= 4; + primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, intraNeighbourBuf[filter], dirMode, bFilter); +} + +void Predict::predIntraChromaAng(uint32_t dirMode, pixel* dst, intptr_t stride, uint32_t log2TrSizeC) +{ + int tuSize = 1 << log2TrSizeC; + int sizeIdx = log2TrSizeC - 2; + X265_CHECK(sizeIdx >= 0 && sizeIdx < 4, "intra block size is out of range\n"); + + int filter = !!(m_csp == X265_CSP_I444 && (g_intraFilterFlags[dirMode] & tuSize)); + primitives.cu[sizeIdx].intra_pred[dirMode](dst, stride, intraNeighbourBuf[filter], dirMode, 0); +} + +void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode) { - int tuSize = intraNeighbors.tuSize; + int tuSize = 1 << intraNeighbors.log2TrSize; int tuSize2 = tuSize << 1; - pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + pixel* adiOrigin = cu.m_encData->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx); intptr_t picStride = cu.m_encData->m_reconPic->m_stride; fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]); @@ -633,64 +614,48 @@ if (dirMode == ALL_IDX ? 
(8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize) { // generate filtered intra prediction samples - bool bStrongSmoothing = (tuSize == 32 && cu.m_slice->m_sps->bUseStrongIntraSmoothing); - if (bStrongSmoothing) + if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32) { - const int trSize = 32; - const int trSize2 = trSize << 1; const int threshold = 1 << (X265_DEPTH - 5); pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32]; - bStrongSmoothing = abs (topLeft + topLast - (topMiddle << 1)) < threshold && - abs (topLeft + leftLast - (leftMiddle << 1)) < threshold; - - if (bStrongSmoothing) + if (abs(topLeft + topLast - (topMiddle << 1)) < threshold && + abs(topLeft + leftLast - (leftMiddle << 1)) < threshold) { - // bilinear interpolation + // "strong" bilinear interpolation const int shift = 5 + 1; int init = (topLeft << shift) + tuSize; int deltaL, deltaR; - // TODO: Performance Primitive??? deltaL = leftLast - topLeft; deltaR = topLast - topLeft; fltBuf[0] = topLeft; - for (int i = 1; i < trSize2; i++) + for (int i = 1; i < tuSize2; i++) { fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering fltBuf[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering } - fltBuf[trSize2] = topLast; - fltBuf[tuSize2 + trSize2] = leftLast; - + fltBuf[tuSize2] = topLast; + fltBuf[tuSize2 + tuSize2] = leftLast; return; } } - // filtering top - for (int i = 1; i < tuSize2; i++) - fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2; - fltBuf[tuSize2] = topLast; - - // filtering top-left - fltBuf[0] = ((topLeft << 1) + refBuf[1] + refBuf[tuSize2 + 1] + 2) >> 2; - - // filtering left - fltBuf[tuSize2 + 1] = ((refBuf[tuSize2 + 1] << 1) + topLeft + refBuf[tuSize2 + 2] + 2) >> 2; - for (int i = tuSize2 + 2; i < tuSize2 + tuSize2; i++) - fltBuf[i] = ((refBuf[i] << 1) + refBuf[i - 1] + refBuf[i + 1] + 2) >> 2; - fltBuf[tuSize2 + tuSize2] = leftLast; + primitives.cu[intraNeighbors.log2TrSize - 
2].intra_filter(refBuf, fltBuf); } } -void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId) +void Predict::initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId) { - const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + const pixel* adiOrigin = cu.m_encData->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx); intptr_t picStride = cu.m_encData->m_reconPic->m_strideC; fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]); + + if (m_csp == X265_CSP_I444) + primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]); } void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors) @@ -746,7 +711,7 @@ intraNeighbors->leftUnits = leftUnits; intraNeighbors->unitWidth = 1 << log2UnitWidth; intraNeighbors->unitHeight = 1 << log2UnitHeight; - intraNeighbors->tuSize = tuSize; + intraNeighbors->log2TrSize = log2TrSize; } void Predict::fillReferenceSamples(const pixel* adiOrigin, intptr_t picStride, const IntraNeighbors& intraNeighbors, pixel dst[258]) @@ -754,7 +719,7 @@ const pixel dcValue = (pixel)(1 << (X265_DEPTH - 1)); int numIntraNeighbor = intraNeighbors.numIntraNeighbor; int totalUnits = intraNeighbors.totalUnits; - uint32_t tuSize = intraNeighbors.tuSize; + uint32_t tuSize = 1 << intraNeighbors.log2TrSize; uint32_t refSize = tuSize * 2 + 1; // Nothing is available, perform DC prediction. 
diff -Nru x265-1.5/source/common/predict.h x265-1.6/source/common/predict.h --- x265-1.5/source/common/predict.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/predict.h 2015-04-02 16:46:36.000000000 +0000 @@ -36,6 +36,17 @@ class Slice; struct CUGeom; +struct PredictionUnit +{ + uint32_t ctuAddr; // raster index of current CTU within its picture + uint32_t cuAbsPartIdx; // z-order offset of current CU within its CTU + uint32_t puAbsPartIdx; // z-order offset of current PU with its CU + int width; + int height; + + PredictionUnit(const CUData& cu, const CUGeom& cuGeom, int puIdx); +}; + class Predict { public: @@ -56,7 +67,7 @@ int leftUnits; int unitWidth; int unitHeight; - int tuSize; + int log2TrSize; bool bNeighborFlags[4 * MAX_NUM_SPU_W + 1]; }; @@ -65,38 +76,34 @@ // Unfiltered/filtered neighbours of the current partition. pixel intraNeighbourBuf[2][258]; + /* Slice information */ - const Slice* m_predSlice; int m_csp; int m_hChromaShift; int m_vChromaShift; - /* cached CU information for prediction */ - uint32_t m_ctuAddr; // raster index of current CTU within its picture - uint32_t m_cuAbsPartIdx; // z-order index of current CU within its CTU - uint32_t m_puAbsPartIdx; // z-order index of current PU with its CU - int m_puWidth; - int m_puHeight; - int m_refIdx0; - int m_refIdx1; - - /* TODO: Need to investigate clipping while writing into the TComDataCU fields itself */ - MV m_clippedMv[2]; - Predict(); ~Predict(); bool allocBuffers(int csp); // motion compensation functions - void predInterLumaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const; - void predInterChromaPixel(Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const; + void predInterLumaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const; + void predInterChromaPixel(const PredictionUnit& pu, Yuv& dstYuv, const PicYuv& refPic, const MV& mv) const; - void predInterLumaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const; - 
void predInterChromaShort(ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const; + void predInterLumaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const; + void predInterChromaShort(const PredictionUnit& pu, ShortYuv& dstSYuv, const PicYuv& refPic, const MV& mv) const; - void addWeightBi(Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const; - void addWeightUni(Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const; + void addWeightBi(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv0, const ShortYuv& srcYuv1, const WeightValues wp0[3], const WeightValues wp1[3], bool bLuma, bool bChroma) const; + void addWeightUni(const PredictionUnit& pu, Yuv& predYuv, const ShortYuv& srcYuv, const WeightValues wp[3], bool bLuma, bool bChroma) const; + + void motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma); + + /* Angular Intra */ + void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize); + void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC); + void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode); + void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId); /* Intra prediction helper functions */ static void initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *IntraNeighbors); @@ -111,19 +118,6 @@ static int isAboveRightAvailable(const CUData& cu, uint32_t partIdxRT, bool* bValidFlags, uint32_t numUnits); template static int isBelowLeftAvailable(const CUData& cu, uint32_t partIdxLB, bool* bValidFlags, uint32_t numUnits); - -public: - - /* 
prepMotionCompensation needs to be called to prepare MC with CU-relevant data */ - void initMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx); - void prepMotionCompensation(const CUData& cu, const CUGeom& cuGeom, int partIdx); - void motionCompensation(Yuv& predYuv, bool bLuma, bool bChroma); - - /* Angular Intra */ - void predIntraLumaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSize); - void predIntraChromaAng(uint32_t dirMode, pixel* pred, intptr_t stride, uint32_t log2TrSizeC, int chFmt); - void initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, int dirMode); - void initAdiPatternChroma(const CUData& cu, const CUGeom& cuGeom, uint32_t absPartIdx, const IntraNeighbors& intraNeighbors, uint32_t chromaId); }; } diff -Nru x265-1.5/source/common/primitives.cpp x265-1.6/source/common/primitives.cpp --- x265-1.5/source/common/primitives.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/primitives.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -98,6 +98,7 @@ p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp; p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg; p.chroma[X265_CSP_I444].pu[i].satd = p.pu[i].satd; + p.chroma[X265_CSP_I444].pu[i].chroma_p2s = p.pu[i].filter_p2s; } for (int i = 0; i < NUM_CU_SIZES; i++) @@ -190,7 +191,6 @@ /* cpuid >= 0 - force CPU type * cpuid < 0 - auto-detect if uninitialized */ -extern "C" void x265_setup_primitives(x265_param *param, int cpuid) { if (cpuid < 0) @@ -257,7 +257,7 @@ extern "C" { int x265_cpu_cpuid_test(void) { return 0; } void x265_cpu_emms(void) {} -void x265_cpu_cpuid(uint32_t, uint32_t *, uint32_t *, uint32_t *, uint32_t *) {} +void x265_cpu_cpuid(uint32_t, uint32_t *eax, uint32_t *, uint32_t *, uint32_t *) { *eax = 0; } void x265_cpu_xgetbv(uint32_t, uint32_t *, uint32_t *) {} } #endif diff -Nru x265-1.5/source/common/primitives.h x265-1.6/source/common/primitives.h --- 
x265-1.5/source/common/primitives.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/primitives.h 2015-04-02 16:46:36.000000000 +0000 @@ -119,6 +119,7 @@ typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter); typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma); +typedef void (*intra_filter_t)(const pixel* references, pixel* filtered); typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); @@ -136,8 +137,7 @@ typedef uint32_t (*nquant_t)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); typedef void (*dequant_scaling_t)(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift); typedef void (*dequant_normal_t)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); -typedef int (*count_nonzero_t)(const int16_t* quantCoeff, int numCoeff); - +typedef int(*count_nonzero_t)(const int16_t* quantCoeff); typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset); typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride); @@ -155,7 +155,8 @@ typedef void (*filter_sp_t) (const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_ss_t) (const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx); typedef void (*filter_hv_pp_t) (const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); -typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst, 
int width, int height); +typedef void (*filter_p2s_wxh_t)(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); +typedef void (*filter_p2s_t)(const pixel* src, intptr_t srcStride, int16_t* dst); typedef void (*copy_pp_t)(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); // dst is aligned typedef void (*copy_sp_t)(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); @@ -178,6 +179,8 @@ typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len); +typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig); + /* Function pointers to optimized encoder primitives. Each pointer can reference * either an assembly routine, a SIMD intrinsic primitive, or a C function */ struct EncoderPrimitives @@ -207,6 +210,7 @@ addAvg_t addAvg; // bidir motion compensation, uses 16bit values copy_pp_t copy_pp; + filter_p2s_t filter_p2s; } pu[NUM_PU_SIZES]; @@ -225,7 +229,7 @@ pixel_add_ps_t add_ps; blockfill_s_t blockfill_s; // block fill, for DC transforms copy_cnt_t copy_cnt; // copy coeff while counting non-zero - + count_nonzero_t count_nonzero; cpy2Dto1D_shl_t cpy2Dto1D_shl; cpy2Dto1D_shr_t cpy2Dto1D_shr; cpy1Dto2D_shl_t cpy1Dto2D_shl; @@ -246,6 +250,7 @@ transpose_t transpose; // transpose pixel block; for use with intra all-angs intra_allangs_t intra_pred_allangs; + intra_filter_t intra_filter; intra_pred_t intra_pred[NUM_INTRA_MODE]; } cu[NUM_CU_SIZES]; @@ -260,9 +265,7 @@ nquant_t nquant; dequant_scaling_t dequant_scaling; dequant_normal_t dequant_normal; - count_nonzero_t count_nonzero; denoiseDct_t denoiseDct; - scale_t scale1D_128to64; scale_t scale2D_64to32; @@ -286,7 +289,9 @@ weightp_sp_t weight_sp; weightp_pp_t weight_pp; - filter_p2s_t luma_p2s; + filter_p2s_wxh_t luma_p2s; + + findPosLast_t 
findPosLast; /* There is one set of chroma primitives per color space. An encoder will * have just a single color space and thus it will only ever use one entry @@ -311,6 +316,8 @@ filter_hps_t filter_hps; addAvg_t addAvg; copy_pp_t copy_pp; + filter_p2s_t chroma_p2s; + } pu[NUM_PU_SIZES]; @@ -329,7 +336,7 @@ } cu[NUM_CU_SIZES]; - filter_p2s_t p2s; // takes width/height as arguments + filter_p2s_wxh_t p2s; // takes width/height as arguments } chroma[X265_CSP_COUNT]; }; diff -Nru x265-1.5/source/common/quant.cpp x265-1.6/source/common/quant.cpp --- x265-1.5/source/common/quant.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/quant.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -50,7 +50,7 @@ return y + ((x - y) & ((x - y) >> (sizeof(int) * CHAR_BIT - 1))); // min(x, y) } -inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) +inline int getICRate(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, const uint32_t absGoRice, const uint32_t maxVlc, uint32_t c1c2Idx) { X265_CHECK(c1c2Idx <= 3, "c1c2Idx check failure\n"); X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); @@ -72,7 +72,6 @@ else { uint32_t symbol = diffLevel; - const uint32_t maxVlc = g_goRiceRange[absGoRice]; bool expGolomb = (symbol > maxVlc); if (expGolomb) @@ -105,6 +104,41 @@ return rate; } +#if CHECKED_BUILD || _DEBUG +inline int getICRateNegDiff(uint32_t absLevel, const int* greaterOneBits, const int* levelAbsBits) +{ + X265_CHECK(absLevel <= 2, "absLevel check failure\n"); + + int rate; + if (absLevel == 0) + rate = 0; + else if (absLevel == 2) + rate = greaterOneBits[1] + levelAbsBits[0]; + else + rate = greaterOneBits[0]; + return rate; +} +#endif + +inline int getICRateLessVlc(uint32_t absLevel, int32_t diffLevel, const uint32_t absGoRice) +{ + X265_CHECK(absGoRice <= 4, "absGoRice check failure\n"); + if (!absLevel) + { + X265_CHECK(diffLevel 
< 0, "diffLevel check failure\n"); + return 0; + } + int rate; + + uint32_t symbol = diffLevel; + uint32_t prefLen = (symbol >> absGoRice) + 1; + uint32_t numBins = fastMin(prefLen + absGoRice, 8 /* g_goRicePrefixLen[absGoRice] + absGoRice */); + + rate = numBins << 15; + + return rate; +} + /* Calculates the cost for specific absolute transform level */ inline uint32_t getICRateCost(uint32_t absLevel, int32_t diffLevel, const int* greaterOneBits, const int* levelAbsBits, uint32_t absGoRice, uint32_t c1c2Idx) { @@ -160,12 +194,12 @@ m_nr = NULL; } -bool Quant::init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy) +bool Quant::init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy) { m_entropyCoder = &entropy; - m_useRDOQ = useRDOQ; + m_rdoqLevel = rdoqLevel; m_psyRdoqScale = (int64_t)(psyScale * 256.0); - m_scalingList = &scalingList; + m_scalingList = &scalingList; m_resiDctCoeff = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE * 2); m_fencDctCoeff = m_resiDctCoeff + (MAX_TR_SIZE * MAX_TR_SIZE); m_fencShortBuf = X265_MALLOC(int16_t, MAX_TR_SIZE * MAX_TR_SIZE); @@ -382,13 +416,13 @@ } } - if (m_useRDOQ) + if (m_rdoqLevel) return rdoQuant(cu, coeff, log2TrSize, ttype, absPartIdx, usePsy); else { int deltaU[32 * 32]; - int scalingListType = ttype + (isLuma ? 3 : 0); + int scalingListType = (cu.isIntra(absPartIdx) ? 
0 : 3) + ttype; int rem = m_qpParam[ttype].rem; int per = m_qpParam[ttype].per; const int32_t* quantCoeff = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; @@ -454,9 +488,7 @@ else { int useDST = !sizeIdx && ttype == TEXT_LUMA && bIntra; - - X265_CHECK((int)numSig == primitives.count_nonzero(coeff, 1 << (log2TrSize * 2)), "numSig differ\n"); - + X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(coeff), "numSig differ\n"); // DC only if (numSig == 1 && coeff[0] != 0 && !useDST) { @@ -493,13 +525,10 @@ const int32_t* qCoef = m_scalingList->m_quantCoef[log2TrSize - 2][scalingListType][rem]; int numCoeff = 1 << (log2TrSize * 2); - uint32_t numSig = primitives.nquant(m_resiDctCoeff, qCoef, dstCoeff, qbits, add, numCoeff); - - X265_CHECK((int)numSig == primitives.count_nonzero(dstCoeff, 1 << (log2TrSize * 2)), "numSig differ\n"); + X265_CHECK((int)numSig == primitives.cu[log2TrSize - 2].count_nonzero(dstCoeff), "numSig differ\n"); if (!numSig) return 0; - uint32_t trSize = 1 << log2TrSize; int64_t lambda2 = m_qpParam[ttype].lambda2; int64_t psyScale = (m_psyRdoqScale * m_qpParam[ttype].lambda); @@ -674,9 +703,43 @@ /* record costs for sign-hiding performed at the end */ if (level) { - int rateNow = getICRate(level, level - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx); - rateIncUp[blkPos] = getICRate(level + 1, level + 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; - rateIncDown[blkPos] = getICRate(level - 1, level - 1 - baseLevel, greaterOneBits, levelAbsBits, goRiceParam, c1c2Idx) - rateNow; + const int32_t diff0 = level - 1 - baseLevel; + const int32_t diff2 = level + 1 - baseLevel; + const int32_t maxVlc = g_goRiceRange[goRiceParam]; + int rate0, rate1, rate2; + + if (diff0 < -2) // prob (92.9, 86.5, 74.5)% + { + // NOTE: Min: L - 1 - {1,2,1,3} < -2 ==> L < {0,1,0,2} + // additional L > 0, so I got (L > 0 && L < 2) ==> L = 1 + X265_CHECK(level == 1, "absLevel check failure\n"); + + 
const int rateEqual2 = greaterOneBits[1] + levelAbsBits[0];; + const int rateNotEqual2 = greaterOneBits[0]; + + rate0 = 0; + rate2 = rateEqual2; + rate1 = rateNotEqual2; + + X265_CHECK(rate1 == getICRateNegDiff(level + 0, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); + X265_CHECK(rate2 == getICRateNegDiff(level + 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); + X265_CHECK(rate0 == getICRateNegDiff(level - 1, greaterOneBits, levelAbsBits), "rate1 check failure!\n"); + } + else if (diff0 >= 0 && diff2 <= maxVlc) // prob except from above path (98.6, 97.9, 96.9)% + { + // NOTE: no c1c2 correct rate since all of rate include this factor + rate1 = getICRateLessVlc(level + 0, diff0 + 1, goRiceParam); + rate2 = getICRateLessVlc(level + 1, diff0 + 2, goRiceParam); + rate0 = getICRateLessVlc(level - 1, diff0 + 0, goRiceParam); + } + else + { + rate1 = getICRate(level + 0, diff0 + 1, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx); + rate2 = getICRate(level + 1, diff0 + 2, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx); + rate0 = getICRate(level - 1, diff0 + 0, greaterOneBits, levelAbsBits, goRiceParam, maxVlc, c1c2Idx); + } + rateIncUp[blkPos] = rate2 - rate1; + rateIncDown[blkPos] = rate0 - rate1; } else { @@ -762,7 +825,7 @@ costCoeffGroupSig[cgScanPos] = SIGCOST(estBitsSbac.significantCoeffGroupBits[sigCtx][1]); totalRdCost += costCoeffGroupSig[cgScanPos]; /* add the cost of 1 bit in significant CG bitmap */ - if (costZeroCG < totalRdCost) + if (costZeroCG < totalRdCost && m_rdoqLevel > 1) { sigCoeffGroupFlag64 &= ~cgBlkPosMask; totalRdCost = costZeroCG; @@ -870,7 +933,7 @@ bestLastIdx = scanPos + 1; bestCost = costAsLast; } - if (dstCoeff[blkPos] > 1) + if (dstCoeff[blkPos] > 1 || m_rdoqLevel == 1) { foundLast = true; break; @@ -1037,7 +1100,8 @@ const uint32_t trSizeCG = 1 << log2TrSizeCG; X265_CHECK(trSizeCG <= 8, "transform CG is too large\n"); - const uint32_t sigPos = (uint32_t)(sigCoeffGroupFlag64 >> (1 + (cgPosY 
<< log2TrSizeCG) + cgPosX)); + const uint32_t shift = (cgPosY << log2TrSizeCG) + cgPosX + 1; + const uint32_t sigPos = (uint32_t)(shift >= 64 ? 0 : sigCoeffGroupFlag64 >> shift); const uint32_t sigRight = ((int32_t)(cgPosX - (trSizeCG - 1)) >> 31) & (sigPos & 1); const uint32_t sigLower = ((int32_t)(cgPosY - (trSizeCG - 1)) >> 31) & (sigPos >> (trSizeCG - 2)) & 2; diff -Nru x265-1.5/source/common/quant.h x265-1.6/source/common/quant.h --- x265-1.5/source/common/quant.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/quant.h 2015-04-02 16:46:36.000000000 +0000 @@ -81,7 +81,7 @@ QpParam m_qpParam[3]; - bool m_useRDOQ; + int m_rdoqLevel; int64_t m_psyRdoqScale; int16_t* m_resiDctCoeff; int16_t* m_fencDctCoeff; @@ -99,7 +99,7 @@ ~Quant(); /* one-time setup */ - bool init(bool useRDOQ, double psyScale, const ScalingList& scalingList, Entropy& entropy); + bool init(int rdoqLevel, double psyScale, const ScalingList& scalingList, Entropy& entropy); bool allocNoiseReduction(const x265_param& param); /* CU setup */ diff -Nru x265-1.5/source/common/scalinglist.cpp x265-1.6/source/common/scalinglist.cpp --- x265-1.5/source/common/scalinglist.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/scalinglist.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -222,7 +222,7 @@ void ScalingList::processDefaultMarix(int sizeId, int listId) { - ::memcpy(m_scalingListCoef[sizeId][listId], getScalingListDefaultAddress(sizeId, listId), sizeof(int) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeId])); + memcpy(m_scalingListCoef[sizeId][listId], getScalingListDefaultAddress(sizeId, listId), sizeof(int) * X265_MIN(MAX_MATRIX_COEF_NUM, s_numCoefPerSize[sizeId])); m_scalingListDC[sizeId][listId] = SCALING_LIST_DC; } diff -Nru x265-1.5/source/common/shortyuv.cpp x265-1.6/source/common/shortyuv.cpp --- x265-1.5/source/common/shortyuv.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/shortyuv.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -66,9 +66,9 @@ 
void ShortYuv::clear() { - ::memset(m_buf[0], 0, (m_size * m_size) * sizeof(int16_t)); - ::memset(m_buf[1], 0, (m_csize * m_csize) * sizeof(int16_t)); - ::memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t)); + memset(m_buf[0], 0, (m_size * m_size) * sizeof(int16_t)); + memset(m_buf[1], 0, (m_csize * m_csize) * sizeof(int16_t)); + memset(m_buf[2], 0, (m_csize * m_csize) * sizeof(int16_t)); } void ShortYuv::subtract(const Yuv& srcYuv0, const Yuv& srcYuv1, uint32_t log2Size) diff -Nru x265-1.5/source/common/slice.cpp x265-1.6/source/common/slice.cpp --- x265-1.5/source/common/slice.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/slice.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -33,7 +33,7 @@ { if (m_sliceType == I_SLICE) { - ::memset(m_refPicList, 0, sizeof(m_refPicList)); + memset(m_refPicList, 0, sizeof(m_refPicList)); m_numRefIdx[1] = m_numRefIdx[0] = 0; return; } @@ -112,7 +112,7 @@ if (m_sliceType != B_SLICE) { m_numRefIdx[1] = 0; - ::memset(m_refPicList[1], 0, sizeof(m_refPicList[1])); + memset(m_refPicList[1], 0, sizeof(m_refPicList[1])); } else { @@ -183,8 +183,8 @@ uint32_t Slice::realEndAddress(uint32_t endCUAddr) const { // Calculate end address - uint32_t internalAddress = (endCUAddr - 1) % NUM_CU_PARTITIONS; - uint32_t externalAddress = (endCUAddr - 1) / NUM_CU_PARTITIONS; + uint32_t internalAddress = (endCUAddr - 1) % NUM_4x4_PARTITIONS; + uint32_t externalAddress = (endCUAddr - 1) / NUM_4x4_PARTITIONS; uint32_t xmax = m_sps->picWidthInLumaSamples - (externalAddress % m_sps->numCuInWidth) * g_maxCUSize; uint32_t ymax = m_sps->picHeightInLumaSamples - (externalAddress / m_sps->numCuInWidth) * g_maxCUSize; @@ -192,13 +192,13 @@ internalAddress--; internalAddress++; - if (internalAddress == NUM_CU_PARTITIONS) + if (internalAddress == NUM_4x4_PARTITIONS) { internalAddress = 0; externalAddress++; } - return externalAddress * NUM_CU_PARTITIONS + internalAddress; + return externalAddress * NUM_4x4_PARTITIONS + internalAddress; } diff 
-Nru x265-1.5/source/common/slice.h x265-1.6/source/common/slice.h --- x265-1.5/source/common/slice.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/slice.h 2015-04-02 16:46:36.000000000 +0000 @@ -55,9 +55,9 @@ , numberOfNegativePictures(0) , numberOfPositivePictures(0) { - ::memset(deltaPOC, 0, sizeof(deltaPOC)); - ::memset(poc, 0, sizeof(poc)); - ::memset(bUsed, 0, sizeof(bUsed)); + memset(deltaPOC, 0, sizeof(deltaPOC)); + memset(poc, 0, sizeof(poc)); + memset(bUsed, 0, sizeof(bUsed)); } void sortDeltaPOC(); @@ -149,8 +149,10 @@ struct VPS { + uint32_t maxTempSubLayers; uint32_t numReorderPics; uint32_t maxDecPicBuffering; + uint32_t maxLatencyIncrease; HRDInfo hrdParameters; ProfileTierLevel ptl; }; @@ -228,9 +230,10 @@ bool bUseAMP; // use param uint32_t maxAMPDepth; + uint32_t maxTempSubLayers; // max number of Temporal Sub layers uint32_t maxDecPicBuffering; // these are dups of VPS values + uint32_t maxLatencyIncrease; int numReorderPics; - int maxLatencyIncrease; bool bUseStrongIntraSmoothing; // use param bool bTemporalMVPEnabled; @@ -285,6 +288,14 @@ } }; +#define SET_WEIGHT(w, b, s, d, o) \ + { \ + (w).inputWeight = (s); \ + (w).log2WeightDenom = (d); \ + (w).inputOffset = (o); \ + (w).bPresentFlag = (b); \ + } + class Slice { public: diff -Nru x265-1.5/source/common/threading.cpp x265-1.6/source/common/threading.cpp --- x265-1.5/source/common/threading.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/threading.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -26,6 +26,13 @@ namespace x265 { // x265 private namespace +#if X265_ARCH_X86 && !defined(X86_64) && ENABLE_ASSEMBLY && defined(__GNUC__) +extern "C" intptr_t x265_stack_align(void (*func)(), ...); +#define x265_stack_align(func, ...) x265_stack_align((void (*)())func, __VA_ARGS__) +#else +#define x265_stack_align(func, ...) 
func(__VA_ARGS__) +#endif + /* C shim for forced stack alignment */ static void stackAlignMain(Thread *instance) { diff -Nru x265-1.5/source/common/threading.h x265-1.6/source/common/threading.h --- x265-1.5/source/common/threading.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/threading.h 2015-04-02 16:46:36.000000000 +0000 @@ -42,32 +42,32 @@ #include #endif -#ifdef __GNUC__ /* GCCs builtin atomics */ +#ifdef __GNUC__ /* GCCs builtin atomics */ #include #include -#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 -#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x) -#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask) -#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask) -#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1) -#define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1) -#define ATOMIC_ADD(ptr, value) __sync_add_and_fetch((volatile int32_t*)ptr, value) -#define GIVE_UP_TIME() usleep(0) +#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31 +#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x) +#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask) +#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask) +#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1) +#define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1) +#define ATOMIC_ADD(ptr, val) __sync_fetch_and_add((volatile int32_t*)ptr, val) +#define GIVE_UP_TIME() usleep(0) -#elif defined(_MSC_VER) /* Windows atomic intrinsics */ +#elif defined(_MSC_VER) /* Windows atomic intrinsics */ #include -#define CLZ(id, x) _BitScanReverse(&id, x) -#define CTZ(id, x) _BitScanForward(&id, x) -#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr) -#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr) -#define ATOMIC_ADD(ptr, value) InterlockedExchangeAdd((volatile LONG*)ptr, value) -#define ATOMIC_OR(ptr, mask) _InterlockedOr((volatile LONG*)ptr, 
(LONG)mask) -#define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask) -#define GIVE_UP_TIME() Sleep(0) +#define CLZ(id, x) _BitScanReverse(&id, x) +#define CTZ(id, x) _BitScanForward(&id, x) +#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr) +#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr) +#define ATOMIC_ADD(ptr, val) InterlockedExchangeAdd((volatile LONG*)ptr, val) +#define ATOMIC_OR(ptr, mask) _InterlockedOr((volatile LONG*)ptr, (LONG)mask) +#define ATOMIC_AND(ptr, mask) _InterlockedAnd((volatile LONG*)ptr, (LONG)mask) +#define GIVE_UP_TIME() Sleep(0) #endif // ifdef __GNUC__ @@ -128,8 +128,8 @@ bool timedWait(uint32_t milliseconds) { - /* returns true if event was signaled */ - return WaitForSingleObject(this->handle, milliseconds) == WAIT_OBJECT_0; + /* returns true if the wait timed out */ + return WaitForSingleObject(this->handle, milliseconds) == WAIT_TIMEOUT; } void trigger() @@ -263,10 +263,8 @@ /* blocking wait on conditional variable, mutex is atomically released * while blocked. 
When condition is signaled, mutex is re-acquired */ - while (m_counter == 0) - { + while (!m_counter) pthread_cond_wait(&m_cond, &m_mutex); - } m_counter--; pthread_mutex_unlock(&m_mutex); @@ -277,7 +275,7 @@ bool bTimedOut = false; pthread_mutex_lock(&m_mutex); - if (m_counter == 0) + if (!m_counter) { struct timeval tv; struct timespec ts; @@ -297,7 +295,10 @@ bTimedOut = pthread_cond_timedwait(&m_cond, &m_mutex, &ts) == ETIMEDOUT; } if (m_counter > 0) + { m_counter--; + bTimedOut = false; + } pthread_mutex_unlock(&m_mutex); return bTimedOut; } @@ -408,6 +409,23 @@ Lock &inst; }; +// Utility class which adds elapsed time of the scope of the object into the +// accumulator provided to the constructor +struct ScopedElapsedTime +{ + ScopedElapsedTime(int64_t& accum) : accumlatedTime(accum) { startTime = x265_mdate(); } + + ~ScopedElapsedTime() { accumlatedTime += x265_mdate() - startTime; } + +protected: + + int64_t startTime; + int64_t& accumlatedTime; + + // do not allow assignments + ScopedElapsedTime &operator =(const ScopedElapsedTime &); +}; + //< Simplistic portable thread class. 
Shutdown signalling left to derived class class Thread { diff -Nru x265-1.5/source/common/threadpool.cpp x265-1.6/source/common/threadpool.cpp --- x265-1.5/source/common/threadpool.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/threadpool.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -27,115 +27,65 @@ #include -#if MACOS -#include -#include -#endif - -namespace x265 { -// x265 private namespace - -class ThreadPoolImpl; +#if X86_64 -class PoolThread : public Thread -{ -private: +#ifdef __GNUC__ - ThreadPoolImpl &m_pool; +#define SLEEPBITMAP_CTZ(id, x) id = (unsigned long)__builtin_ctzll(x) +#define SLEEPBITMAP_OR(ptr, mask) __sync_fetch_and_or(ptr, mask) +#define SLEEPBITMAP_AND(ptr, mask) __sync_fetch_and_and(ptr, mask) - PoolThread& operator =(const PoolThread&); +#elif defined(_MSC_VER) - int m_id; +#define SLEEPBITMAP_CTZ(id, x) _BitScanForward64(&id, x) +#define SLEEPBITMAP_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, (LONG)mask) +#define SLEEPBITMAP_AND(ptr, mask) InterlockedAnd64((volatile LONG64*)ptr, (LONG)mask) - bool m_dirty; +#endif // ifdef __GNUC__ - bool m_exited; - - Event m_wakeEvent; - -public: - - PoolThread(ThreadPoolImpl& pool, int id) - : m_pool(pool) - , m_id(id) - , m_dirty(false) - , m_exited(false) - { - } - - bool isDirty() const { return m_dirty; } - - void markDirty() { m_dirty = true; } +#else - bool isExited() const { return m_exited; } +/* use 32-bit primitives defined in threading.h */ +#define SLEEPBITMAP_CTZ CTZ +#define SLEEPBITMAP_OR ATOMIC_OR +#define SLEEPBITMAP_AND ATOMIC_AND - void poke() { m_wakeEvent.trigger(); } +#endif - virtual ~PoolThread() {} +#if MACOS +#include +#include +#endif +#if HAVE_LIBNUMA +#include +#endif - void threadMain(); -}; +namespace x265 { +// x265 private namespace -class ThreadPoolImpl : public ThreadPool +class WorkerThread : public Thread { private: - bool m_ok; - int m_referenceCount; - int m_numThreads; - int m_numSleepMapWords; - PoolThread *m_threads; - volatile 
uint32_t *m_sleepMap; - - /* Lock for write access to the provider lists. Threads are - * always allowed to read m_firstProvider and follow the - * linked list. Providers must zero their m_nextProvider - * pointers before removing themselves from this list */ - Lock m_writeLock; + ThreadPool& m_pool; + int m_id; + Event m_wakeEvent; -public: - - static ThreadPoolImpl *s_instance; - static Lock s_createLock; - - JobProvider *m_firstProvider; - JobProvider *m_lastProvider; + WorkerThread& operator =(const WorkerThread&); public: - ThreadPoolImpl(int numthreads); - - virtual ~ThreadPoolImpl(); - - ThreadPoolImpl *AddReference() - { - m_referenceCount++; - - return this; - } - - void markThreadAsleep(int id); + JobProvider* m_curJobProvider; + BondedTaskGroup* m_bondMaster; - void waitForAllIdle(); + WorkerThread(ThreadPool& pool, int id) : m_pool(pool), m_id(id) {} + virtual ~WorkerThread() {} - int getThreadCount() const { return m_numThreads; } - - bool IsValid() const { return m_ok; } - - void release(); - - void Stop(); - - void enqueueJobProvider(JobProvider &); - - void dequeueJobProvider(JobProvider &); - - void FlushProviderList(); - - void pokeIdleThread(); + void threadMain(); + void awaken() { m_wakeEvent.trigger(); } }; -void PoolThread::threadMain() +void WorkerThread::threadMain() { THREAD_NAME("Worker", m_id); @@ -145,286 +95,361 @@ __attribute__((unused)) int val = nice(10); #endif - while (m_pool.IsValid()) + m_pool.setCurrentThreadAffinity(); + + sleepbitmap_t idBit = (sleepbitmap_t)1 << m_id; + m_curJobProvider = m_pool.m_jpTable[0]; + m_bondMaster = NULL; + + SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit); + SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit); + m_wakeEvent.wait(); + + while (m_pool.m_isActive) { - /* Walk list of job providers, looking for work */ - JobProvider *cur = m_pool.m_firstProvider; - while (cur) + if (m_bondMaster) { - // FindJob() may perform actual work and return true. 
If - // it does we restart the job search - if (cur->findJob(m_id) == true) - break; - - cur = cur->m_nextProvider; + m_bondMaster->processTasks(m_id); + m_bondMaster->m_exitedPeerCount.incr(); + m_bondMaster = NULL; } - // this thread has reached the end of the provider list - m_dirty = false; - - if (cur == NULL) + do { - m_pool.markThreadAsleep(m_id); - m_wakeEvent.wait(); + /* do pending work for current job provider */ + m_curJobProvider->findJob(m_id); + + /* if the current job provider still wants help, only switch to a + * higher priority provider (lower slice type). Else take the first + * available job provider with the highest priority */ + int curPriority = (m_curJobProvider->m_helpWanted) ? m_curJobProvider->m_sliceType : + INVALID_SLICE_PRIORITY + 1; + int nextProvider = -1; + for (int i = 0; i < m_pool.m_numProviders; i++) + { + if (m_pool.m_jpTable[i]->m_helpWanted && + m_pool.m_jpTable[i]->m_sliceType < curPriority) + { + nextProvider = i; + curPriority = m_pool.m_jpTable[i]->m_sliceType; + } + } + if (nextProvider != -1 && m_curJobProvider != m_pool.m_jpTable[nextProvider]) + { + SLEEPBITMAP_AND(&m_curJobProvider->m_ownerBitmap, ~idBit); + m_curJobProvider = m_pool.m_jpTable[nextProvider]; + SLEEPBITMAP_OR(&m_curJobProvider->m_ownerBitmap, idBit); + } } + while (m_curJobProvider->m_helpWanted); + + /* While the worker sleeps, a job-provider or bond-group may acquire this + * worker's sleep bitmap bit. 
Once acquired, that thread may modify + * m_bondMaster or m_curJobProvider, then waken the thread */ + SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit); + m_wakeEvent.wait(); } - m_exited = true; + SLEEPBITMAP_OR(&m_pool.m_sleepBitmap, idBit); } -void ThreadPoolImpl::markThreadAsleep(int id) +void JobProvider::tryWakeOne() { - int word = id >> 5; - uint32_t bit = 1 << (id & 31); + int id = m_pool->tryAcquireSleepingThread(m_ownerBitmap, ALL_POOL_THREADS); + if (id < 0) + { + m_helpWanted = true; + return; + } - ATOMIC_OR(&m_sleepMap[word], bit); + WorkerThread& worker = m_pool->m_workers[id]; + if (worker.m_curJobProvider != this) /* poaching */ + { + sleepbitmap_t bit = (sleepbitmap_t)1 << id; + SLEEPBITMAP_AND(&worker.m_curJobProvider->m_ownerBitmap, ~bit); + worker.m_curJobProvider = this; + SLEEPBITMAP_OR(&worker.m_curJobProvider->m_ownerBitmap, bit); + } + worker.awaken(); } -void ThreadPoolImpl::pokeIdleThread() +int ThreadPool::tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap) { - /* Find a bit in the sleeping thread bitmap and poke it awake, do - * not give up until a thread is awakened or all of them are awake */ - for (int i = 0; i < m_numSleepMapWords; i++) + unsigned long id; + + sleepbitmap_t masked = m_sleepBitmap & firstTryBitmap; + while (masked) { - uint32_t oldval = m_sleepMap[i]; - while (oldval) - { - unsigned long id; - CTZ(id, oldval); + SLEEPBITMAP_CTZ(id, masked); - uint32_t bit = 1 << id; - if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit) - { - m_threads[i * 32 + id].poke(); - return; - } + sleepbitmap_t bit = (sleepbitmap_t)1 << id; + if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit) + return (int)id; - oldval = m_sleepMap[i]; - } + masked = m_sleepBitmap & firstTryBitmap; } -} - -ThreadPoolImpl *ThreadPoolImpl::s_instance; -Lock ThreadPoolImpl::s_createLock; -/* static */ -ThreadPool *ThreadPool::allocThreadPool(int numthreads) -{ - if (ThreadPoolImpl::s_instance) - return 
ThreadPoolImpl::s_instance->AddReference(); - - /* acquire the lock to create the instance */ - ThreadPoolImpl::s_createLock.acquire(); + masked = m_sleepBitmap & secondTryBitmap; + while (masked) + { + SLEEPBITMAP_CTZ(id, masked); - if (ThreadPoolImpl::s_instance) - /* pool was allocated while we waited for the lock */ - ThreadPoolImpl::s_instance->AddReference(); - else - ThreadPoolImpl::s_instance = new ThreadPoolImpl(numthreads); - ThreadPoolImpl::s_createLock.release(); + sleepbitmap_t bit = (sleepbitmap_t)1 << id; + if (SLEEPBITMAP_AND(&m_sleepBitmap, ~bit) & bit) + return (int)id; - return ThreadPoolImpl::s_instance; -} + masked = m_sleepBitmap & secondTryBitmap; + } -ThreadPool *ThreadPool::getThreadPool() -{ - X265_CHECK(ThreadPoolImpl::s_instance, "getThreadPool() called prior to allocThreadPool()\n"); - return ThreadPoolImpl::s_instance; + return -1; } -void ThreadPoolImpl::release() +int ThreadPool::tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master) { - if (--m_referenceCount == 0) + int bondCount = 0; + do { - X265_CHECK(this == ThreadPoolImpl::s_instance, "multiple thread pool instances detected\n"); - ThreadPoolImpl::s_instance = NULL; - this->Stop(); - delete this; + int id = tryAcquireSleepingThread(peerBitmap, 0); + if (id < 0) + return bondCount; + + m_workers[id].m_bondMaster = &master; + m_workers[id].awaken(); + bondCount++; } + while (bondCount < maxPeers); + + return bondCount; } -ThreadPoolImpl::ThreadPoolImpl(int numThreads) - : m_ok(false) - , m_referenceCount(1) - , m_firstProvider(NULL) - , m_lastProvider(NULL) +ThreadPool* ThreadPool::allocThreadPools(x265_param* p, int& numPools) { - m_numSleepMapWords = (numThreads + 31) >> 5; - m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords); + enum { MAX_NODE_NUM = 127 }; + int cpusPerNode[MAX_NODE_NUM + 1]; - char *buffer = (char*)X265_MALLOC(PoolThread, numThreads); - m_threads = reinterpret_cast(buffer); - m_numThreads = numThreads; + memset(cpusPerNode, 0, 
sizeof(cpusPerNode)); + int numNumaNodes = X265_MIN(getNumaNodeCount(), MAX_NODE_NUM); + int cpuCount = getCpuCount(); + bool bNumaSupport = false; - if (m_threads && m_sleepMap) - { - for (int i = 0; i < m_numSleepMapWords; i++) - m_sleepMap[i] = 0; +#if _WIN32_WINNT >= 0x0601 + bNumaSupport = true; +#elif HAVE_LIBNUMA + bNumaSupport = numa_available() >= 0; +#endif - m_ok = true; - int i; - for (i = 0; i < numThreads; i++) - { - new (buffer)PoolThread(*this, i); - buffer += sizeof(PoolThread); - if (!m_threads[i].start()) - { - m_ok = false; - break; - } - } - if (m_ok) - waitForAllIdle(); + for (int i = 0; i < cpuCount; i++) + { +#if _WIN32_WINNT >= 0x0601 + UCHAR node; + if (GetNumaProcessorNode((UCHAR)i, &node)) + cpusPerNode[X265_MIN(node, MAX_NODE_NUM)]++; else - { - // stop threads that did start up - for (int j = 0; j < i; j++) - { - m_threads[j].poke(); - m_threads[j].stop(); - } - } +#elif HAVE_LIBNUMA + if (bNumaSupport >= 0) + cpusPerNode[X265_MIN(numa_node_of_cpu(i), MAX_NODE_NUM)]++; + else +#endif + cpusPerNode[0]++; } -} -void ThreadPoolImpl::waitForAllIdle() -{ - if (!m_ok) - return; + if (bNumaSupport && p->logLevel >= X265_LOG_DEBUG) + for (int i = 0; i < numNumaNodes; i++) + x265_log(p, X265_LOG_DEBUG, "detected NUMA node %d with %d logical cores\n", i, cpusPerNode[i]); - int id = 0; - do + /* limit nodes based on param->numaPools */ + if (p->numaPools && *p->numaPools) { - int word = id >> 5; - uint32_t bit = 1 << (id & 31); - if (m_sleepMap[word] & bit) - id++; - else + char *nodeStr = p->numaPools; + for (int i = 0; i < numNumaNodes; i++) { - GIVE_UP_TIME(); + if (!*nodeStr) + { + cpusPerNode[i] = 0; + continue; + } + else if (*nodeStr == '-') + cpusPerNode[i] = 0; + else if (*nodeStr == '*') + break; + else if (*nodeStr == '+') + ; + else + { + int count = atoi(nodeStr); + cpusPerNode[i] = X265_MIN(count, cpusPerNode[i]); + } + + /* consume current node string, comma, and white-space */ + while (*nodeStr && *nodeStr != ',') + ++nodeStr; + 
if (*nodeStr == ',' || *nodeStr == ' ') + ++nodeStr; } } - while (id < m_numThreads); -} -void ThreadPoolImpl::Stop() -{ - if (m_ok) + numPools = 0; + for (int i = 0; i < numNumaNodes; i++) { - waitForAllIdle(); - - // set invalid flag, then wake them up so they exit their main func - m_ok = false; - for (int i = 0; i < m_numThreads; i++) - { - m_threads[i].poke(); - m_threads[i].stop(); - } + if (bNumaSupport) + x265_log(p, X265_LOG_DEBUG, "NUMA node %d may use %d logical cores\n", i, cpusPerNode[i]); + if (cpusPerNode[i]) + numPools += (cpusPerNode[i] + MAX_POOL_THREADS - 1) / MAX_POOL_THREADS; } -} -ThreadPoolImpl::~ThreadPoolImpl() -{ - X265_FREE((void*)m_sleepMap); + if (!numPools) + return NULL; - if (m_threads) + if (numPools > p->frameNumThreads) { - // cleanup thread handles - for (int i = 0; i < m_numThreads; i++) - m_threads[i].~PoolThread(); + x265_log(p, X265_LOG_DEBUG, "Reducing number of thread pools for frame thread count\n"); + numPools = X265_MAX(p->frameNumThreads / 2, 1); + } - X265_FREE(reinterpret_cast(m_threads)); + ThreadPool *pools = new ThreadPool[numPools]; + if (pools) + { + int maxProviders = (p->frameNumThreads + 1 + numPools - 1) / numPools; /* +1 is Lookahead */ + int node = 0; + for (int i = 0; i < numPools; i++) + { + while (!cpusPerNode[node]) + node++; + int cores = X265_MIN(MAX_POOL_THREADS, cpusPerNode[node]); + if (!pools[i].create(cores, maxProviders, node)) + { + X265_FREE(pools); + numPools = 0; + return NULL; + } + if (numNumaNodes > 1) + x265_log(p, X265_LOG_INFO, "Thread pool %d using %d threads on NUMA node %d\n", i, cores, node); + else + x265_log(p, X265_LOG_INFO, "Thread pool created using %d threads\n", cores); + cpusPerNode[node] -= cores; + } } + else + numPools = 0; + return pools; } -void ThreadPoolImpl::enqueueJobProvider(JobProvider &p) +ThreadPool::ThreadPool() { - // only one list writer at a time - ScopedLock l(m_writeLock); - - p.m_nextProvider = NULL; - p.m_prevProvider = m_lastProvider; - m_lastProvider 
= &p; - - if (p.m_prevProvider) - p.m_prevProvider->m_nextProvider = &p; - else - m_firstProvider = &p; + memset(this, 0, sizeof(*this)); } -void ThreadPoolImpl::dequeueJobProvider(JobProvider &p) +bool ThreadPool::create(int numThreads, int maxProviders, int node) { - // only one list writer at a time - ScopedLock l(m_writeLock); - - // update pool entry pointers first - if (m_firstProvider == &p) - m_firstProvider = p.m_nextProvider; + X265_CHECK(numThreads <= MAX_POOL_THREADS, "a single thread pool cannot have more than MAX_POOL_THREADS threads\n"); - if (m_lastProvider == &p) - m_lastProvider = p.m_prevProvider; + m_numaNode = node; + m_numWorkers = numThreads; - // extract self from doubly linked lists - if (p.m_nextProvider) - p.m_nextProvider->m_prevProvider = p.m_prevProvider; + m_workers = X265_MALLOC(WorkerThread, numThreads); + /* placement new initialization */ + if (m_workers) + for (int i = 0; i < numThreads; i++) + new (m_workers + i)WorkerThread(*this, i); - if (p.m_prevProvider) - p.m_prevProvider->m_nextProvider = p.m_nextProvider; + m_jpTable = X265_MALLOC(JobProvider*, maxProviders); + m_numProviders = 0; - p.m_nextProvider = NULL; - p.m_prevProvider = NULL; + return m_workers && m_jpTable; } -/* Ensure all threads have made a full pass through the provider list, ensuring - * dequeued providers are safe for deletion. 
*/ -void ThreadPoolImpl::FlushProviderList() +bool ThreadPool::start() { - for (int i = 0; i < m_numThreads; i++) + m_isActive = true; + for (int i = 0; i < m_numWorkers; i++) { - m_threads[i].markDirty(); - m_threads[i].poke(); + if (!m_workers[i].start()) + { + m_isActive = false; + return false; + } } + return true; +} - int i; - do +void ThreadPool::stop() +{ + if (m_workers) { - for (i = 0; i < m_numThreads; i++) + m_isActive = false; + for (int i = 0; i < m_numWorkers; i++) { - if (m_threads[i].isDirty()) - { + while (!(m_sleepBitmap & ((sleepbitmap_t)1 << i))) GIVE_UP_TIME(); - break; - } + m_workers[i].awaken(); + m_workers[i].stop(); } } - while (i < m_numThreads); } -void JobProvider::flush() +ThreadPool::~ThreadPool() { - if (m_nextProvider || m_prevProvider) - dequeue(); - dynamic_cast(m_pool)->FlushProviderList(); + if (m_workers) + { + for (int i = 0; i < m_numWorkers; i++) + m_workers[i].~WorkerThread(); + } + + X265_FREE(m_workers); + X265_FREE(m_jpTable); } -void JobProvider::enqueue() +void ThreadPool::setCurrentThreadAffinity() { - // Add this provider to the end of the thread pool's job provider list - X265_CHECK(!m_nextProvider && !m_prevProvider && m_pool, "job provider was already queued\n"); - m_pool->enqueueJobProvider(*this); - m_pool->pokeIdleThread(); + setThreadNodeAffinity(m_numaNode); +} + +/* static */ +void ThreadPool::setThreadNodeAffinity(int numaNode) +{ +#if _WIN32_WINNT >= 0x0601 + GROUP_AFFINITY groupAffinity; + if (GetNumaNodeProcessorMaskEx((USHORT)numaNode, &groupAffinity)) + { + if (SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR)groupAffinity.Mask)) + return; + } + x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode); +#elif HAVE_LIBNUMA + if (numa_available() >= 0) + { + numa_run_on_node(numaNode); + numa_set_preferred(numaNode); + numa_set_localalloc(); + return; + } + x265_log(NULL, X265_LOG_ERROR, "unable to set thread affinity to NUMA node %d\n", numaNode); +#else + 
(void)numaNode; +#endif } -void JobProvider::dequeue() +/* static */ +int ThreadPool::getNumaNodeCount() { - // Remove this provider from the thread pool's job provider list - m_pool->dequeueJobProvider(*this); - // Ensure no jobs were missed while the provider was being removed - m_pool->pokeIdleThread(); +#if _WIN32_WINNT >= 0x0601 + ULONG num = 1; + if (GetNumaHighestNodeNumber(&num)) + num++; + return (int)num; +#elif HAVE_LIBNUMA + if (numa_available() >= 0) + return numa_max_node() + 1; + else + return 1; +#else + return 1; +#endif } -int getCpuCount() +/* static */ +int ThreadPool::getCpuCount() { #if _WIN32 SYSTEM_INFO sysinfo; @@ -450,8 +475,9 @@ } return count; -#else // if _WIN32 +#else return 2; // default to 2 threads, everywhere else -#endif // if _WIN32 +#endif } + } // end namespace x265 diff -Nru x265-1.5/source/common/threadpool.h x265-1.6/source/common/threadpool.h --- x265-1.5/source/common/threadpool.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/threadpool.h 2015-04-02 16:46:36.000000000 +0000 @@ -25,85 +25,148 @@ #define X265_THREADPOOL_H #include "common.h" +#include "threading.h" namespace x265 { // x265 private namespace class ThreadPool; +class WorkerThread; +class BondedTaskGroup; -int getCpuCount(); +#if X86_64 +typedef uint64_t sleepbitmap_t; +#else +typedef uint32_t sleepbitmap_t; +#endif + +static const sleepbitmap_t ALL_POOL_THREADS = (sleepbitmap_t)-1; +enum { MAX_POOL_THREADS = sizeof(sleepbitmap_t) * 8 }; +enum { INVALID_SLICE_PRIORITY = 10 }; // a value larger than any X265_TYPE_* macro -// Any class that wants to distribute work to the thread pool must -// derive from JobProvider and implement FindJob(). +// Frame level job providers. 
FrameEncoder and Lookahead derive from +// this class and implement findJob() class JobProvider { -protected: - - ThreadPool *m_pool; - - JobProvider *m_nextProvider; - JobProvider *m_prevProvider; - public: - JobProvider(ThreadPool *p) : m_pool(p), m_nextProvider(0), m_prevProvider(0) {} + ThreadPool* m_pool; + sleepbitmap_t m_ownerBitmap; + int m_jpId; + int m_sliceType; + bool m_helpWanted; + bool m_isFrameEncoder; /* rather ugly hack, but nothing better presents itself */ + + JobProvider() + : m_pool(NULL) + , m_ownerBitmap(0) + , m_jpId(-1) + , m_sliceType(INVALID_SLICE_PRIORITY) + , m_helpWanted(false) + , m_isFrameEncoder(false) + {} virtual ~JobProvider() {} - void setThreadPool(ThreadPool *p) { m_pool = p; } - - // Register this job provider with the thread pool, jobs are available - void enqueue(); - - // Remove this job provider from the thread pool, all jobs complete - void dequeue(); + // Worker threads will call this method to perform work + virtual void findJob(int workerThreadId) = 0; - // Worker threads will call this method to find a job. Must return true if - // work was completed. False if no work was available. - virtual bool findJob(int threadId) = 0; - - // All derived objects that call Enqueue *MUST* call flush before allowing - // their object to be destroyed, otherwise you will see random crashes involving - // partially freed vtables and you will be unhappy - void flush(); - - friend class ThreadPoolImpl; - friend class PoolThread; + // Will awaken one idle thread, preferring a thread which most recently + // performed work for this provider. + void tryWakeOne(); }; -// Abstract interface to ThreadPool. Each encoder instance should call -// AllocThreadPool() to get a handle to the singleton object and then make -// it available to their job provider structures (wave-front frame encoders, -// etc). 
class ThreadPool { -protected: - - // Destructor is inaccessable, force the use of reference counted Release() - ~ThreadPool() {} - - virtual void enqueueJobProvider(JobProvider &) = 0; - - virtual void dequeueJobProvider(JobProvider &) = 0; - public: - // When numthreads == 0, a default thread count is used. A request may grow - // an existing pool but it will never shrink. - static ThreadPool *allocThreadPool(int numthreads = 0); - - static ThreadPool *getThreadPool(); - - virtual void pokeIdleThread() = 0; - - // The pool is reference counted so all calls to AllocThreadPool() should be - // followed by a call to Release() - virtual void release() = 0; + sleepbitmap_t m_sleepBitmap; + int m_numProviders; + int m_numWorkers; + int m_numaNode; + bool m_isActive; + + JobProvider** m_jpTable; + WorkerThread* m_workers; + + ThreadPool(); + ~ThreadPool(); + + bool create(int numThreads, int maxProviders, int node); + bool start(); + void stop(); + void setCurrentThreadAffinity(); + int tryAcquireSleepingThread(sleepbitmap_t firstTryBitmap, sleepbitmap_t secondTryBitmap); + int tryBondPeers(int maxPeers, sleepbitmap_t peerBitmap, BondedTaskGroup& master); + + static ThreadPool* allocThreadPools(x265_param* p, int& numPools); + + static int getCpuCount(); + static int getNumaNodeCount(); + static void setThreadNodeAffinity(int node); +}; - virtual int getThreadCount() const = 0; +/* Any worker thread may enlist the help of idle worker threads from the same + * job provider. They must derive from this class and implement the + * processTasks() method. To use, an instance must be instantiated by a worker + * thread (referred to as the master thread) and then tryBondPeers() must be + * called. If it returns non-zero then some number of slave worker threads are + * already in the process of calling your processTasks() function. The master + * thread should participate and call processTasks() itself. 
When + * waitForExit() returns, all bonded peer threads are guaranteed to have + * exited processTasks(). Since the thread count is small, it uses explicit + * locking instead of atomic counters and bitmasks */ +class BondedTaskGroup +{ +public: - friend class JobProvider; + Lock m_lock; + ThreadSafeInteger m_exitedPeerCount; + int m_bondedPeerCount; + int m_jobTotal; + int m_jobAcquired; + + BondedTaskGroup() { m_bondedPeerCount = m_jobTotal = m_jobAcquired = 0; } + + /* Do not allow the instance to be destroyed before all bonded peers have + * exited processTasks() */ + ~BondedTaskGroup() { waitForExit(); } + + /* Try to enlist the help of idle worker threads most recently associated + * with the given job provider and "bond" them to work on your tasks. Up to + * maxPeers worker threads will call your processTasks() method. */ + int tryBondPeers(JobProvider& jp, int maxPeers) + { + int count = jp.m_pool->tryBondPeers(maxPeers, jp.m_ownerBitmap, *this); + m_bondedPeerCount += count; + return count; + } + + /* Try to enlist the help of any idle worker threads and "bond" them to work + * on your tasks. Up to maxPeers worker threads will call your + * processTasks() method. */ + int tryBondPeers(ThreadPool& pool, int maxPeers) + { + int count = pool.tryBondPeers(maxPeers, ALL_POOL_THREADS, *this); + m_bondedPeerCount += count; + return count; + } + + /* Returns when all bonded peers have exited processTasks(). It does *NOT* + * ensure all tasks are completed (but this is generally implied). */ + void waitForExit() + { + int exited = m_exitedPeerCount.get(); + while (m_bondedPeerCount != exited) + exited = m_exitedPeerCount.waitForChange(exited); + } + + /* Derived classes must define this method. The worker thread ID may be + * used to index into thread local data, or ignored.
The ID will be between + * 0 and the ThreadPool's m_numWorkers - 1 */ + virtual void processTasks(int workerThreadId) = 0; }; + } // end namespace x265 #endif // ifndef X265_THREADPOOL_H diff -Nru x265-1.5/source/common/wavefront.cpp x265-1.6/source/common/wavefront.cpp --- x265-1.5/source/common/wavefront.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/wavefront.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -54,13 +54,13 @@ void WaveFront::clearEnabledRowMask() { memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords); + memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords); } void WaveFront::enqueueRow(int row) { uint32_t bit = 1 << (row & 31); ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit); - if (m_pool) m_pool->pokeIdleThread(); } void WaveFront::enableRow(int row) @@ -80,11 +80,11 @@ return !!(ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit); } -bool WaveFront::findJob(int threadId) +void WaveFront::findJob(int threadId) { unsigned long id; - // thread safe + /* Loop over each word until all available rows are finished */ for (int w = 0; w < m_numWords; w++) { uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w]; @@ -97,15 +97,14 @@ { /* we cleared the bit, we get to process the row */ processRow(w * 32 + id, threadId); - return true; + m_helpWanted = true; + return; /* check for a higher priority task */ } - // some other thread cleared the bit, try another bit oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w]; } } - // made it through the bitmap without finding any enqueued rows - return false; + m_helpWanted = false; } } diff -Nru x265-1.5/source/common/wavefront.h x265-1.6/source/common/wavefront.h --- x265-1.5/source/common/wavefront.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/wavefront.h 2015-04-02 16:46:36.000000000 +0000 @@ -53,10 +53,9 @@ public: - WaveFront(ThreadPool *pool) - : JobProvider(pool) - ,
m_internalDependencyBitmap(0) - , m_externalDependencyBitmap(0) + WaveFront() + : m_internalDependencyBitmap(NULL) + , m_externalDependencyBitmap(NULL) {} virtual ~WaveFront(); @@ -86,8 +85,8 @@ // WaveFront's implementation of JobProvider::findJob. Consults // m_queuedBitmap and calls ProcessRow(row) for lowest numbered queued row - // or returns false - bool findJob(int threadId); + // processes available rows and returns when no work remains + void findJob(int threadId); // Start or resume encode processing of this row, must be implemented by // derived classes. diff -Nru x265-1.5/source/common/x86/asm-primitives.cpp x265-1.6/source/common/x86/asm-primitives.cpp --- x265-1.5/source/common/x86/asm-primitives.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/asm-primitives.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -44,6 +44,11 @@ p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## _16x16_ ## cpu; \ p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \ p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu +#define ALL_LUMA_CU_TYPED_S(prim, fncdef, fname, cpu) \ + p.cu[BLOCK_8x8].prim = fncdef x265_ ## fname ## 8_ ## cpu; \ + p.cu[BLOCK_16x16].prim = fncdef x265_ ## fname ## 16_ ## cpu; \ + p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## 32_ ## cpu; \ + p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## 64_ ## cpu #define ALL_LUMA_TU_TYPED(prim, fncdef, fname, cpu) \ p.cu[BLOCK_4x4].prim = fncdef x265_ ## fname ## _4x4_ ## cpu; \ p.cu[BLOCK_8x8].prim = fncdef x265_ ## fname ## _8x8_ ## cpu; \ @@ -61,6 +66,7 @@ p.cu[BLOCK_32x32].prim = fncdef x265_ ## fname ## _32x32_ ## cpu; \ p.cu[BLOCK_64x64].prim = fncdef x265_ ## fname ## _64x64_ ## cpu; #define ALL_LUMA_CU(prim, fname, cpu) ALL_LUMA_CU_TYPED(prim, , fname, cpu) +#define ALL_LUMA_CU_S(prim, fname, cpu) ALL_LUMA_CU_TYPED_S(prim, , fname, cpu) #define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu) #define ALL_LUMA_BLOCKS(prim, fname, cpu) 
ALL_LUMA_BLOCKS_TYPED(prim, , fname, cpu) #define ALL_LUMA_TU_S(prim, fname, cpu) ALL_LUMA_TU_TYPED_S(prim, , fname, cpu) @@ -179,7 +185,6 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef x265_ ## fname ## _8x32_ ## cpu #define ALL_CHROMA_420_4x4_PU(prim, fname, cpu) ALL_CHROMA_420_4x4_PU_TYPED(prim, , fname, cpu) - #define ALL_CHROMA_422_CU_TYPED(prim, fncdef, fname, cpu) \ p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].prim = fncdef x265_ ## fname ## _4x8_ ## cpu; \ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].prim = fncdef x265_ ## fname ## _8x16_ ## cpu; \ @@ -791,6 +796,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 16bpp { +#if !defined(X86_64) +#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF" +#endif + if (cpuMask & X265_CPU_SSE2) { /* We do not differentiate CPUs which support MMX and not SSE2. We only check @@ -863,6 +872,16 @@ ALL_LUMA_TU_S(calcresidual, getResidual, sse2); ALL_LUMA_TU_S(transpose, transpose, sse2); + p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; + p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; + p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2; + p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2; + + p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2; + p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = x265_intra_pred_planar8_sse2; + p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2; + p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_sse2; + p.cu[BLOCK_4x4].sse_ss = x265_pixel_ssd_ss_4x4_mmx2; ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2); @@ -872,10 +891,10 @@ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_32x64_sse2; p.cu[BLOCK_4x4].dct = x265_dct4_sse2; + p.cu[BLOCK_8x8].dct = x265_dct8_sse2; p.cu[BLOCK_4x4].idct = x265_idct4_sse2; -#if X86_64 p.cu[BLOCK_8x8].idct = x265_idct8_sse2; -#endif + 
p.idst4x4 = x265_idst4_sse2; LUMA_VSS_FILTERS(sse2); @@ -894,7 +913,10 @@ p.dst4x4 = x265_dst4_ssse3; p.cu[BLOCK_8x8].idct = x265_idct8_ssse3; - p.count_nonzero = x265_count_nonzero_ssse3; + p.cu[BLOCK_4x4].count_nonzero = x265_count_nonzero_4x4_ssse3; + p.cu[BLOCK_8x8].count_nonzero = x265_count_nonzero_8x8_ssse3; + p.cu[BLOCK_16x16].count_nonzero = x265_count_nonzero_16x16_ssse3; + p.cu[BLOCK_32x32].count_nonzero = x265_count_nonzero_32x32_ssse3; p.frameInitLowres = x265_frame_init_lowres_core_ssse3; } if (cpuMask & X265_CPU_SSE4) @@ -931,19 +953,30 @@ p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4; p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4; -#if X86_64 + // TODO: check POPCNT flag! + ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4); ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4); ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4); -#endif } if (cpuMask & X265_CPU_AVX) { // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx; fails tests + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = x265_pixel_satd_16x24_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = x265_pixel_satd_32x48_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = x265_pixel_satd_24x64_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = x265_pixel_satd_8x64_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = x265_pixel_satd_8x12_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = x265_pixel_satd_12x32_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = x265_pixel_satd_4x32_avx; + ALL_LUMA_PU(satd, pixel_satd, avx); ASSIGN_SA8D(avx); LUMA_VAR(avx); p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx; p.ssim_end_4 = x265_pixel_ssim_end4_avx; + + // copy_pp primitives + // 16 x N p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx; p.pu[LUMA_16x4].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x4_avx; p.pu[LUMA_16x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x8_avx; @@ -963,11 +996,82 @@ 
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x16_avx; p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x24_avx; p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_16x32_avx; + + // 24 X N + p.pu[LUMA_24x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x32_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x32_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_24x64_avx; + + // 32 x N + p.pu[LUMA_32x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x8_avx; + p.pu[LUMA_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx; + p.pu[LUMA_32x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x24_avx; + p.pu[LUMA_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx; + p.pu[LUMA_32x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x64_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x8_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x24_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x16_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x48_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_32x64_avx; + + // 48 X 64 + p.pu[LUMA_48x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_48x64_avx; + + // copy_ss primitives + // 16 X N + p.cu[BLOCK_16x16].copy_ss = x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx; + 
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx; + + // 32 X N + p.cu[BLOCK_32x32].copy_ss = x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = x265_blockcopy_ss_32x64_avx; + + // 64 X N + p.cu[BLOCK_64x64].copy_ss = x265_blockcopy_ss_64x64_avx; + + // copy_ps primitives + // 16 X N + p.cu[BLOCK_16x16].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_16x32_avx; + + // 32 X N + p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)x265_blockcopy_ss_32x64_avx; + + // 64 X N + p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)x265_blockcopy_ss_64x64_avx; + + // copy_sp primitives + // 16 X N + p.cu[BLOCK_16x16].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_16x32_avx; + + // 32 X N + p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)x265_blockcopy_ss_32x64_avx; + + // 64 X N + p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)x265_blockcopy_ss_64x64_avx; + p.frameInitLowres = x265_frame_init_lowres_core_avx; + + p.pu[LUMA_64x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x16_avx; + p.pu[LUMA_64x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x32_avx; + p.pu[LUMA_64x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x48_avx; + 
p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx; } if (cpuMask & X265_CPU_XOP) { - p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop; + //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop; this one is broken ALL_LUMA_PU(satd, pixel_satd, xop); ASSIGN_SA8D(xop); LUMA_VAR(xop); @@ -975,36 +1079,48 @@ } if (cpuMask & X265_CPU_AVX2) { + p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2; + p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2; + p.quant = x265_quant_avx2; p.nquant = x265_nquant_avx2; p.dequant_normal = x265_dequant_normal_avx2; + p.scale1D_128to64 = x265_scale1D_128to64_avx2; // p.weight_pp = x265_weight_pp_avx2; fails tests + p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2; + p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2; + + p.cu[BLOCK_16x16].blockfill_s = x265_blockfill_s_16x16_avx2; + p.cu[BLOCK_32x32].blockfill_s = x265_blockfill_s_32x32_avx2; + + ALL_LUMA_TU(count_nonzero, count_nonzero, avx2); ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2); ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2); -#if X86_64 + p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_avx2; + p.cu[BLOCK_16x16].copy_cnt = x265_copy_cnt_16_avx2; + p.cu[BLOCK_32x32].copy_cnt = x265_copy_cnt_32_avx2; + + p.cu[BLOCK_8x8].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_8_avx2; + p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_avx2; + p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_avx2; + + p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2; + p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2; + p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_avx2; + ALL_LUMA_TU_S(dct, dct, avx2); ALL_LUMA_TU_S(idct, idct, avx2); + ALL_LUMA_CU_S(transpose, transpose, avx2); - p.cu[BLOCK_8x8].transpose = x265_transpose8_avx2; - p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2; - p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2; - p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2; -#else - 
p.cu[BLOCK_4x4].dct = x265_dct4_avx2; -#endif - p.pu[LUMA_64x16].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x16_avx; - p.pu[LUMA_64x32].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x32_avx; - p.pu[LUMA_64x48].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x48_avx; - p.pu[LUMA_64x64].copy_pp = (copy_pp_t)x265_blockcopy_ss_64x64_avx; - - p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2; - p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2; + ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2); + ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2); + ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2); + ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2); } } - #else // if HIGH_BIT_DEPTH void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp @@ -1070,6 +1186,25 @@ ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, sse2); ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2); + p.cu[BLOCK_4x4].intra_pred[DC_IDX] = x265_intra_pred_dc4_sse2; + p.cu[BLOCK_8x8].intra_pred[DC_IDX] = x265_intra_pred_dc8_sse2; + p.cu[BLOCK_16x16].intra_pred[DC_IDX] = x265_intra_pred_dc16_sse2; + p.cu[BLOCK_32x32].intra_pred[DC_IDX] = x265_intra_pred_dc32_sse2; + + p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = x265_intra_pred_planar4_sse2; + p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = x265_intra_pred_planar8_sse2; + p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = x265_intra_pred_planar16_sse2; + p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = x265_intra_pred_planar32_sse2; + + p.cu[BLOCK_4x4].intra_pred[2] = x265_intra_pred_ang4_2_sse2; + p.cu[BLOCK_4x4].intra_pred[3] = x265_intra_pred_ang4_3_sse2; + p.cu[BLOCK_4x4].intra_pred[4] = x265_intra_pred_ang4_4_sse2; + p.cu[BLOCK_4x4].intra_pred[5] = x265_intra_pred_ang4_5_sse2; + p.cu[BLOCK_4x4].intra_pred[6] = x265_intra_pred_ang4_6_sse2; + p.cu[BLOCK_4x4].intra_pred[7] = x265_intra_pred_ang4_7_sse2; + p.cu[BLOCK_4x4].intra_pred[8] = x265_intra_pred_ang4_8_sse2; + p.cu[BLOCK_4x4].intra_pred[9] = x265_intra_pred_ang4_9_sse2; + p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2; 
p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2; @@ -1080,6 +1215,7 @@ p.ssim_end_4 = x265_pixel_ssim_end4_sse2; p.cu[BLOCK_4x4].dct = x265_dct4_sse2; + p.cu[BLOCK_8x8].dct = x265_dct8_sse2; p.cu[BLOCK_4x4].idct = x265_idct4_sse2; #if X86_64 p.cu[BLOCK_8x8].idct = x265_idct8_sse2; @@ -1113,14 +1249,36 @@ ASSIGN_SSE_PP(ssse3); p.cu[BLOCK_4x4].sse_pp = x265_pixel_ssd_4x4_ssse3; p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = x265_pixel_ssd_4x8_ssse3; + p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_ssse3; + p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_ssse3; + p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_ssse3; + p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_ssse3; + p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_ssse3; + p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_ssse3; + p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_ssse3; + p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_ssse3; + p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_ssse3; + p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_ssse3; + p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_ssse3; + p.pu[LUMA_16x32].filter_p2s = x265_pixelToShort_16x32_ssse3; + p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_ssse3; + p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_ssse3; + p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_ssse3; + p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_ssse3; + p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_ssse3; + p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_ssse3; + p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_ssse3; + p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3; + p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3; + p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3; - p.luma_p2s = x265_luma_p2s_ssse3; p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3; p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3; p.dst4x4 = x265_dst4_ssse3; 
p.cu[BLOCK_8x8].idct = x265_idct8_ssse3; - p.count_nonzero = x265_count_nonzero_ssse3; + + ALL_LUMA_TU(count_nonzero, count_nonzero, ssse3); p.frameInitLowres = x265_frame_init_lowres_core_ssse3; p.scale1D_128to64 = x265_scale1D_128to64_ssse3; @@ -1188,21 +1346,45 @@ INTRA_ANG_SSE4(sse4); p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4; + p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4; + #if X86_64 ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4); - p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_sse4; - p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_sse4; - p.cu[BLOCK_32x32].psy_cost_ss = x265_psyCost_ss_32x32_sse4; - p.cu[BLOCK_64x64].psy_cost_ss = x265_psyCost_ss_64x64_sse4; + ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4); #endif - p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4; } if (cpuMask & X265_CPU_AVX) { p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = x265_pixel_satd_16x24_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = x265_pixel_satd_32x48_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = x265_pixel_satd_24x64_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = x265_pixel_satd_8x64_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = x265_pixel_satd_8x12_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = x265_pixel_satd_12x32_avx; + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = x265_pixel_satd_4x32_avx; ALL_LUMA_PU(satd, pixel_satd, avx); + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = x265_pixel_satd_4x4_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = x265_pixel_satd_8x8_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = x265_pixel_satd_16x16_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = x265_pixel_satd_32x32_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = x265_pixel_satd_8x4_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = x265_pixel_satd_4x8_avx; + 
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = x265_pixel_satd_16x8_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = x265_pixel_satd_8x16_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = x265_pixel_satd_32x16_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = x265_pixel_satd_16x32_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = x265_pixel_satd_16x12_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = x265_pixel_satd_12x16_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = x265_pixel_satd_16x4_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = x265_pixel_satd_4x16_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = x265_pixel_satd_32x24_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = x265_pixel_satd_24x32_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = x265_pixel_satd_32x8_avx; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = x265_pixel_satd_8x32_avx; ASSIGN_SA8D(avx); ASSIGN_SSE_PP(avx); + p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = x265_pixel_ssd_8x8_avx; ASSIGN_SSE_SS(avx); LUMA_VAR(avx); @@ -1216,7 +1398,13 @@ p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx; p.ssim_end_4 = x265_pixel_ssim_end4_avx; + + p.cu[BLOCK_16x16].copy_ss = x265_blockcopy_ss_16x16_avx; + p.cu[BLOCK_32x32].copy_ss = x265_blockcopy_ss_32x32_avx; p.cu[BLOCK_64x64].copy_ss = x265_blockcopy_ss_64x64_avx; + p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx; + p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = x265_blockcopy_ss_32x32_avx; + p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx; p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = x265_blockcopy_pp_32x8_avx; p.pu[LUMA_32x8].copy_pp = x265_blockcopy_pp_32x8_avx; @@ -1237,11 +1425,18 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = x265_blockcopy_pp_32x64_avx; p.pu[LUMA_32x64].copy_pp = x265_blockcopy_pp_32x64_avx; + p.pu[LUMA_64x16].copy_pp = 
x265_blockcopy_pp_64x16_avx; + p.pu[LUMA_64x32].copy_pp = x265_blockcopy_pp_64x32_avx; + p.pu[LUMA_64x48].copy_pp = x265_blockcopy_pp_64x48_avx; + p.pu[LUMA_64x64].copy_pp = x265_blockcopy_pp_64x64_avx; + + p.pu[LUMA_48x64].copy_pp = x265_blockcopy_pp_48x64_avx; + p.frameInitLowres = x265_frame_init_lowres_core_avx; } if (cpuMask & X265_CPU_XOP) { - p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop; + //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = x265_pixel_satd_4x4_xop; this one is broken ALL_LUMA_PU(satd, pixel_satd, xop); ASSIGN_SA8D(xop); LUMA_VAR(xop); @@ -1249,19 +1444,133 @@ p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_xop; p.frameInitLowres = x265_frame_init_lowres_core_xop; } +#if X86_64 if (cpuMask & X265_CPU_AVX2) { + p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2; + p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2; + p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2; + p.cu[BLOCK_32x32].psy_cost_ss = x265_psyCost_ss_32x32_avx2; + p.cu[BLOCK_64x64].psy_cost_ss = x265_psyCost_ss_64x64_avx2; + + p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_avx2; + p.cu[BLOCK_8x8].psy_cost_pp = x265_psyCost_pp_8x8_avx2; + p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_avx2; + p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_avx2; + p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_avx2; + + p.pu[LUMA_8x4].addAvg = x265_addAvg_8x4_avx2; + p.pu[LUMA_8x8].addAvg = x265_addAvg_8x8_avx2; + p.pu[LUMA_8x16].addAvg = x265_addAvg_8x16_avx2; + p.pu[LUMA_8x32].addAvg = x265_addAvg_8x32_avx2; + + p.pu[LUMA_12x16].addAvg = x265_addAvg_12x16_avx2; + + p.pu[LUMA_16x4].addAvg = x265_addAvg_16x4_avx2; + p.pu[LUMA_16x8].addAvg = x265_addAvg_16x8_avx2; + p.pu[LUMA_16x12].addAvg = x265_addAvg_16x12_avx2; + p.pu[LUMA_16x16].addAvg = x265_addAvg_16x16_avx2; + p.pu[LUMA_16x32].addAvg = x265_addAvg_16x32_avx2; + p.pu[LUMA_16x64].addAvg = x265_addAvg_16x64_avx2; + + p.pu[LUMA_24x32].addAvg = x265_addAvg_24x32_avx2; + + 
p.pu[LUMA_32x8].addAvg = x265_addAvg_32x8_avx2; + p.pu[LUMA_32x16].addAvg = x265_addAvg_32x16_avx2; + p.pu[LUMA_32x24].addAvg = x265_addAvg_32x24_avx2; + p.pu[LUMA_32x32].addAvg = x265_addAvg_32x32_avx2; + p.pu[LUMA_32x64].addAvg = x265_addAvg_32x64_avx2; + + p.pu[LUMA_48x64].addAvg = x265_addAvg_48x64_avx2; + + p.pu[LUMA_64x16].addAvg = x265_addAvg_64x16_avx2; + p.pu[LUMA_64x32].addAvg = x265_addAvg_64x32_avx2; + p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2; + p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = x265_addAvg_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = x265_addAvg_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = x265_addAvg_8x6_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = x265_addAvg_8x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = x265_addAvg_8x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = x265_addAvg_8x32_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = x265_addAvg_12x16_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = x265_addAvg_16x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = x265_addAvg_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = x265_addAvg_16x12_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = x265_addAvg_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = x265_addAvg_16x32_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = x265_addAvg_32x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = x265_addAvg_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = x265_addAvg_32x24_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = x265_addAvg_32x32_avx2; + + p.cu[BLOCK_16x16].add_ps = x265_pixel_add_ps_16x16_avx2; + p.cu[BLOCK_32x32].add_ps = x265_pixel_add_ps_32x32_avx2; + p.cu[BLOCK_64x64].add_ps = x265_pixel_add_ps_64x64_avx2; + 
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = x265_pixel_add_ps_16x16_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = x265_pixel_add_ps_32x32_avx2; + + p.cu[BLOCK_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2; + p.cu[BLOCK_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2; + p.cu[BLOCK_64x64].sub_ps = x265_pixel_sub_ps_64x64_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = x265_pixel_sub_ps_16x16_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = x265_pixel_sub_ps_32x32_avx2; + + p.pu[LUMA_16x4].pixelavg_pp = x265_pixel_avg_16x4_avx2; + p.pu[LUMA_16x8].pixelavg_pp = x265_pixel_avg_16x8_avx2; + p.pu[LUMA_16x12].pixelavg_pp = x265_pixel_avg_16x12_avx2; + p.pu[LUMA_16x16].pixelavg_pp = x265_pixel_avg_16x16_avx2; + p.pu[LUMA_16x32].pixelavg_pp = x265_pixel_avg_16x32_avx2; + p.pu[LUMA_16x64].pixelavg_pp = x265_pixel_avg_16x64_avx2; + + p.pu[LUMA_32x64].pixelavg_pp = x265_pixel_avg_32x64_avx2; + p.pu[LUMA_32x32].pixelavg_pp = x265_pixel_avg_32x32_avx2; + p.pu[LUMA_32x24].pixelavg_pp = x265_pixel_avg_32x24_avx2; + p.pu[LUMA_32x16].pixelavg_pp = x265_pixel_avg_32x16_avx2; + p.pu[LUMA_32x8].pixelavg_pp = x265_pixel_avg_32x8_avx2; + + p.pu[LUMA_64x64].pixelavg_pp = x265_pixel_avg_64x64_avx2; + p.pu[LUMA_64x48].pixelavg_pp = x265_pixel_avg_64x48_avx2; + p.pu[LUMA_64x32].pixelavg_pp = x265_pixel_avg_64x32_avx2; + p.pu[LUMA_64x16].pixelavg_pp = x265_pixel_avg_64x16_avx2; + p.pu[LUMA_16x16].satd = x265_pixel_satd_16x16_avx2; p.pu[LUMA_16x8].satd = x265_pixel_satd_16x8_avx2; p.pu[LUMA_8x16].satd = x265_pixel_satd_8x16_avx2; p.pu[LUMA_8x8].satd = x265_pixel_satd_8x8_avx2; - p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_sse2; + p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2; + p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2; + p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_avx2; + p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_avx2; + p.pu[LUMA_32x64].sad = x265_pixel_sad_32x64_avx2; + p.pu[LUMA_48x64].sad = x265_pixel_sad_48x64_avx2; + 
p.pu[LUMA_64x16].sad = x265_pixel_sad_64x16_avx2; + p.pu[LUMA_64x32].sad = x265_pixel_sad_64x32_avx2; + p.pu[LUMA_64x48].sad = x265_pixel_sad_64x48_avx2; + p.pu[LUMA_64x64].sad = x265_pixel_sad_64x64_avx2; + + p.pu[LUMA_8x4].sad_x3 = x265_pixel_sad_x3_8x4_avx2; + p.pu[LUMA_8x8].sad_x3 = x265_pixel_sad_x3_8x8_avx2; + p.pu[LUMA_8x16].sad_x3 = x265_pixel_sad_x3_8x16_avx2; + + p.pu[LUMA_8x8].sad_x4 = x265_pixel_sad_x4_8x8_avx2; + p.pu[LUMA_16x8].sad_x4 = x265_pixel_sad_x4_16x8_avx2; p.pu[LUMA_16x12].sad_x4 = x265_pixel_sad_x4_16x12_avx2; - p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_sse2; + p.pu[LUMA_16x16].sad_x4 = x265_pixel_sad_x4_16x16_avx2; p.pu[LUMA_16x32].sad_x4 = x265_pixel_sad_x4_16x32_avx2; p.cu[BLOCK_16x16].sse_pp = x265_pixel_ssd_16x16_avx2; + p.cu[BLOCK_32x32].sse_pp = x265_pixel_ssd_32x32_avx2; + p.cu[BLOCK_64x64].sse_pp = x265_pixel_ssd_64x64_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = x265_pixel_ssd_16x16_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = x265_pixel_ssd_32x32_avx2; + + p.cu[BLOCK_16x16].ssd_s = x265_pixel_ssd_s_16_avx2; p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2; p.cu[BLOCK_8x8].copy_cnt = x265_copy_cnt_8_avx2; @@ -1274,108 +1583,314 @@ ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, avx2); ALL_LUMA_TU_S(cpy1Dto2D_shr, cpy1Dto2D_shr_, avx2); + p.cu[BLOCK_8x8].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_8_avx2; + p.cu[BLOCK_16x16].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_16_avx2; + p.cu[BLOCK_32x32].cpy2Dto1D_shl = x265_cpy2Dto1D_shl_32_avx2; + + p.cu[BLOCK_8x8].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_8_avx2; + p.cu[BLOCK_16x16].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_16_avx2; + p.cu[BLOCK_32x32].cpy2Dto1D_shr = x265_cpy2Dto1D_shr_32_avx2; + + ALL_LUMA_TU(count_nonzero, count_nonzero, avx2); p.denoiseDct = x265_denoise_dct_avx2; p.quant = x265_quant_avx2; p.nquant = x265_nquant_avx2; p.dequant_normal = x265_dequant_normal_avx2; - p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ss = x265_blockcopy_ss_16x16_avx; - 
p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx; - p.scale1D_128to64 = x265_scale1D_128to64_avx2; + p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2; + p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2; + p.scale1D_128to64 = x265_scale1D_128to64_avx2; p.weight_pp = x265_weight_pp_avx2; - p.cu[BLOCK_4x4].dct = x265_dct4_avx2; -#if X86_64 - p.cu[BLOCK_8x8].dct = x265_dct8_avx2; - p.cu[BLOCK_16x16].dct = x265_dct16_avx2; - p.cu[BLOCK_32x32].dct = x265_dct32_avx2; - - p.cu[BLOCK_4x4].idct = x265_idct4_avx2; - p.cu[BLOCK_8x8].idct = x265_idct8_avx2; - p.cu[BLOCK_16x16].idct = x265_idct16_avx2; - p.cu[BLOCK_32x32].idct = x265_idct32_avx2; - - p.cu[BLOCK_8x8].transpose = x265_transpose8_avx2; - p.cu[BLOCK_16x16].transpose = x265_transpose16_avx2; - p.cu[BLOCK_32x32].transpose = x265_transpose32_avx2; - p.cu[BLOCK_64x64].transpose = x265_transpose64_avx2; - - p.pu[LUMA_12x16].luma_vpp = x265_interp_8tap_vert_pp_12x16_avx2; - - p.pu[LUMA_16x4].luma_vpp = x265_interp_8tap_vert_pp_16x4_avx2; - p.pu[LUMA_16x8].luma_vpp = x265_interp_8tap_vert_pp_16x8_avx2; - p.pu[LUMA_16x12].luma_vpp = x265_interp_8tap_vert_pp_16x12_avx2; - p.pu[LUMA_16x16].luma_vpp = x265_interp_8tap_vert_pp_16x16_avx2; - p.pu[LUMA_16x32].luma_vpp = x265_interp_8tap_vert_pp_16x32_avx2; - p.pu[LUMA_16x64].luma_vpp = x265_interp_8tap_vert_pp_16x64_avx2; - - p.pu[LUMA_24x32].luma_vpp = x265_interp_8tap_vert_pp_24x32_avx2; - - p.pu[LUMA_32x8].luma_vpp = x265_interp_8tap_vert_pp_32x8_avx2; - p.pu[LUMA_32x16].luma_vpp = x265_interp_8tap_vert_pp_32x16_avx2; - p.pu[LUMA_32x24].luma_vpp = x265_interp_8tap_vert_pp_32x24_avx2; - p.pu[LUMA_32x32].luma_vpp = x265_interp_8tap_vert_pp_32x32_avx2; - p.pu[LUMA_32x64].luma_vpp = x265_interp_8tap_vert_pp_32x64_avx2; - - p.pu[LUMA_48x64].luma_vpp = x265_interp_8tap_vert_pp_48x64_avx2; - - p.pu[LUMA_64x16].luma_vpp = x265_interp_8tap_vert_pp_64x16_avx2; - p.pu[LUMA_64x32].luma_vpp = x265_interp_8tap_vert_pp_64x32_avx2; - 
p.pu[LUMA_64x48].luma_vpp = x265_interp_8tap_vert_pp_64x48_avx2; - p.pu[LUMA_64x64].luma_vpp = x265_interp_8tap_vert_pp_64x64_avx2; -#endif - p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2; + // intra_pred functions + p.cu[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2; + p.cu[BLOCK_8x8].intra_pred[33] = x265_intra_pred_ang8_33_avx2; + p.cu[BLOCK_8x8].intra_pred[4] = x265_intra_pred_ang8_4_avx2; + p.cu[BLOCK_8x8].intra_pred[32] = x265_intra_pred_ang8_32_avx2; + p.cu[BLOCK_8x8].intra_pred[5] = x265_intra_pred_ang8_5_avx2; + p.cu[BLOCK_8x8].intra_pred[31] = x265_intra_pred_ang8_31_avx2; + p.cu[BLOCK_8x8].intra_pred[30] = x265_intra_pred_ang8_30_avx2; + p.cu[BLOCK_8x8].intra_pred[6] = x265_intra_pred_ang8_6_avx2; + p.cu[BLOCK_8x8].intra_pred[7] = x265_intra_pred_ang8_7_avx2; + p.cu[BLOCK_8x8].intra_pred[29] = x265_intra_pred_ang8_29_avx2; + p.cu[BLOCK_8x8].intra_pred[8] = x265_intra_pred_ang8_8_avx2; + p.cu[BLOCK_8x8].intra_pred[28] = x265_intra_pred_ang8_28_avx2; + p.cu[BLOCK_8x8].intra_pred[9] = x265_intra_pred_ang8_9_avx2; + p.cu[BLOCK_8x8].intra_pred[27] = x265_intra_pred_ang8_27_avx2; + p.cu[BLOCK_8x8].intra_pred[25] = x265_intra_pred_ang8_25_avx2; + p.cu[BLOCK_8x8].intra_pred[12] = x265_intra_pred_ang8_12_avx2; + p.cu[BLOCK_8x8].intra_pred[24] = x265_intra_pred_ang8_24_avx2; + p.cu[BLOCK_8x8].intra_pred[11] = x265_intra_pred_ang8_11_avx2; + p.cu[BLOCK_16x16].intra_pred[25] = x265_intra_pred_ang16_25_avx2; + p.cu[BLOCK_16x16].intra_pred[28] = x265_intra_pred_ang16_28_avx2; + p.cu[BLOCK_16x16].intra_pred[27] = x265_intra_pred_ang16_27_avx2; + p.cu[BLOCK_16x16].intra_pred[29] = x265_intra_pred_ang16_29_avx2; + p.cu[BLOCK_16x16].intra_pred[30] = x265_intra_pred_ang16_30_avx2; + p.cu[BLOCK_16x16].intra_pred[31] = x265_intra_pred_ang16_31_avx2; + p.cu[BLOCK_16x16].intra_pred[32] = x265_intra_pred_ang16_32_avx2; + p.cu[BLOCK_16x16].intra_pred[33] = x265_intra_pred_ang16_33_avx2; + p.cu[BLOCK_16x16].intra_pred[24] = x265_intra_pred_ang16_24_avx2; + 
p.cu[BLOCK_16x16].intra_pred[23] = x265_intra_pred_ang16_23_avx2; + p.cu[BLOCK_16x16].intra_pred[22] = x265_intra_pred_ang16_22_avx2; + p.cu[BLOCK_32x32].intra_pred[34] = x265_intra_pred_ang32_34_avx2; + p.cu[BLOCK_32x32].intra_pred[2] = x265_intra_pred_ang32_2_avx2; + p.cu[BLOCK_32x32].intra_pred[26] = x265_intra_pred_ang32_26_avx2; + p.cu[BLOCK_32x32].intra_pred[27] = x265_intra_pred_ang32_27_avx2; + p.cu[BLOCK_32x32].intra_pred[28] = x265_intra_pred_ang32_28_avx2; + p.cu[BLOCK_32x32].intra_pred[29] = x265_intra_pred_ang32_29_avx2; + p.cu[BLOCK_32x32].intra_pred[30] = x265_intra_pred_ang32_30_avx2; + p.cu[BLOCK_32x32].intra_pred[31] = x265_intra_pred_ang32_31_avx2; + p.cu[BLOCK_32x32].intra_pred[32] = x265_intra_pred_ang32_32_avx2; + + // copy_sp primitives + p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2; + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = x265_blockcopy_sp_16x32_avx2; + + p.cu[BLOCK_32x32].copy_sp = x265_blockcopy_sp_32x32_avx2; + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = x265_blockcopy_sp_32x32_avx2; + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = x265_blockcopy_sp_32x64_avx2; + + p.cu[BLOCK_64x64].copy_sp = x265_blockcopy_sp_64x64_avx2; + + // copy_ps primitives + p.cu[BLOCK_16x16].copy_ps = x265_blockcopy_ps_16x16_avx2; + p.chroma[X265_CSP_I420].cu[CHROMA_420_16x16].copy_ps = x265_blockcopy_ps_16x16_avx2; + p.chroma[X265_CSP_I422].cu[CHROMA_422_16x32].copy_ps = x265_blockcopy_ps_16x32_avx2; + + p.cu[BLOCK_32x32].copy_ps = x265_blockcopy_ps_32x32_avx2; + p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = x265_blockcopy_ps_32x32_avx2; + p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = x265_blockcopy_ps_32x64_avx2; + + p.cu[BLOCK_64x64].copy_ps = x265_blockcopy_ps_64x64_avx2; + + ALL_LUMA_TU_S(dct, dct, avx2); + ALL_LUMA_TU_S(idct, idct, avx2); + ALL_LUMA_CU_S(transpose, transpose, avx2); + + ALL_LUMA_PU(luma_vpp, 
interp_8tap_vert_pp, avx2); + ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2); + ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2); + ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2); + // missing 4x8, 4x16, 24x32, 12x16 for the fill set of luma PU + p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_avx2; + p.pu[LUMA_4x8].luma_hpp = x265_interp_8tap_horiz_pp_4x8_avx2; + p.pu[LUMA_4x16].luma_hpp = x265_interp_8tap_horiz_pp_4x16_avx2; p.pu[LUMA_8x4].luma_hpp = x265_interp_8tap_horiz_pp_8x4_avx2; p.pu[LUMA_8x8].luma_hpp = x265_interp_8tap_horiz_pp_8x8_avx2; p.pu[LUMA_8x16].luma_hpp = x265_interp_8tap_horiz_pp_8x16_avx2; p.pu[LUMA_8x32].luma_hpp = x265_interp_8tap_horiz_pp_8x32_avx2; - p.pu[LUMA_16x4].luma_hpp = x265_interp_8tap_horiz_pp_16x4_avx2; p.pu[LUMA_16x8].luma_hpp = x265_interp_8tap_horiz_pp_16x8_avx2; p.pu[LUMA_16x12].luma_hpp = x265_interp_8tap_horiz_pp_16x12_avx2; p.pu[LUMA_16x16].luma_hpp = x265_interp_8tap_horiz_pp_16x16_avx2; p.pu[LUMA_16x32].luma_hpp = x265_interp_8tap_horiz_pp_16x32_avx2; p.pu[LUMA_16x64].luma_hpp = x265_interp_8tap_horiz_pp_16x64_avx2; - p.pu[LUMA_32x8].luma_hpp = x265_interp_8tap_horiz_pp_32x8_avx2; p.pu[LUMA_32x16].luma_hpp = x265_interp_8tap_horiz_pp_32x16_avx2; p.pu[LUMA_32x24].luma_hpp = x265_interp_8tap_horiz_pp_32x24_avx2; p.pu[LUMA_32x32].luma_hpp = x265_interp_8tap_horiz_pp_32x32_avx2; p.pu[LUMA_32x64].luma_hpp = x265_interp_8tap_horiz_pp_32x64_avx2; - p.pu[LUMA_64x64].luma_hpp = x265_interp_8tap_horiz_pp_64x64_avx2; p.pu[LUMA_64x48].luma_hpp = x265_interp_8tap_horiz_pp_64x48_avx2; p.pu[LUMA_64x32].luma_hpp = x265_interp_8tap_horiz_pp_64x32_avx2; p.pu[LUMA_64x16].luma_hpp = x265_interp_8tap_horiz_pp_64x16_avx2; - p.pu[LUMA_48x64].luma_hpp = x265_interp_8tap_horiz_pp_48x64_avx2; + p.pu[LUMA_24x32].luma_hpp = x265_interp_8tap_horiz_pp_24x32_avx2; + p.pu[LUMA_12x16].luma_hpp = x265_interp_8tap_horiz_pp_12x16_avx2; + + p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_avx2; + p.pu[LUMA_4x8].luma_hps = 
x265_interp_8tap_horiz_ps_4x8_avx2; + p.pu[LUMA_4x16].luma_hps = x265_interp_8tap_horiz_ps_4x16_avx2; + p.pu[LUMA_8x4].luma_hps = x265_interp_8tap_horiz_ps_8x4_avx2; + p.pu[LUMA_8x8].luma_hps = x265_interp_8tap_horiz_ps_8x8_avx2; + p.pu[LUMA_8x16].luma_hps = x265_interp_8tap_horiz_ps_8x16_avx2; + p.pu[LUMA_8x32].luma_hps = x265_interp_8tap_horiz_ps_8x32_avx2; + p.pu[LUMA_16x8].luma_hps = x265_interp_8tap_horiz_ps_16x8_avx2; + p.pu[LUMA_16x16].luma_hps = x265_interp_8tap_horiz_ps_16x16_avx2; + p.pu[LUMA_16x12].luma_hps = x265_interp_8tap_horiz_ps_16x12_avx2; + p.pu[LUMA_16x4].luma_hps = x265_interp_8tap_horiz_ps_16x4_avx2; + p.pu[LUMA_16x32].luma_hps = x265_interp_8tap_horiz_ps_16x32_avx2; + p.pu[LUMA_16x64].luma_hps = x265_interp_8tap_horiz_ps_16x64_avx2; + + p.pu[LUMA_32x32].luma_hps = x265_interp_8tap_horiz_ps_32x32_avx2; + p.pu[LUMA_32x16].luma_hps = x265_interp_8tap_horiz_ps_32x16_avx2; + p.pu[LUMA_32x24].luma_hps = x265_interp_8tap_horiz_ps_32x24_avx2; + p.pu[LUMA_32x8].luma_hps = x265_interp_8tap_horiz_ps_32x8_avx2; + p.pu[LUMA_32x64].luma_hps = x265_interp_8tap_horiz_ps_32x64_avx2; + p.pu[LUMA_48x64].luma_hps = x265_interp_8tap_horiz_ps_48x64_avx2; + p.pu[LUMA_64x64].luma_hps = x265_interp_8tap_horiz_ps_64x64_avx2; + p.pu[LUMA_64x48].luma_hps = x265_interp_8tap_horiz_ps_64x48_avx2; + p.pu[LUMA_64x32].luma_hps = x265_interp_8tap_horiz_ps_64x32_avx2; + p.pu[LUMA_64x16].luma_hps = x265_interp_8tap_horiz_ps_64x16_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = x265_interp_4tap_horiz_pp_8x8_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = x265_interp_4tap_horiz_pp_32x32_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2; - p.pu[LUMA_4x4].luma_vps = x265_interp_8tap_vert_ps_4x4_avx2; - p.pu[LUMA_4x4].luma_vpp = x265_interp_8tap_vert_pp_4x4_avx2; - p.pu[LUMA_8x4].luma_vpp = 
x265_interp_8tap_vert_pp_8x4_avx2; - p.pu[LUMA_8x8].luma_vpp = x265_interp_8tap_vert_pp_8x8_avx2; - p.pu[LUMA_8x16].luma_vpp = x265_interp_8tap_vert_pp_8x16_avx2; - p.pu[LUMA_8x32].luma_vpp = x265_interp_8tap_vert_pp_8x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_avx2; - // color space i420 - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2; - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = x265_interp_4tap_horiz_pp_16x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = x265_interp_4tap_horiz_pp_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = x265_interp_4tap_horiz_pp_16x12_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = x265_interp_4tap_horiz_pp_16x32_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hpp = x265_interp_4tap_horiz_pp_6x8_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = x265_interp_4tap_horiz_pp_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = x265_interp_4tap_horiz_pp_32x24_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = x265_interp_4tap_horiz_pp_32x8_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hpp = x265_interp_4tap_horiz_pp_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = x265_interp_4tap_horiz_pp_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hpp = x265_interp_4tap_horiz_pp_8x6_avx2; + 
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = x265_interp_4tap_horiz_pp_8x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = x265_interp_4tap_horiz_pp_8x32_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hpp = x265_interp_4tap_horiz_pp_12x16_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = x265_interp_4tap_horiz_ps_32x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hps = x265_interp_4tap_horiz_ps_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps = x265_interp_4tap_horiz_ps_4x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = x265_interp_4tap_horiz_ps_8x8_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hps = x265_interp_4tap_horiz_ps_4x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hps = x265_interp_4tap_horiz_ps_4x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hps = x265_interp_4tap_horiz_ps_4x16_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_hps = x265_interp_4tap_horiz_ps_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = x265_interp_4tap_horiz_ps_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_hps = x265_interp_4tap_horiz_ps_8x6_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hps = x265_interp_4tap_horiz_ps_8x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = x265_interp_4tap_horiz_ps_8x16_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = x265_interp_4tap_horiz_ps_16x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = x265_interp_4tap_horiz_ps_16x12_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = x265_interp_4tap_horiz_ps_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = x265_interp_4tap_horiz_ps_16x4_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = x265_interp_4tap_horiz_ps_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = 
x265_interp_4tap_horiz_ps_32x24_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hps = x265_interp_4tap_horiz_ps_2x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hps = x265_interp_4tap_horiz_ps_2x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps = x265_interp_4tap_horiz_ps_6x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = x265_interp_4tap_horiz_pp_24x32_avx2; - // color space i422 p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2; -#if X86_64 + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vpp = x265_interp_4tap_vert_pp_4x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = x265_interp_4tap_vert_pp_2x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = x265_interp_4tap_vert_pp_4x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = x265_interp_4tap_vert_pp_4x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vpp = x265_interp_4tap_vert_pp_6x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vpp = x265_interp_4tap_vert_pp_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vpp = x265_interp_4tap_vert_pp_8x6_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = x265_interp_4tap_vert_pp_8x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vpp = x265_interp_4tap_vert_pp_12x16_avx2; + 
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = x265_interp_4tap_vert_pp_16x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = x265_interp_4tap_vert_pp_16x12_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = x265_interp_4tap_vert_pp_16x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = x265_interp_4tap_vert_pp_24x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = x265_interp_4tap_vert_pp_32x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = x265_interp_4tap_vert_pp_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = x265_interp_4tap_vert_pp_32x24_avx2; p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = x265_interp_4tap_vert_pp_32x32_avx2; -#endif + + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vps = x265_interp_4tap_vert_ps_2x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = x265_interp_4tap_vert_ps_4x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vps = x265_interp_4tap_vert_ps_6x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vps = x265_interp_4tap_vert_ps_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vps = x265_interp_4tap_vert_ps_8x6_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2; + 
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vps = x265_interp_4tap_vert_ps_12x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = x265_interp_4tap_vert_ps_16x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = x265_interp_4tap_vert_ps_16x12_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = x265_interp_4tap_vert_ps_16x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vps = x265_interp_4tap_vert_ps_24x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = x265_interp_4tap_vert_ps_32x24_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = x265_interp_4tap_vert_ps_32x8_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vsp = x265_interp_4tap_vert_sp_2x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vsp = x265_interp_4tap_vert_sp_2x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vsp = x265_interp_4tap_vert_sp_4x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vsp = 
x265_interp_4tap_vert_sp_4x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vsp = x265_interp_4tap_vert_sp_4x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vsp = x265_interp_4tap_vert_sp_6x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vsp = x265_interp_4tap_vert_sp_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vsp = x265_interp_4tap_vert_sp_8x6_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vsp = x265_interp_4tap_vert_sp_8x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vsp = x265_interp_4tap_vert_sp_12x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vsp = x265_interp_4tap_vert_sp_16x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vsp = x265_interp_4tap_vert_sp_16x12_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vsp = x265_interp_4tap_vert_sp_16x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vsp = x265_interp_4tap_vert_sp_24x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = x265_interp_4tap_vert_sp_32x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = x265_interp_4tap_vert_sp_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = x265_interp_4tap_vert_sp_32x24_avx2; + + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vss = x265_interp_4tap_vert_ss_4x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vss = x265_interp_4tap_vert_ss_8x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = x265_interp_4tap_vert_ss_16x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = x265_interp_4tap_vert_ss_32x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vss = 
x265_interp_4tap_vert_ss_2x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vss = x265_interp_4tap_vert_ss_2x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vss = x265_interp_4tap_vert_ss_4x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vss = x265_interp_4tap_vert_ss_4x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vss = x265_interp_4tap_vert_ss_4x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vss = x265_interp_4tap_vert_ss_6x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vss = x265_interp_4tap_vert_ss_8x2_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vss = x265_interp_4tap_vert_ss_8x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vss = x265_interp_4tap_vert_ss_8x6_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = x265_interp_4tap_vert_ss_8x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = x265_interp_4tap_vert_ss_8x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vss = x265_interp_4tap_vert_ss_12x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = x265_interp_4tap_vert_ss_16x4_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = x265_interp_4tap_vert_ss_16x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = x265_interp_4tap_vert_ss_16x12_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = x265_interp_4tap_vert_ss_16x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2; + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = x265_interp_4tap_vert_ss_32x24_avx2; + + if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2)) + p.findPosLast = x265_findPosLast_x64; } +#endif } #endif // if HIGH_BIT_DEPTH diff -Nru 
x265-1.5/source/common/x86/blockcopy8.asm x265-1.6/source/common/x86/blockcopy8.asm --- x265-1.5/source/common/x86/blockcopy8.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/blockcopy8.asm 2015-04-02 16:46:36.000000000 +0000 @@ -47,15 +47,15 @@ cglobal blockcopy_pp_2x4, 4, 7, 0 mov r4w, [r2] mov r5w, [r2 + r3] - lea r2, [r2 + r3 * 2] - mov r6w, [r2] + mov r6w, [r2 + 2 * r3] + lea r3, [r3 + 2 * r3] mov r3w, [r2 + r3] - mov [r0], r4w - mov [r0 + r1], r5w - lea r0, [r0 + 2 * r1] - mov [r0], r6w - mov [r0 + r1], r3w + mov [r0], r4w + mov [r0 + r1], r5w + mov [r0 + 2 * r1], r6w + lea r1, [r1 + 2 * r1] + mov [r0 + r1], r3w RET ;----------------------------------------------------------------------------- @@ -63,37 +63,29 @@ ;----------------------------------------------------------------------------- INIT_XMM sse2 cglobal blockcopy_pp_2x8, 4, 7, 0 - mov r4w, [r2] - mov r5w, [r2 + r3] - mov r6w, [r2 + 2 * r3] + lea r5, [3 * r1] + lea r6, [3 * r3] - mov [r0], r4w - mov [r0 + r1], r5w - mov [r0 + 2 * r1], r6w - - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - - mov r4w, [r2 + r3] - mov r5w, [r2 + 2 * r3] - - mov [r0 + r1], r4w - mov [r0 + 2 * r1], r5w - - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - - mov r4w, [r2 + r3] - mov r5w, [r2 + 2 * r3] - - mov [r0 + r1], r4w - mov [r0 + 2 * r1], r5w - - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - - mov r4w, [r2 + r3] - mov [r0 + r1], r4w + mov r4w, [r2] + mov [r0], r4w + mov r4w, [r2 + r3] + mov [r0 + r1], r4w + mov r4w, [r2 + 2 * r3] + mov [r0 + 2 * r1], r4w + mov r4w, [r2 + r6] + mov [r0 + r5], r4w + + lea r2, [r2 + 4 * r3] + mov r4w, [r2] + lea r0, [r0 + 4 * r1] + mov [r0], r4w + + mov r4w, [r2 + r3] + mov [r0 + r1], r4w + mov r4w, [r2 + 2 * r3] + mov [r0 + 2 * r1], r4w + mov r4w, [r2 + r6] + mov [r0 + r5], r4w RET ;----------------------------------------------------------------------------- @@ -101,16 +93,30 @@ ;----------------------------------------------------------------------------- 
INIT_XMM sse2 cglobal blockcopy_pp_2x16, 4, 7, 0 - mov r6d, 16/2 -.loop: - mov r4w, [r2] - mov r5w, [r2 + r3] - dec r6d - lea r2, [r2 + r3 * 2] - mov [r0], r4w - mov [r0 + r1], r5w - lea r0, [r0 + r1 * 2] - jnz .loop + lea r5, [3 * r1] + lea r6, [3 * r3] + + mov r4w, [r2] + mov [r0], r4w + mov r4w, [r2 + r3] + mov [r0 + r1], r4w + mov r4w, [r2 + 2 * r3] + mov [r0 + 2 * r1], r4w + mov r4w, [r2 + r6] + mov [r0 + r5], r4w + +%rep 3 + lea r2, [r2 + 4 * r3] + mov r4w, [r2] + lea r0, [r0 + 4 * r1] + mov [r0], r4w + mov r4w, [r2 + r3] + mov [r0 + r1], r4w + mov r4w, [r2 + 2 * r3] + mov [r0 + 2 * r1], r4w + mov r4w, [r2 + r6] + mov [r0 + r5], r4w +%endrep RET @@ -145,115 +151,130 @@ RET ;----------------------------------------------------------------------------- +; void blockcopy_pp_4x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_4x8, 4, 6, 4 + + lea r4, [3 * r1] + lea r5, [3 * r3] + + movd m0, [r2] + movd m1, [r2 + r3] + movd m2, [r2 + 2 * r3] + movd m3, [r2 + r5] + + movd [r0], m0 + movd [r0 + r1], m1 + movd [r0 + 2 * r1], m2 + movd [r0 + r4], m3 + + lea r2, [r2 + 4 * r3] + movd m0, [r2] + movd m1, [r2 + r3] + movd m2, [r2 + 2 * r3] + movd m3, [r2 + r5] + + lea r0, [r0 + 4 * r1] + movd [r0], m0 + movd [r0 + r1], m1 + movd [r0 + 2 * r1], m2 + movd [r0 + r4], m3 + RET + +;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W4_H8 2 INIT_XMM sse2 -cglobal blockcopy_pp_%1x%2, 4, 5, 4 +cglobal blockcopy_pp_%1x%2, 4, 7, 4 mov r4d, %2/8 + lea r5, [3 * r1] + lea r6, [3 * r3] + .loop: movd m0, [r2] movd m1, [r2 + r3] - lea r2, [r2 + 2 * r3] - movd m2, [r2] - movd m3, [r2 + r3] + movd m2, [r2 + 2 * r3] + movd m3, [r2 + r6] 
- movd [r0], m0 - movd [r0 + r1], m1 - lea r0, [r0 + 2 * r1] - movd [r0], m2 - movd [r0 + r1], m3 + movd [r0], m0 + movd [r0 + r1], m1 + movd [r0 + 2 * r1], m2 + movd [r0 + r5], m3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] + lea r2, [r2 + 4 * r3] movd m0, [r2] movd m1, [r2 + r3] - lea r2, [r2 + 2 * r3] - movd m2, [r2] - movd m3, [r2 + r3] + movd m2, [r2 + 2 * r3] + movd m3, [r2 + r6] - movd [r0], m0 - movd [r0 + r1], m1 - lea r0, [r0 + 2 * r1] - movd [r0], m2 - movd [r0 + r1], m3 + lea r0, [r0 + 4 * r1] + movd [r0], m0 + movd [r0 + r1], m1 + movd [r0 + 2 * r1], m2 + movd [r0 + r5], m3 - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] dec r4d jnz .loop RET %endmacro -BLOCKCOPY_PP_W4_H8 4, 8 BLOCKCOPY_PP_W4_H8 4, 16 - BLOCKCOPY_PP_W4_H8 4, 32 ;----------------------------------------------------------------------------- ; void blockcopy_pp_6x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_pp_6x8, 4, 7, 8 +cglobal blockcopy_pp_6x8, 4, 7, 3 - movd m0, [r2] - movd m1, [r2 + r3] - movd m2, [r2 + 2 * r3] - lea r5, [r2 + 2 * r3] - movd m3, [r5 + r3] - - movd m4, [r5 + 2 * r3] - lea r5, [r5 + 2 * r3] - movd m5, [r5 + r3] - movd m6, [r5 + 2 * r3] - lea r5, [r5 + 2 * r3] - movd m7, [r5 + r3] - - movd [r0], m0 - movd [r0 + r1], m1 - movd [r0 + 2 * r1], m2 - lea r6, [r0 + 2 * r1] - movd [r6 + r1], m3 - - movd [r6 + 2 * r1], m4 - lea r6, [r6 + 2 * r1] - movd [r6 + r1], m5 - movd [r6 + 2 * r1], m6 - lea r6, [r6 + 2 * r1] - movd [r6 + r1], m7 - - mov r4w, [r2 + 4] - mov r5w, [r2 + r3 + 4] - mov r6w, [r2 + 2 * r3 + 4] - - mov [r0 + 4], r4w - mov [r0 + r1 + 4], r5w - mov [r0 + 2 * r1 + 4], r6w - - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - - mov r4w, [r2 + r3 + 4] - mov r5w, [r2 + 2 * r3 + 4] - - mov [r0 + r1 + 4], r4w - mov [r0 + 2 * r1 + 4], r5w - - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * 
r3] - - mov r4w, [r2 + r3 + 4] - mov r5w, [r2 + 2 * r3 + 4] - - mov [r0 + r1 + 4], r4w - mov [r0 + 2 * r1 + 4], r5w - - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - - mov r4w, [r2 + r3 + 4] - mov [r0 + r1 + 4], r4w + movd m0, [r2] + mov r4w, [r2 + 4] + movd m1, [r2 + r3] + mov r5w, [r2 + r3 + 4] + movd m2, [r2 + 2 * r3] + mov r6w, [r2 + 2 * r3 + 4] + + movd [r0], m0 + mov [r0 + 4], r4w + movd [r0 + r1], m1 + mov [r0 + r1 + 4], r5w + movd [r0 + 2 * r1], m2 + mov [r0 + 2 * r1 + 4], r6w + + lea r2, [r2 + 2 * r3] + movd m0, [r2 + r3] + mov r4w, [r2 + r3 + 4] + movd m1, [r2 + 2 * r3] + mov r5w, [r2 + 2 * r3 + 4] + lea r2, [r2 + 2 * r3] + movd m2, [r2 + r3] + mov r6w, [r2 + r3 + 4] + + lea r0, [r0 + 2 * r1] + movd [r0 + r1], m0 + mov [r0 + r1 + 4], r4w + movd [r0 + 2 * r1], m1 + mov [r0 + 2 * r1 + 4], r5w + lea r0, [r0 + 2 * r1] + movd [r0 + r1], m2 + mov [r0 + r1 + 4], r6w + + lea r2, [r2 + 2 * r3] + movd m0, [r2] + mov r4w, [r2 + 4] + movd m1, [r2 + r3] + mov r5w, [r2 + r3 + 4] + + lea r0, [r0 + 2 * r1] + movd [r0], m0 + mov [r0 + 4], r4w + movd [r0 + r1], m1 + mov [r0 + r1 + 4], r5w RET ;----------------------------------------------------------------------------- @@ -312,89 +333,193 @@ ; void blockcopy_pp_8x6(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_pp_8x6, 4, 7, 6 +cglobal blockcopy_pp_8x6, 4, 4, 6 movh m0, [r2] movh m1, [r2 + r3] - movh m2, [r2 + 2 * r3] - lea r5, [r2 + 2 * r3] - movh m3, [r5 + r3] - movh m4, [r5 + 2 * r3] - lea r5, [r5 + 2 * r3] - movh m5, [r5 + r3] + lea r2, [r2 + 2 * r3] + movh m2, [r2] + movh m3, [r2 + r3] + lea r2, [r2 + 2 * r3] + movh m4, [r2] + movh m5, [r2 + r3] - movh [r0], m0 - movh [r0 + r1], m1 - movh [r0 + 2 * r1], m2 - lea r6, [r0 + 2 * r1] - movh [r6 + r1], m3 - movh [r6 + 2 * r1], m4 - lea r6, [r6 + 2 * r1] - movh [r6 + r1], m5 + movh [r0], m0 + movh [r0 + r1], m1 + lea r0, [r0 + 2 * r1] + 
movh [r0], m2 + movh [r0 + r1], m3 + lea r0, [r0 + 2 * r1] + movh [r0], m4 + movh [r0 + r1], m5 RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_8x12(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockcopy_pp_8x12, 4, 5, 2 - mov r4d, 12/2 -.loop: - movh m0, [r2] - movh m1, [r2 + r3] - movh [r0], m0 - movh [r0 + r1], m1 - dec r4d - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - jnz .loop +cglobal blockcopy_pp_8x12, 4, 6, 4 + + lea r4, [3 * r3] + lea r5, [3 * r1] + + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + + %rep 2 + lea r2, [r2 + 4 * r3] + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + lea r0, [r0 + 4 * r1] + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + %endrep RET ;----------------------------------------------------------------------------- -; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +; void blockcopy_pp_8x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- -%macro BLOCKCOPY_PP_W8_H8 2 INIT_XMM sse2 -cglobal blockcopy_pp_%1x%2, 4, 5, 6 - mov r4d, %2/8 +cglobal blockcopy_pp_8x8, 4, 6, 4 -.loop: - movh m0, [r2] - movh m1, [r2 + r3] - lea r2, [r2 + 2 * r3] - movh m2, [r2] - movh m3, [r2 + r3] - lea r2, [r2 + 2 * r3] - movh m4, [r2] - movh m5, [r2 + r3] - - movh [r0], m0 - movh [r0 + r1], m1 - lea r0, [r0 + 2 * r1] - movh [r0], m2 - movh [r0 + r1], m3 - lea r0, [r0 + 2 * r1] - movh [r0], m4 - movh [r0 + r1], m5 - - lea r2, [r2 + 2 * r3] - movh m4, [r2] - movh m5, [r2 + r3] - lea r0, [r0 + 2 * r1] - movh [r0], m4 - movh [r0 + r1], m5 - - dec 
r4d - lea r0, [r0 + 2 * r1] - lea r2, [r2 + 2 * r3] - jnz .loop -RET -%endmacro + lea r4, [3 * r3] + lea r5, [3 * r1] + + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + + lea r2, [r2 + 4 * r3] + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + lea r0, [r0 + 4 * r1] + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x16(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x16, 4, 6, 4 + + lea r4, [3 * r3] + lea r5, [3 * r1] + + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + + %rep 3 + lea r2, [r2 + 4 * r3] + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + lea r0, [r0 + 4 * r1] + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + %endrep + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x32, 4, 6, 4 + + lea r4, [3 * r3] + lea r5, [3 * r1] + + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + + %rep 7 + lea r2, [r2 + 4 * r3] + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + lea r0, [r0 + 4 * r1] + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 
+ movh [r0 + r5], m3 + %endrep + RET + +;----------------------------------------------------------------------------- +; void blockcopy_pp_8x64(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;----------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal blockcopy_pp_8x64, 4, 6, 4 + + lea r4, [3 * r3] + lea r5, [3 * r1] + + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] + + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 -BLOCKCOPY_PP_W8_H8 8, 8 -BLOCKCOPY_PP_W8_H8 8, 16 -BLOCKCOPY_PP_W8_H8 8, 32 + %rep 15 + lea r2, [r2 + 4 * r3] + movh m0, [r2] + movh m1, [r2 + r3] + movh m2, [r2 + 2 * r3] + movh m3, [r2 + r4] -BLOCKCOPY_PP_W8_H8 8, 64 + lea r0, [r0 + 4 * r1] + movh [r0], m0 + movh [r0 + r1], m1 + movh [r0 + 2 * r1], m2 + movh [r0 + r5], m3 + %endrep + RET ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) @@ -838,6 +963,46 @@ ;----------------------------------------------------------------------------- ; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W48_H4_avx 2 +INIT_YMM avx +cglobal blockcopy_pp_%1x%2, 4, 5, 4 + mov r4d, %2/4 + +.loop: + movu m0, [r2] + movu xm1, [r2 + 32] + movu m2, [r2 + r3] + movu xm3, [r2 + r3 + 32] + lea r2, [r2 + 2 * r3] + + movu [r0], m0 + movu [r0 + 32], xm1 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], xm3 + lea r0, [r0 + 2 * r1] + + movu m0, [r2] + movu xm1, [r2 + 32] + movu m2, [r2 + r3] + movu xm3, [r2 + r3 + 32] + + movu [r0], m0 + movu [r0 + 32], xm1 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], xm3 + + dec r4d + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W48_H4_avx 48, 64 + 
+;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;----------------------------------------------------------------------------- %macro BLOCKCOPY_PP_W64_H4 2 INIT_XMM sse2 cglobal blockcopy_pp_%1x%2, 4, 5, 6 @@ -897,6 +1062,49 @@ BLOCKCOPY_PP_W64_H4 64, 64 ;----------------------------------------------------------------------------- +; void blockcopy_pp_%1x%2(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PP_W64_H4_avx 2 +INIT_YMM avx +cglobal blockcopy_pp_%1x%2, 4, 7, 6 + lea r4, [3 * r1] + lea r5, [3 * r3] + mov r6d, %2/4 + +.loop: + movu m0, [r2] + movu m1, [r2 + 32] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 32] + movu m4, [r2 + 2 * r3] + movu m5, [r2 + 2 * r3 + 32] + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + r1], m2 + movu [r0 + r1 + 32], m3 + movu [r0 + 2 * r1], m4 + movu [r0 + 2 * r1 + 32], m5 + + movu m0, [r2 + r5] + movu m1, [r2 + r5 + 32] + + movu [r0 + r4], m0 + movu [r0 + r4 + 32], m1 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + dec r6d + jnz .loop + RET +%endmacro + +BLOCKCOPY_PP_W64_H4_avx 64, 16 +BLOCKCOPY_PP_W64_H4_avx 64, 32 +BLOCKCOPY_PP_W64_H4_avx 64, 48 +BLOCKCOPY_PP_W64_H4_avx 64, 64 + +;----------------------------------------------------------------------------- ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- INIT_XMM sse4 @@ -1550,9 +1758,69 @@ BLOCKCOPY_SP_W16_H4 16, 16 BLOCKCOPY_SP_W16_H4 16, 32 BLOCKCOPY_SP_W16_H4 16, 64 - BLOCKCOPY_SP_W16_H4 16, 24 +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) 
+;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W16_H8_avx2 2 +INIT_YMM avx2 +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride + mov r4d, %2/8 + add r3, r3 + lea r5, [3 * r3] + lea r6, [3 * r1] + +.loop: + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + + movu [r0], xm0 + movu [r0 + r1], xm1 + movu [r0 + 2 * r1], xm2 + movu [r0 + r6], xm3 + + lea r2, [r2 + 4 * r3] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + 2 * r3] + movu m3, [r2 + r5] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + + lea r0, [r0 + 4 * r1] + movu [r0], xm0 + movu [r0 + r1], xm1 + movu [r0 + 2 * r1], xm2 + movu [r0 + r6], xm3 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_SP_W16_H8_avx2 16, 16 +BLOCKCOPY_SP_W16_H8_avx2 16, 32 ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- @@ -1645,6 +1913,57 @@ ;----------------------------------------------------------------------------- ; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W32_H4_avx2 2 +INIT_YMM avx2 +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride + mov r4d, %2/4 + add r3, r3 + lea r5, [3 * r3] + lea r6, [3 * r1] + +.loop: + movu m0, [r2] + movu m1, [r2 + 32] + movu m2, [r2 + r3] + movu m3, [r2 + r3 + 32] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 
11011000b + vpermq m2, m2, 11011000b + + movu [r0], m0 + movu [r0 + r1], m2 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + 2 * r3 + 32] + movu m2, [r2 + r5] + movu m3, [r2 + r5 + 32] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + + movu [r0 + 2 * r1], m0 + movu [r0 + r6], m2 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_SP_W32_H4_avx2 32, 32 +BLOCKCOPY_SP_W32_H4_avx2 32, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;----------------------------------------------------------------------------- %macro BLOCKCOPY_SP_W48_H2 2 INIT_XMM sse2 cglobal blockcopy_sp_%1x%2, 4, 5, 6, dst, dstStride, src, srcStride @@ -1720,10 +2039,88 @@ RET %endmacro -BLOCKCOPY_SP_W64_H1 64, 16 -BLOCKCOPY_SP_W64_H1 64, 32 -BLOCKCOPY_SP_W64_H1 64, 48 -BLOCKCOPY_SP_W64_H1 64, 64 +BLOCKCOPY_SP_W64_H1 64, 16 +BLOCKCOPY_SP_W64_H1 64, 32 +BLOCKCOPY_SP_W64_H1 64, 48 +BLOCKCOPY_SP_W64_H1 64, 64 + +;----------------------------------------------------------------------------- +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_SP_W64_H4_avx2 2 +INIT_YMM avx2 +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride + mov r4d, %2/4 + add r3, r3 + lea r5, [3 * r3] + lea r6, [3 * r1] + +.loop: + movu m0, [r2] + movu m1, [r2 + 32] + movu m2, [r2 + 64] + movu m3, [r2 + 96] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + + movu [r0], m0 + movu [r0 + 32], m2 + + movu m0, [r2 + r3] + movu m1, [r2 + r3 + 32] + movu m2, [r2 + r3 + 64] + movu m3, [r2 + r3 + 96] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + + movu [r0 + r1], 
m0 + movu [r0 + r1 + 32], m2 + + movu m0, [r2 + 2 * r3] + movu m1, [r2 + 2 * r3 + 32] + movu m2, [r2 + 2 * r3 + 64] + movu m3, [r2 + 2 * r3 + 96] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + + movu [r0 + 2 * r1], m0 + movu [r0 + 2 * r1 + 32], m2 + + movu m0, [r2 + r5] + movu m1, [r2 + r5 + 32] + movu m2, [r2 + r5 + 64] + movu m3, [r2 + r5 + 96] + + packuswb m0, m1 + packuswb m2, m3 + + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + + movu [r0 + r6], m0 + movu [r0 + r6 + 32], m2 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_SP_W64_H4_avx2 64, 64 ;----------------------------------------------------------------------------- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) @@ -1748,9 +2145,10 @@ ; void blockfill_s_8x8(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- INIT_XMM sse2 -cglobal blockfill_s_8x8, 3, 3, 1, dst, dstStride, val +cglobal blockfill_s_8x8, 3, 4, 1, dst, dstStride, val add r1, r1 +lea r3, [3 * r1] movd m0, r2d pshuflw m0, m0, 0 @@ -1760,71 +2158,68 @@ movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 -lea r0, [r0 + 2 * r1] -movu [r0 + r1], m0 -movu [r0 + 2 * r1], m0 +movu [r0 + r3], m0 -lea r0, [r0 + 2 * r1] +lea r0, [r0 + 4 * r1] +movu [r0], m0 movu [r0 + r1], m0 movu [r0 + 2 * r1], m0 - -lea r0, [r0 + 2 * r1] -movu [r0 + r1], m0 - +movu [r0 + r3], m0 RET ;----------------------------------------------------------------------------- -; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val) +; void blockfill_s_16x16(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- -%macro BLOCKFILL_S_W16_H8 2 INIT_XMM sse2 -cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val - -mov r3d, %2/8 +cglobal blockfill_s_16x16, 3, 4, 1, dst, dstStride, val add r1, r1 +lea r3, [3 * r1] 
movd m0, r2d -pshuflw m0, m0, 0 -pshufd m0, m0, 0 - -.loop: - movu [r0], m0 - movu [r0 + 16], m0 - - movu [r0 + r1], m0 - movu [r0 + r1 + 16], m0 - - movu [r0 + 2 * r1], m0 - movu [r0 + 2 * r1 + 16], m0 - - lea r4, [r0 + 2 * r1] - movu [r4 + r1], m0 - movu [r4 + r1 + 16], m0 - - movu [r0 + 4 * r1], m0 - movu [r0 + 4 * r1 + 16], m0 - - lea r4, [r0 + 4 * r1] - movu [r4 + r1], m0 - movu [r4 + r1 + 16], m0 - - movu [r4 + 2 * r1], m0 - movu [r4 + 2 * r1 + 16], m0 - - lea r4, [r4 + 2 * r1] - movu [r4 + r1], m0 - movu [r4 + r1 + 16], m0 - - lea r0, [r0 + 8 * r1] - - dec r3d - jnz .loop +pshuflw m0, m0, 0 +pshufd m0, m0, 0 +movu [r0], m0 +movu [r0 + 16], m0 +movu [r0 + r1], m0 +movu [r0 + r1 + 16], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 16], m0 + +movu [r0 + r3], m0 +movu [r0 + r3 + 16], m0 +movu [r0 + 4 * r1], m0 +movu [r0 + 4 * r1 + 16], m0 + +lea r0, [r0 + 4 * r1] +movu [r0 + r1], m0 +movu [r0 + r1 + 16], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 16], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 16], m0 +movu [r0 + 4 * r1], m0 +movu [r0 + 4 * r1 + 16], m0 + +lea r0, [r0 + 4 * r1] +movu [r0 + r1], m0 +movu [r0 + r1 + 16], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 16], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 16], m0 +movu [r0 + 4 * r1], m0 +movu [r0 + 4 * r1 + 16], m0 + +lea r0, [r0 + 4 * r1] +movu [r0 + r1], m0 +movu [r0 + r1 + 16], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + 2 * r1 + 16], m0 +movu [r0 + r3], m0 +movu [r0 + r3 + 16], m0 RET -%endmacro - -BLOCKFILL_S_W16_H8 16, 16 INIT_YMM avx2 cglobal blockfill_s_16x16, 3, 4, 1 @@ -1857,13 +2252,14 @@ ;----------------------------------------------------------------------------- ; void blockfill_s_%1x%2(int16_t* dst, intptr_t dstride, int16_t val) ;----------------------------------------------------------------------------- -%macro BLOCKFILL_S_W32_H4 2 +%macro BLOCKFILL_S_W32_H8 2 INIT_XMM sse2 cglobal blockfill_s_%1x%2, 3, 5, 1, dst, dstStride, val -mov r3d, %2/4 +mov r3d, %2/8 add r1, r1 +lea r4, [3 * r1] 
movd m0, r2d pshuflw m0, m0, 0 @@ -1885,12 +2281,31 @@ movu [r0 + 2 * r1 + 32], m0 movu [r0 + 2 * r1 + 48], m0 - lea r4, [r0 + 2 * r1] + movu [r0 + r4], m0 + movu [r0 + r4 + 16], m0 + movu [r0 + r4 + 32], m0 + movu [r0 + r4 + 48], m0 + + movu [r0 + 4 * r1], m0 + movu [r0 + 4 * r1 + 16], m0 + movu [r0 + 4 * r1 + 32], m0 + movu [r0 + 4 * r1 + 48], m0 + + lea r0, [r0 + 4 * r1] + movu [r0 + r1], m0 + movu [r0 + r1 + 16], m0 + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m0 + + movu [r0 + 2 * r1], m0 + movu [r0 + 2 * r1 + 16], m0 + movu [r0 + 2 * r1 + 32], m0 + movu [r0 + 2 * r1 + 48], m0 - movu [r4 + r1], m0 - movu [r4 + r1 + 16], m0 - movu [r4 + r1 + 32], m0 - movu [r4 + r1 + 48], m0 + movu [r0 + r4], m0 + movu [r0 + r4 + 16], m0 + movu [r0 + r4 + 32], m0 + movu [r0 + r4 + 48], m0 lea r0, [r0 + 4 * r1] @@ -1900,7 +2315,7 @@ RET %endmacro -BLOCKFILL_S_W32_H4 32, 32 +BLOCKFILL_S_W32_H8 32, 32 INIT_YMM avx2 cglobal blockfill_s_32x32, 3, 4, 1 @@ -2488,9 +2903,44 @@ BLOCKCOPY_PS_W16_H4 16, 16 BLOCKCOPY_PS_W16_H4 16, 32 BLOCKCOPY_PS_W16_H4 16, 64 - BLOCKCOPY_PS_W16_H4 16, 24 +;----------------------------------------------------------------------------- +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); +;----------------------------------------------------------------------------- +%macro BLOCKCOPY_PS_W16_H4_avx2 2 +INIT_YMM avx2 +cglobal blockcopy_ps_%1x%2, 4, 7, 3 + + add r1, r1 + mov r4d, %2/4 + lea r5, [3 * r3] + lea r6, [3 * r1] + pxor m0, m0 + +.loop: + movu xm1, [r2] + pmovzxbw m2, xm1 + movu [r0], m2 + movu xm1, [r2 + r3] + pmovzxbw m2, xm1 + movu [r0 + r1], m2 + movu xm1, [r2 + 2 * r3] + pmovzxbw m2, xm1 + movu [r0 + 2 * r1], m2 + movu xm1, [r2 + r5] + pmovzxbw m2, xm1 + movu [r0 + r6], m2 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + dec r4d + jnz .loop + RET +%endmacro + +BLOCKCOPY_PS_W16_H4_avx2 16, 16 +BLOCKCOPY_PS_W16_H4_avx2 16, 32 
;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
 ;-----------------------------------------------------------------------------
@@ -2588,6 +3038,57 @@
 BLOCKCOPY_PS_W32_H2 32, 64
 BLOCKCOPY_PS_W32_H2 32, 48
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PS_W32_H4_avx2 2                ; %1 = width (32), %2 = height; 4 rows per loop pass
+INIT_YMM avx2
+cglobal blockcopy_ps_%1x%2, 4, 7, 4              ; 4 args, 7 GPRs, 4 vector regs (m0-m3 are used)
+    add         r1, r1                          ; dstStride in bytes (dst holds int16_t)
+    mov         r4d, %2/4                       ; loop count
+    lea         r5, [3 * r3]                    ; r5 = 3 * srcStride
+    lea         r6, [3 * r1]                    ; r6 = 3 * dstStride (bytes)
+    pxor        m0, m0                          ; m0 = 0, interleaved as the high byte of each word
+
+.loop:
+    movu        m1, [r2]                        ; row 0: 32 source bytes
+    punpcklbw   m2, m1, m0                      ; zero-extend low 8 bytes of each 128-bit lane
+    punpckhbw   m1, m1, m0                      ; zero-extend high 8 bytes of each 128-bit lane
+    vperm2i128  m3, m2, m1, 00100000b           ; low lanes  -> words 0-15
+    vperm2i128  m2, m2, m1, 00110001b           ; high lanes -> words 16-31
+    movu        [r0], m3
+    movu        [r0 + 32], m2
+    movu        m1, [r2 + r3]                   ; row 1
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + r1], m3
+    movu        [r0 + r1 + 32], m2
+    movu        m1, [r2 + 2 * r3]               ; row 2
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + 2 * r1], m3
+    movu        [r0 + 2 * r1 + 32], m2
+    movu        m1, [r2 + r5]                   ; row 3
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + r6], m3
+    movu        [r0 + r6 + 32], m2
+
+    lea         r0, [r0 + 4 * r1]               ; advance dst by 4 rows
+    lea         r2, [r2 + 4 * r3]               ; advance src by 4 rows
+    dec         r4d
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_PS_W32_H4_avx2 32, 32
+BLOCKCOPY_PS_W32_H4_avx2 32, 64
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
@@ -2721,6 +3222,80 @@
 BLOCKCOPY_PS_W64_H2 64, 32
 BLOCKCOPY_PS_W64_H2 64, 48
 BLOCKCOPY_PS_W64_H2 64, 64
+;-----------------------------------------------------------------------------
+; void blockcopy_ps_64x64(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal blockcopy_ps_64x64, 4, 7, 4
+    add         r1, r1                          ; dstStride in bytes (dst holds int16_t)
+    mov         r4d, 64/4                       ; loop count: 4 rows per iteration
+    lea         r5, [3 * r3]                    ; r5 = 3 * srcStride
+    lea         r6, [3 * r1]                    ; r6 = 3 * dstStride (bytes)
+    pxor        m0, m0                          ; m0 = 0, interleaved as the high byte of each word
+
+.loop:
+    movu        m1, [r2]                        ; row 0, source bytes 0-31
+    punpcklbw   m2, m1, m0                      ; zero-extend low 8 bytes of each 128-bit lane
+    punpckhbw   m1, m1, m0                      ; zero-extend high 8 bytes of each 128-bit lane
+    vperm2i128  m3, m2, m1, 00100000b           ; low lanes  -> words 0-15
+    vperm2i128  m2, m2, m1, 00110001b           ; high lanes -> words 16-31
+    movu        [r0], m3
+    movu        [r0 + 32], m2
+    movu        m1, [r2 + 32]                   ; row 0, source bytes 32-63
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + 64], m3
+    movu        [r0 + 96], m2
+    movu        m1, [r2 + r3]                   ; row 1, source bytes 0-31
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + r1], m3
+    movu        [r0 + r1 + 32], m2
+    movu        m1, [r2 + r3 + 32]              ; row 1, source bytes 32-63
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + r1 + 64], m3
+    movu        [r0 + r1 + 96], m2
+    movu        m1, [r2 + 2 * r3]               ; row 2, source bytes 0-31
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + 2 * r1], m3
+    movu        [r0 + 2 * r1 + 32], m2
+    movu        m1, [r2 + 2 * r3 + 32]          ; row 2, source bytes 32-63
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + 2 * r1 + 64], m3
+    movu        [r0 + 2 * r1 + 96], m2
+    movu        m1, [r2 + r5]                   ; row 3, source bytes 0-31
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + r6], m3
+    movu        [r0 + r6 + 32], m2
+    movu        m1, [r2 + r5 + 32]              ; row 3, source bytes 32-63
+    punpcklbw   m2, m1, m0
+    punpckhbw   m1, m1, m0
+    vperm2i128  m3, m2, m1, 00100000b
+    vperm2i128  m2, m2, m1, 00110001b
+    movu        [r0 + r6 + 64], m3
+    movu        [r0 + r6 + 96], m2
+
+    lea         r0, [r0 + 4 * r1]               ; advance dst by 4 rows
+    lea         r2, [r2 + 4 * r3]               ; advance src by 4 rows
+    dec         r4d
+    jnz         .loop
+    RET
;-----------------------------------------------------------------------------
 ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3352,6 +3927,45 @@
 BLOCKCOPY_SS_W24_H4 24, 32
 BLOCKCOPY_SS_W24_H4 24, 64
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W24_H4_avx 2                 ; %1 = width (24), %2 = height; 4 rows per loop pass
+INIT_YMM avx
+cglobal blockcopy_ss_%1x%2, 4, 7, 2
+
+    mov         r4d, %2/4                       ; loop count
+    add         r1, r1                          ; dstStride in bytes (int16_t elements)
+    add         r3, r3                          ; srcStride in bytes (int16_t elements)
+    lea         r5, [3 * r3]                    ; r5 = 3 * srcStride (bytes)
+    lea         r6, [3 * r1]                    ; r6 = 3 * dstStride (bytes)
+
+.loop:
+    movu        m0, [r2]                        ; row 0: 32 bytes in ymm ...
+    movu        xm1, [r2 + 32]                  ; ... plus 16 bytes in xmm = 24 int16_t
+    movu        [r0], m0
+    movu        [r0 + 32], xm1
+    movu        m0, [r2 + r3]                   ; row 1
+    movu        xm1, [r2 + r3 + 32]
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + 32], xm1
+    movu        m0, [r2 + 2 * r3]               ; row 2
+    movu        xm1, [r2 + 2 * r3 + 32]
+    movu        [r0 + 2 * r1], m0
+    movu        [r0 + 2 * r1 + 32], xm1
+    movu        m0, [r2 + r5]                   ; row 3
+    movu        xm1, [r2 + r5 + 32]
+    movu        [r0 + r6], m0
+    movu        [r0 + r6 + 32], xm1
+    dec         r4d                             ; sets ZF for jnz; the lea below leave flags untouched
+    lea         r2, [r2 + 4 * r3]               ; advance src by 4 rows
+    lea         r0, [r0 + 4 * r1]               ; advance dst by 4 rows
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SS_W24_H4_avx 24, 32
+BLOCKCOPY_SS_W24_H4_avx 24, 64
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3420,6 +4034,57 @@
 BLOCKCOPY_SS_W32_H4 32, 64
 BLOCKCOPY_SS_W32_H4 32, 48
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W32_H4_avx 2                 ; %1 = width (32), %2 = height; 4 rows per loop pass
+INIT_YMM avx
+cglobal blockcopy_ss_%1x%2, 4, 7, 4
+
+    mov         r4d, %2/4                       ; loop count
+    add         r1, r1                          ; dstStride in bytes
+    add         r3, r3                          ; srcStride in bytes
+    lea         r5, [3 * r1]                    ; r5 = 3 * dstStride (note: dst here, src in r6)
+    lea         r6, [3 * r3]                    ; r6 = 3 * srcStride
+
+.loop:
+    movu        m0, [r2]                        ; row 0: 2 x 32 bytes = 32 int16_t
+    movu        m1, [r2 + 32]
+
+    movu        [r0], m0
+    movu        [r0 + 32], m1
+
+    movu        m0, [r2 + r3]                   ; row 1
+    movu        m1, [r2 + r3 + 32]
+
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + 32], m1
+
+    movu        m0, [r2 + 2 * r3]               ; row 2
+    movu        m1, [r2 + 2 * r3 + 32]
+
+    movu        [r0 + 2 * r1], m0
+    movu        [r0 + 2 * r1 + 32], m1
+
+    movu        m0, [r2 + r6]                   ; row 3
+    movu        m1, [r2 + r6 + 32]
+
+    movu        [r0 + r5], m0
+    movu        [r0 + r5 + 32], m1
+
+    dec         r4d                             ; sets ZF for jnz; the lea below leave flags untouched
+    lea         r2, [r2 + 4 * r3]               ; advance src by 4 rows
+    lea         r0, [r0 + 4 * r1]               ; advance dst by 4 rows
+    jnz         .loop
+    RET
+%endmacro
+
+BLOCKCOPY_SS_W32_H4_avx 32, 8
+BLOCKCOPY_SS_W32_H4_avx 32, 16
+BLOCKCOPY_SS_W32_H4_avx 32, 24
+BLOCKCOPY_SS_W32_H4_avx 32, 32
+BLOCKCOPY_SS_W32_H4_avx 32, 48
+BLOCKCOPY_SS_W32_H4_avx 32, 64
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
@@ -3498,6 +4163,56 @@
 %endmacro
 BLOCKCOPY_SS_W48_H2 48, 64
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_48x64(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx
+cglobal blockcopy_ss_48x64, 4, 7, 6              ; NOTE(review): 6 vector regs declared, only m0-m2 used below
+
+    mov         r4d, 64/4                       ; loop count: 4 rows per iteration
+    add         r1, r1                          ; dstStride in bytes
+    add         r3, r3                          ; srcStride in bytes
+    lea         r5, [3 * r3]                    ; r5 = 3 * srcStride
+    lea         r6, [3 * r1]                    ; r6 = 3 * dstStride
+
+.loop:
+    movu        m0, [r2]                        ; row 0: 3 x 32 bytes = 48 int16_t
+    movu        m1, [r2 + 32]
+    movu        m2, [r2 + 64]
+
+    movu        [r0], m0
+    movu        [r0 + 32], m1
+    movu        [r0 + 64], m2
+
+    movu        m0, [r2 + r3]                   ; row 1
+    movu        m1, [r2 + r3 + 32]
+    movu        m2, [r2 + r3 + 64]
+
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + 32], m1
+    movu        [r0 + r1 + 64], m2
+
+    movu        m0, [r2 + 2 * r3]               ; row 2
+    movu        m1, [r2 + 2 * r3 + 32]
+    movu        m2, [r2 + 2 * r3 + 64]
+
+    movu        [r0 + 2 * r1], m0
+    movu        [r0 + 2 * r1 + 32], m1
+    movu        [r0 + 2 * r1 + 64], m2
+
+    movu        m0, [r2 + r5]                   ; row 3
+    movu        m1, [r2 + r5 + 32]
+    movu        m2, [r2 + r5 + 64]
+
+    movu        [r0 + r6], m0
+    movu        [r0 + r6 + 32], m1
+    movu        [r0 + r6 + 64], m2
+
+    dec         r4d                             ; sets ZF for jnz; the lea below leave flags untouched
+    lea         r2, [r2 + 4 * r3]               ; advance src by 4 rows
+    lea         r0, [r0 + 4 * r1]               ; advance dst by 4 rows
+    jnz         .loop
+    RET
 ;-----------------------------------------------------------------------------
 ; void
blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) @@ -3676,7 +4391,7 @@ cglobal cpy2Dto1D_shr_4, 3, 4, 4 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 @@ -3709,7 +4424,7 @@ cglobal cpy2Dto1D_shr_8, 3, 5, 4 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 8/4 @@ -3751,6 +4466,41 @@ jnz .loop RET +INIT_YMM avx2 +cglobal cpy2Dto1D_shr_8, 3, 4, 4 + add r2d, r2d + movd xm0, r3m + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + lea r3, [r2 * 3] + + ; Row 0-3 + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r2], 1 + movu xm3, [r1 + 2 * r2] + vinserti128 m3, m3, [r1 + r3], 1 + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0], m2 + movu [r0 + 32], m3 + + ; Row 4-7 + lea r1, [r1 + 4 * r2] + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r2], 1 + movu xm3, [r1 + 2 * r2] + vinserti128 m3, m3, [r1 + r3], 1 + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 64], m2 + movu [r0 + 96], m3 + RET + ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); @@ -3759,7 +4509,7 @@ cglobal cpy2Dto1D_shr_16, 3, 4, 4 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 16/2 @@ -3799,6 +4549,64 @@ jnz .loop RET +INIT_YMM avx2 +cglobal cpy2Dto1D_shr_16, 4, 5, 4 + add r2d, r2d + movd xm0, r3d + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + lea r3, [r2 * 3] + mov r4d, 16/8 + +.loop: + ; Row 0-1 + movu m2, [r1] + movu m3, [r1 + r2] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 0 * mmsize], m2 + movu [r0 + 1 * mmsize], m3 + + ; Row 2-3 + movu m2, [r1 + 2 * r2] + movu m3, [r1 + r3] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 2 * mmsize], m2 + movu [r0 + 3 * mmsize], m3 + + ; Row 4-5 + lea r1, [r1 + 4 * r2] + movu m2, 
[r1] + movu m3, [r1 + r2] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 4 * mmsize], m2 + movu [r0 + 5 * mmsize], m3 + + ; Row 6-7 + movu m2, [r1 + 2 * r2] + movu m3, [r1 + r3] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 6 * mmsize], m2 + movu [r0 + 7 * mmsize], m3 + + add r0, 8 * mmsize + lea r1, [r1 + 4 * r2] + dec r4d + jnz .loop + RET + ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); @@ -3807,7 +4615,7 @@ cglobal cpy2Dto1D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 32/1 @@ -3845,6 +4653,62 @@ jnz .loop RET +INIT_YMM avx2 +cglobal cpy2Dto1D_shr_32, 4, 5, 4 + add r2d, r2d + movd xm0, r3d + pcmpeqw m1, m1 + psllw m1, xm0 + psraw m1, 1 + lea r3, [r2 * 3] + mov r4d, 32/4 + +.loop: + ; Row 0 + movu m2, [r1] + movu m3, [r1 + 32] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 0 * mmsize], m2 + movu [r0 + 1 * mmsize], m3 + + ; Row 1 + movu m2, [r1 + r2] + movu m3, [r1 + r2 + 32] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 2 * mmsize], m2 + movu [r0 + 3 * mmsize], m3 + + ; Row 2 + movu m2, [r1 + 2 * r2] + movu m3, [r1 + 2 * r2 + 32] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 4 * mmsize], m2 + movu [r0 + 5 * mmsize], m3 + + ; Row 3 + movu m2, [r1 + r3] + movu m3, [r1 + r3 + 32] + psubw m2, m1 + psraw m2, xm0 + psubw m3, m1 + psraw m3, xm0 + movu [r0 + 6 * mmsize], m2 + movu [r0 + 7 * mmsize], m3 + + add r0, 8 * mmsize + lea r1, [r1 + 4 * r2] + dec r4d + jnz .loop + RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) @@ -4535,6 +5399,42 @@ jnz .loop RET 
+;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl_8(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal cpy2Dto1D_shl_8, 4, 5, 2 + add r2d, r2d + movd xm0, r3d + lea r4, [3 * r2] + + ; Row 0, 1 + movu xm1, [r1] + vinserti128 m1, m1, [r1 + r2], 1 + psllw m1, xm0 + movu [r0], m1 + + ; Row 2, 3 + movu xm1, [r1 + 2 * r2] + vinserti128 m1, m1, [r1 + r4], 1 + psllw m1, xm0 + movu [r0 + 32], m1 + + lea r1, [r1 + 4 * r2] + + ; Row 4, 5 + movu xm1, [r1] + vinserti128 m1, m1, [r1 + r2], 1 + psllw m1, xm0 + movu [r0 + 64], m1 + + ; Row 6, 7 + movu xm1, [r1 + 2 * r2] + vinserti128 m1, m1, [r1 + r4], 1 + psllw m1, xm0 + movu [r0 + 96], m1 + RET + ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); @@ -4575,6 +5475,38 @@ jnz .loop RET +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl_16(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal cpy2Dto1D_shl_16, 3, 5, 3 + add r2d, r2d + movd xm0, r3m + mov r3d, 16/4 + lea r4, [r2 * 3] + +.loop: + ; Row 0-1 + movu m1, [r1] + movu m2, [r1 + r2] + psllw m1, xm0 + psllw m2, xm0 + movu [r0 + 0 * mmsize], m1 + movu [r0 + 1 * mmsize], m2 + + ; Row 2-3 + movu m1, [r1 + 2 * r2] + movu m2, [r1 + r4] + psllw m1, xm0 + psllw m2, xm0 + movu [r0 + 2 * mmsize], m1 + movu [r0 + 3 * mmsize], m2 + + add r0, 4 * mmsize + lea r1, [r1 + r2 * 4] + dec r3d + jnz .loop + RET ;-------------------------------------------------------------------------------------- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); @@ -4613,6 +5545,52 @@ 
jnz .loop RET +;-------------------------------------------------------------------------------------- +; void cpy2Dto1D_shl_32(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal cpy2Dto1D_shl_32, 3, 5, 5 + add r2d, r2d + movd xm0, r3m + mov r3d, 32/4 + lea r4, [3 * r2] + +.loop: + ; Row 0-1 + movu m1, [r1] + movu m2, [r1 + 32] + movu m3, [r1 + r2] + movu m4, [r1 + r2 + 32] + + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + movu [r0], m1 + movu [r0 + mmsize], m2 + movu [r0 + 2 * mmsize], m3 + movu [r0 + 3 * mmsize], m4 + + ; Row 2-3 + movu m1, [r1 + 2 * r2] + movu m2, [r1 + 2 * r2 + 32] + movu m3, [r1 + r4] + movu m4, [r1 + r4 + 32] + + psllw m1, xm0 + psllw m2, xm0 + psllw m3, xm0 + psllw m4, xm0 + movu [r0 + 4 * mmsize], m1 + movu [r0 + 5 * mmsize], m2 + movu [r0 + 6 * mmsize], m3 + movu [r0 + 7 * mmsize], m4 + + add r0, 8 * mmsize + lea r1, [r1 + r2 * 4] + dec r3d + jnz .loop + RET ;-------------------------------------------------------------------------------------- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) @@ -4621,7 +5599,7 @@ cglobal cpy1Dto2D_shr_4, 3, 3, 4 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 @@ -4644,7 +5622,7 @@ cglobal cpy1Dto2D_shr_4, 3, 3, 3 add r2d, r2d movd xm0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 @@ -4668,7 +5646,7 @@ cglobal cpy1Dto2D_shr_8, 3, 4, 6 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 lea r3, [r2 * 3] @@ -4716,7 +5694,7 @@ cglobal cpy1Dto2D_shr_8, 3, 4, 4 add r2d, r2d movd xm0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 lea r3, [r2 * 3] @@ -4755,7 +5733,7 @@ cglobal cpy1Dto2D_shr_16, 3, 5, 6 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 16/4 @@ -4809,7 +5787,7 @@ cglobal 
cpy1Dto2D_shr_16, 3, 5, 4 add r2d, r2d movd xm0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 mov r3d, 16/4 @@ -4850,7 +5828,7 @@ cglobal cpy1Dto2D_shr_32, 3, 4, 6 add r2d, r2d movd m0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, m0 psraw m1, 1 mov r3d, 32/2 @@ -4903,7 +5881,7 @@ cglobal cpy1Dto2D_shr_32, 3, 4, 6 add r2d, r2d movd xm0, r3m - pcmpeqw m1, m1 + pcmpeqw m1, m1 psllw m1, xm0 psraw m1, 1 mov r3d, 32/2 diff -Nru x265-1.5/source/common/x86/blockcopy8.h x265-1.6/source/common/x86/blockcopy8.h --- x265-1.5/source/common/x86/blockcopy8.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/blockcopy8.h 2015-04-02 16:46:36.000000000 +0000 @@ -48,6 +48,12 @@ void x265_cpy1Dto2D_shr_8_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); void x265_cpy1Dto2D_shr_16_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); void x265_cpy1Dto2D_shr_32_sse2(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift); +void x265_cpy2Dto1D_shl_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shl_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_8_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_16_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +void x265_cpy2Dto1D_shr_32_avx2(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); uint32_t x265_copy_cnt_4_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); uint32_t x265_copy_cnt_8_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); uint32_t x265_copy_cnt_16_sse4(int16_t* dst, const int16_t* src, intptr_t srcStride); @@ -198,6 +204,15 @@ void x265_blockcopy_ss_64x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); void 
x265_blockcopy_ss_64x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); void x265_blockcopy_ss_64x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_32x8_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_32x16_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_32x24_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_32x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_32x48_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_32x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_48x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_24x32_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +void x265_blockcopy_ss_24x64_avx(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); void x265_blockcopy_pp_32x8_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); void x265_blockcopy_pp_32x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); @@ -205,9 +220,36 @@ void x265_blockcopy_pp_32x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); void x265_blockcopy_pp_32x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); void x265_blockcopy_pp_32x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_64x16_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_64x32_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_64x48_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void 
x265_blockcopy_pp_64x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_pp_48x64_avx(pixel* a, intptr_t stridea, const pixel* b, intptr_t strideb); void x265_blockfill_s_16x16_avx2(int16_t* dst, intptr_t dstride, int16_t val); void x265_blockfill_s_32x32_avx2(int16_t* dst, intptr_t dstride, int16_t val); +// copy_sp primitives +// 16 x N +void x265_blockcopy_sp_16x16_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); +void x265_blockcopy_sp_16x32_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); + +// 32 x N +void x265_blockcopy_sp_32x32_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); +void x265_blockcopy_sp_32x64_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); + +// 64 x N +void x265_blockcopy_sp_64x64_avx2(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb); +// copy_ps primitives +// 16 x N +void x265_blockcopy_ps_16x16_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_ps_16x32_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb); + +// 32 x N +void x265_blockcopy_ps_32x32_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb); +void x265_blockcopy_ps_32x64_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb); + +// 64 x N +void x265_blockcopy_ps_64x64_avx2(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb); #undef BLOCKCOPY_COMMON #undef BLOCKCOPY_SS_PP diff -Nru x265-1.5/source/common/x86/const-a.asm x265-1.6/source/common/x86/const-a.asm --- x265-1.5/source/common/x86/const-a.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/const-a.asm 2015-04-02 16:46:36.000000000 +0000 @@ -6,7 +6,7 @@ ;* Authors: Loren Merritt ;* Fiona Glaser ;* Min Chen -;* +;* Praveen Kumar Tiwari ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published 
by ;* the Free Software Foundation; either version 2 of the License, or @@ -37,11 +37,14 @@ const pw_32, times 16 dw 32 const pw_128, times 16 dw 128 const pw_256, times 16 dw 256 +const pw_257, times 16 dw 257 const pw_512, times 16 dw 512 const pw_1023, times 8 dw 1023 +ALIGN 32 const pw_1024, times 16 dw 1024 const pw_4096, times 16 dw 4096 const pw_00ff, times 16 dw 0x00ff +ALIGN 32 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 @@ -50,16 +53,16 @@ const pb_unpackwq2, db 4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7 const pw_swap, times 2 db 6,7,4,5,2,3,0,1 -const pb_2, times 16 db 2 -const pb_4, times 16 db 4 -const pb_16, times 16 db 16 -const pb_64, times 16 db 64 +const pb_2, times 32 db 2 +const pb_4, times 32 db 4 +const pb_16, times 32 db 16 +const pb_64, times 32 db 64 const pb_01, times 8 db 0,1 const pb_0, times 16 db 0 const pb_a1, times 16 db 0xa1 const pb_3, times 16 db 3 -const pb_8, times 16 db 8 -const pb_32, times 16 db 32 +const pb_8, times 32 db 8 +const pb_32, times 32 db 32 const pb_128, times 16 db 128 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6 @@ -72,7 +75,7 @@ const pw_256, times 8 dw 256 const pw_32_0, times 4 dw 32, times 4 dw 0 -const pw_2000, times 8 dw 0x2000 +const pw_2000, times 16 dw 0x2000 const pw_8000, times 8 dw 0x8000 const pw_3fff, times 8 dw 0x3fff const pw_ppppmmmm, dw 1,1,1,1,-1,-1,-1,-1 @@ -80,7 +83,7 @@ const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0 const pd_1, times 8 dd 1 -const pd_2, times 4 dd 2 +const pd_2, times 8 dd 2 const pd_4, times 4 dd 4 const pd_8, times 4 dd 8 const pd_16, times 4 dd 16 diff -Nru x265-1.5/source/common/x86/dct8.asm x265-1.6/source/common/x86/dct8.asm --- x265-1.5/source/common/x86/dct8.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/dct8.asm 2015-04-02 16:46:36.000000000 +0000 @@ -748,6 +748,368 @@ movhps [r1 + r2], m1 RET 
+;------------------------------------------------------- +; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) +;------------------------------------------------------- +INIT_XMM sse2 +cglobal dct8, 3,6,8,0-16*mmsize + ;------------------------ + ; Stack Mapping(dword) + ;------------------------ + ; Row0[0-3] Row1[0-3] + ; ... + ; Row6[0-3] Row7[0-3] + ; Row0[0-3] Row7[0-3] + ; ... + ; Row6[4-7] Row7[4-7] + ;------------------------ +%if BIT_DEPTH == 10 + %define DCT_SHIFT1 4 + %define DCT_ADD1 [pd_8] +%elif BIT_DEPTH == 8 + %define DCT_SHIFT1 2 + %define DCT_ADD1 [pd_2] +%else + %error Unsupported BIT_DEPTH! +%endif +%define DCT_ADD2 [pd_256] +%define DCT_SHIFT2 9 + + add r2, r2 + lea r3, [r2 * 3] + mov r5, rsp +%assign x 0 +%rep 2 + movu m0, [r0] + movu m1, [r0 + r2] + movu m2, [r0 + r2 * 2] + movu m3, [r0 + r3] + + punpcklwd m4, m0, m1 + punpckhwd m0, m1 + punpcklwd m5, m2, m3 + punpckhwd m2, m3 + punpckldq m1, m4, m5 ; m1 = [1 0] + punpckhdq m4, m5 ; m4 = [3 2] + punpckldq m3, m0, m2 + punpckhdq m0, m2 + pshufd m2, m3, 0x4E ; m2 = [4 5] + pshufd m0, m0, 0x4E ; m0 = [6 7] + + paddw m3, m1, m0 + psubw m1, m0 ; m1 = [d1 d0] + paddw m0, m4, m2 + psubw m4, m2 ; m4 = [d3 d2] + punpcklqdq m2, m3, m0 ; m2 = [s2 s0] + punpckhqdq m3, m0 + pshufd m3, m3, 0x4E ; m3 = [s1 s3] + + punpcklwd m0, m1, m4 ; m0 = [d2/d0] + punpckhwd m1, m4 ; m1 = [d3/d1] + punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0] + punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0] + + ; odd + lea r4, [tab_dct8_1] + pmaddwd m1, m4, [r4 + 0*16] + pmaddwd m5, m0, [r4 + 0*16] + pshufd m1, m1, 0xD8 + pshufd m5, m5, 0xD8 + mova m7, m1 + punpckhqdq m7, m5 + punpcklqdq m1, m5 + paddd m1, m7 + paddd m1, DCT_ADD1 + psrad m1, DCT_SHIFT1 + %if x == 1 + pshufd m1, m1, 0x1B + %endif + mova [r5 + 1*2*mmsize], m1 ; Row 1 + + pmaddwd m1, m4, [r4 + 1*16] + pmaddwd m5, m0, [r4 + 1*16] + pshufd m1, m1, 0xD8 + pshufd m5, m5, 0xD8 + mova m7, m1 + punpckhqdq m7, m5 + punpcklqdq m1, m5 + paddd m1, m7 + paddd m1, DCT_ADD1 + psrad m1, 
DCT_SHIFT1 + %if x == 1 + pshufd m1, m1, 0x1B + %endif + mova [r5 + 3*2*mmsize], m1 ; Row 3 + + pmaddwd m1, m4, [r4 + 2*16] + pmaddwd m5, m0, [r4 + 2*16] + pshufd m1, m1, 0xD8 + pshufd m5, m5, 0xD8 + mova m7, m1 + punpckhqdq m7, m5 + punpcklqdq m1, m5 + paddd m1, m7 + paddd m1, DCT_ADD1 + psrad m1, DCT_SHIFT1 + %if x == 1 + pshufd m1, m1, 0x1B + %endif + mova [r5 + 5*2*mmsize], m1 ; Row 5 + + pmaddwd m4, [r4 + 3*16] + pmaddwd m0, [r4 + 3*16] + pshufd m4, m4, 0xD8 + pshufd m0, m0, 0xD8 + mova m7, m4 + punpckhqdq m7, m0 + punpcklqdq m4, m0 + paddd m4, m7 + paddd m4, DCT_ADD1 + psrad m4, DCT_SHIFT1 + %if x == 1 + pshufd m4, m4, 0x1B + %endif + mova [r5 + 7*2*mmsize], m4; Row 7 + + ; even + lea r4, [tab_dct4] + paddw m0, m2, m3 ; m0 = [EE1 EE0] + pshufd m0, m0, 0xD8 + pshuflw m0, m0, 0xD8 + pshufhw m0, m0, 0xD8 + psubw m2, m3 ; m2 = [EO1 EO0] + pmullw m2, [pw_ppppmmmm] + pshufd m2, m2, 0xD8 + pshuflw m2, m2, 0xD8 + pshufhw m2, m2, 0xD8 + pmaddwd m3, m0, [r4 + 0*16] + paddd m3, DCT_ADD1 + psrad m3, DCT_SHIFT1 + %if x == 1 + pshufd m3, m3, 0x1B + %endif + mova [r5 + 0*2*mmsize], m3 ; Row 0 + pmaddwd m0, [r4 + 2*16] + paddd m0, DCT_ADD1 + psrad m0, DCT_SHIFT1 + %if x == 1 + pshufd m0, m0, 0x1B + %endif + mova [r5 + 4*2*mmsize], m0 ; Row 4 + pmaddwd m3, m2, [r4 + 1*16] + paddd m3, DCT_ADD1 + psrad m3, DCT_SHIFT1 + %if x == 1 + pshufd m3, m3, 0x1B + %endif + mova [r5 + 2*2*mmsize], m3 ; Row 2 + pmaddwd m2, [r4 + 3*16] + paddd m2, DCT_ADD1 + psrad m2, DCT_SHIFT1 + %if x == 1 + pshufd m2, m2, 0x1B + %endif + mova [r5 + 6*2*mmsize], m2 ; Row 6 + + %if x != 1 + lea r0, [r0 + r2 * 4] + add r5, mmsize + %endif +%assign x x+1 +%endrep + + mov r0, rsp ; r0 = pointer to Low Part + lea r4, [tab_dct8_2] + +%assign x 0 +%rep 4 + mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] + mova m1, [r0 + 1*2*mmsize] + paddd m2, m0, [r0 + (0*2+1)*mmsize] + pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0] + paddd m3, m1, [r0 + (1*2+1)*mmsize] + pshufd m3, m3, 0x9C ; m3 = ^^ + psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = 
[d3 d2 d1 d0] + psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^ + + ; even + pshufd m4, m2, 0xD8 + pshufd m3, m3, 0xD8 + mova m7, m4 + punpckhqdq m7, m3 + punpcklqdq m4, m3 + mova m2, m4 + paddd m4, m7 ; m4 = [EE1 EE0 EE1 EE0] + psubd m2, m7 ; m2 = [EO1 EO0 EO1 EO0] + + pslld m4, 6 ; m4 = [64*EE1 64*EE0] + mova m5, m2 + pmuludq m5, [r4 + 0*16] + pshufd m7, m2, 0xF5 + movu m6, [r4 + 0*16 + 4] + pmuludq m7, m6 + pshufd m5, m5, 0x88 + pshufd m7, m7, 0x88 + punpckldq m5, m7 ; m5 = [36*EO1 83*EO0] + pshufd m7, m2, 0xF5 + pmuludq m2, [r4 + 1*16] + movu m6, [r4 + 1*16 + 4] + pmuludq m7, m6 + pshufd m2, m2, 0x88 + pshufd m7, m7, 0x88 + punpckldq m2, m7 ; m2 = [83*EO1 36*EO0] + + pshufd m3, m4, 0xD8 + pshufd m5, m5, 0xD8 + mova m7, m3 + punpckhqdq m7, m5 + punpcklqdq m3, m5 + paddd m3, m7 ; m3 = [Row2 Row0] + paddd m3, DCT_ADD2 + psrad m3, DCT_SHIFT2 + pshufd m4, m4, 0xD8 + pshufd m2, m2, 0xD8 + mova m7, m4 + punpckhqdq m7, m2 + punpcklqdq m4, m2 + psubd m4, m7 ; m4 = [Row6 Row4] + paddd m4, DCT_ADD2 + psrad m4, DCT_SHIFT2 + + packssdw m3, m3 + movd [r1 + 0*mmsize], m3 + pshufd m3, m3, 1 + movd [r1 + 2*mmsize], m3 + + packssdw m4, m4 + movd [r1 + 4*mmsize], m4 + pshufd m4, m4, 1 + movd [r1 + 6*mmsize], m4 + + ; odd + mova m2, m0 + pmuludq m2, [r4 + 2*16] + pshufd m7, m0, 0xF5 + movu m6, [r4 + 2*16 + 4] + pmuludq m7, m6 + pshufd m2, m2, 0x88 + pshufd m7, m7, 0x88 + punpckldq m2, m7 + mova m3, m1 + pmuludq m3, [r4 + 2*16] + pshufd m7, m1, 0xF5 + pmuludq m7, m6 + pshufd m3, m3, 0x88 + pshufd m7, m7, 0x88 + punpckldq m3, m7 + mova m4, m0 + pmuludq m4, [r4 + 3*16] + pshufd m7, m0, 0xF5 + movu m6, [r4 + 3*16 + 4] + pmuludq m7, m6 + pshufd m4, m4, 0x88 + pshufd m7, m7, 0x88 + punpckldq m4, m7 + mova m5, m1 + pmuludq m5, [r4 + 3*16] + pshufd m7, m1, 0xF5 + pmuludq m7, m6 + pshufd m5, m5, 0x88 + pshufd m7, m7, 0x88 + punpckldq m5, m7 + pshufd m2, m2, 0xD8 + pshufd m3, m3, 0xD8 + mova m7, m2 + punpckhqdq m7, m3 + punpcklqdq m2, m3 + paddd m2, m7 + pshufd m4, m4, 0xD8 + pshufd m5, m5, 0xD8 
+ mova m7, m4 + punpckhqdq m7, m5 + punpcklqdq m4, m5 + paddd m4, m7 + pshufd m2, m2, 0xD8 + pshufd m4, m4, 0xD8 + mova m7, m2 + punpckhqdq m7, m4 + punpcklqdq m2, m4 + paddd m2, m7 ; m2 = [Row3 Row1] + paddd m2, DCT_ADD2 + psrad m2, DCT_SHIFT2 + + packssdw m2, m2 + movd [r1 + 1*mmsize], m2 + pshufd m2, m2, 1 + movd [r1 + 3*mmsize], m2 + + mova m2, m0 + pmuludq m2, [r4 + 4*16] + pshufd m7, m0, 0xF5 + movu m6, [r4 + 4*16 + 4] + pmuludq m7, m6 + pshufd m2, m2, 0x88 + pshufd m7, m7, 0x88 + punpckldq m2, m7 + mova m3, m1 + pmuludq m3, [r4 + 4*16] + pshufd m7, m1, 0xF5 + pmuludq m7, m6 + pshufd m3, m3, 0x88 + pshufd m7, m7, 0x88 + punpckldq m3, m7 + mova m4, m0 + pmuludq m4, [r4 + 5*16] + pshufd m7, m0, 0xF5 + movu m6, [r4 + 5*16 + 4] + pmuludq m7, m6 + pshufd m4, m4, 0x88 + pshufd m7, m7, 0x88 + punpckldq m4, m7 + mova m5, m1 + pmuludq m5, [r4 + 5*16] + pshufd m7, m1, 0xF5 + pmuludq m7, m6 + pshufd m5, m5, 0x88 + pshufd m7, m7, 0x88 + punpckldq m5, m7 + pshufd m2, m2, 0xD8 + pshufd m3, m3, 0xD8 + mova m7, m2 + punpckhqdq m7, m3 + punpcklqdq m2, m3 + paddd m2, m7 + pshufd m4, m4, 0xD8 + pshufd m5, m5, 0xD8 + mova m7, m4 + punpckhqdq m7, m5 + punpcklqdq m4, m5 + paddd m4, m7 + pshufd m2, m2, 0xD8 + pshufd m4, m4, 0xD8 + mova m7, m2 + punpckhqdq m7, m4 + punpcklqdq m2, m4 + paddd m2, m7 ; m2 = [Row7 Row5] + paddd m2, DCT_ADD2 + psrad m2, DCT_SHIFT2 + + packssdw m2, m2 + movd [r1 + 5*mmsize], m2 + pshufd m2, m2, 1 + movd [r1 + 7*mmsize], m2 +%if x < 3 + add r1, mmsize/4 + add r0, 2*2*mmsize +%endif +%assign x x+1 +%endrep + + RET +%undef IDCT_SHIFT1 +%undef IDCT_ADD1 +%undef IDCT_SHIFT2 +%undef IDCT_ADD2 ;------------------------------------------------------- ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) diff -Nru x265-1.5/source/common/x86/dct8.h x265-1.6/source/common/x86/dct8.h --- x265-1.5/source/common/x86/dct8.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/dct8.h 2015-04-02 16:46:36.000000000 +0000 @@ -24,6 +24,7 @@ #ifndef 
X265_DCT8_H #define X265_DCT8_H void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride); +void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride); void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride); void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride); void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride); diff -Nru x265-1.5/source/common/x86/intrapred16.asm x265-1.6/source/common/x86/intrapred16.asm --- x265-1.5/source/common/x86/intrapred16.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/intrapred16.asm 2015-04-02 16:46:36.000000000 +0000 @@ -65,6 +65,10 @@ pw_planar16_1: dw 15, 15, 15, 15, 15, 15, 15, 15 pd_planar32_1: dd 31, 31, 31, 31 +pw_planar32_1: dw 31, 31, 31, 31, 31, 31, 31, 31 +pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24 +pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16 + const planar32_table %assign x 31 %rep 8 @@ -82,15 +86,19 @@ SECTION .text cextern pw_1 +cextern pw_2 cextern pw_4 cextern pw_8 cextern pw_16 +cextern pw_32 cextern pw_1023 cextern pd_16 cextern pd_32 cextern pw_4096 cextern multiL cextern multiH +cextern multiH2 +cextern multiH3 cextern multi_2Row cextern pw_swap cextern pb_unpackwq1 @@ -99,6 +107,592 @@ ;----------------------------------------------------------------------------------- ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) ;----------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc4, 5,6,2 + movh m0, [r2 + 18] ; sumAbove + movh m1, [r2 + 2] ; sumLeft + + paddw m0, m1 + pshuflw m1, m0, 0x4E + paddw m0, m1 + pshuflw m1, m0, 0xB1 + paddw m0, m1 + + test r4d, r4d + + paddw m0, [pw_4] + psraw m0, 3 + + ; store DC 4x4 + movh [r0], m0 + movh [r0 + r1 * 2], m0 + movh [r0 + r1 * 4], m0 + lea r5, [r0 + r1 * 4] + movh [r5 + r1 * 2], m0 + + ; do DC filter + jz .end + movh m1, m0 + psllw m1, 1 + paddw m1, [pw_2] 
+ movd r3d, m1 + paddw m0, m1 + ; filter top + movh m1, [r2 + 2] + paddw m1, m0 + psraw m1, 2 + movh [r0], m1 ; overwrite top-left pixel, we will update it later + + ; filter top-left + movzx r3d, r3w + movzx r4d, word [r2 + 18] + add r3d, r4d + movzx r4d, word [r2 + 2] + add r4d, r3d + shr r4d, 2 + mov [r0], r4w + + ; filter left + movu m1, [r2 + 20] + paddw m1, m0 + psraw m1, 2 + movd r3d, m1 + mov [r0 + r1 * 2], r3w + shr r3d, 16 + mov [r0 + r1 * 4], r3w + pextrw r3d, m1, 2 + mov [r5 + r1 * 2], r3w +.end: + RET + +;----------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) +;----------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc8, 5, 8, 2 + movu m0, [r2 + 34] + movu m1, [r2 + 2] + + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + pshufd m1, m0, 1 + paddw m0, m1 + pmaddwd m0, [pw_1] + + paddw m0, [pw_8] + psraw m0, 4 ; sum = sum / 16 + pshuflw m0, m0, 0 + pshufd m0, m0, 0 ; m0 = word [dc_val ...] 
+ + test r4d, r4d + + ; store DC 8x8 + lea r6, [r1 + r1 * 4] + lea r6, [r6 + r1] + lea r5, [r6 + r1 * 4] + lea r7, [r6 + r1 * 8] + movu [r0], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 4], m0 + movu [r0 + r6], m0 + movu [r0 + r1 * 8], m0 + movu [r0 + r5], m0 + movu [r0 + r6 * 2], m0 + movu [r0 + r7], m0 + + ; Do DC Filter + jz .end + mova m1, [pw_2] + pmullw m1, m0 + paddw m1, [pw_2] + movd r4d, m1 ; r4d = DC * 2 + 2 + paddw m1, m0 ; m1 = DC * 3 + 2 + pshuflw m1, m1, 0 + pshufd m1, m1, 0 ; m1 = pixDCx3 + + ; filter top + movu m0, [r2 + 2] + paddw m0, m1 + psraw m0, 2 + movu [r0], m0 + + ; filter top-left + movzx r4d, r4w + movzx r3d, word [r2 + 34] + add r4d, r3d + movzx r3d, word [r2 + 2] + add r3d, r4d + shr r3d, 2 + mov [r0], r3w + + ; filter left + movu m0, [r2 + 36] + paddw m0, m1 + psraw m0, 2 + movh r3, m0 + mov [r0 + r1 * 2], r3w + shr r3, 16 + mov [r0 + r1 * 4], r3w + shr r3, 16 + mov [r0 + r6], r3w + shr r3, 16 + mov [r0 + r1 * 8], r3w + pshufd m0, m0, 0x6E + movh r3, m0 + mov [r0 + r5], r3w + shr r3, 16 + mov [r0 + r6 * 2], r3w + shr r3, 16 + mov [r0 + r7], r3w +.end: + RET + +;------------------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter) +;------------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc16, 5, 10, 4 + lea r3, [r2 + 66] + add r1, r1 + movu m0, [r3] + movu m1, [r3 + 16] + movu m2, [r2 + 2] + movu m3, [r2 + 18] + + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + movhlps m1, m0 + paddw m0, m1 + pshuflw m1, m0, 0x6E + paddw m0, m1 + pmaddwd m0, [pw_1] + + paddw m0, [pw_16] + psraw m0, 5 + movd r5d, m0 + pshuflw m0, m0, 0 ; m0 = word [dc_val ...] 
+ pshufd m0, m0, 0 + + test r4d, r4d + + ; store DC 16x16 + lea r6, [r1 + r1 * 2] ;index 3 + lea r7, [r1 + r1 * 4] ;index 5 + lea r8, [r6 + r1 * 4] ;index 7 + lea r9, [r0 + r8] ;base + 7 + movu [r0], m0 + movu [r0 + 16], m0 + movu [r0 + r1], m0 + movu [r0 + 16 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r1 * 2 + 16], m0 + movu [r0 + r6], m0 + movu [r0 + r6 + 16], m0 + movu [r0 + r1 * 4], m0 + movu [r0 + r1 * 4 + 16], m0 + movu [r0 + r7], m0 + movu [r0 + r7 + 16], m0 + movu [r0 + r6 * 2], m0 + movu [r0 + r6 * 2 + 16], m0 + movu [r9], m0 + movu [r9 + 16], m0 + movu [r0 + r1 * 8], m0 + movu [r0 + r1 * 8 + 16], m0 + movu [r9 + r1 * 2], m0 + movu [r9 + r1 * 2 + 16], m0 + movu [r0 + r7 * 2], m0 + movu [r0 + r7 * 2 + 16], m0 + movu [r9 + r1 * 4], m0 + movu [r9 + r1 * 4 + 16], m0 + movu [r0 + r6 * 4], m0 + movu [r0 + r6 * 4 + 16], m0 + movu [r9 + r6 * 2], m0 + movu [r9 + r6 * 2 + 16], m0 + movu [r9 + r8], m0 + movu [r9 + r8 + 16], m0 + movu [r9 + r1 * 8], m0 + movu [r9 + r1 * 8 + 16], m0 + + ; Do DC Filter + jz .end + mova m1, [pw_2] + pmullw m1, m0 + paddw m1, [pw_2] + movd r4d, m1 + paddw m1, m0 + + ; filter top + movu m2, [r2 + 2] + paddw m2, m1 + psraw m2, 2 + movu [r0], m2 + movu m3, [r2 + 18] + paddw m3, m1 + psraw m3, 2 + movu [r0 + 16], m3 + + ; filter top-left + movzx r4d, r4w + movzx r5d, word [r3] + add r4d, r5d + movzx r5d, word [r2 + 2] + add r5d, r4d + shr r5d, 2 + mov [r0], r5w + + ; filter left + movu m2, [r3 + 2] + paddw m2, m1 + psraw m2, 2 + + movq r2, m2 + pshufd m2, m2, 0xEE + mov [r0 + r1], r2w + shr r2, 16 + mov [r0 + r1 * 2], r2w + shr r2, 16 + mov [r0 + r6], r2w + shr r2, 16 + mov [r0 + r1 * 4], r2w + movq r2, m2 + mov [r0 + r7], r2w + shr r2, 16 + mov [r0 + r6 * 2], r2w + shr r2, 16 + mov [r9], r2w + shr r2, 16 + mov [r0 + r1 * 8], r2w + + movu m3, [r3 + 18] + paddw m3, m1 + psraw m3, 2 + + movq r3, m3 + pshufd m3, m3, 0xEE + mov [r9 + r1 * 2], r3w + shr r3, 16 + mov [r0 + r7 * 2], r3w + shr r3, 16 + mov [r9 + r1 * 4], r3w + shr r3, 16 + mov 
[r0 + r6 * 4], r3w + movq r3, m3 + mov [r9 + r6 * 2], r3w + shr r3, 16 + mov [r9 + r8], r3w + shr r3, 16 + mov [r9 + r1 * 8], r3w +.end: + RET + +;------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter) +;------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_dc32, 3, 4, 6 + lea r3, [r2 + 130] ;130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel) + add r2, 2 + add r1, r1 + movu m0, [r3] + movu m1, [r3 + 16] + movu m2, [r3 + 32] + movu m3, [r3 + 48] + paddw m0, m1 + paddw m2, m3 + paddw m0, m2 + movu m1, [r2] + movu m3, [r2 + 16] + movu m4, [r2 + 32] + movu m5, [r2 + 48] + paddw m1, m3 + paddw m4, m5 + paddw m1, m4 + paddw m0, m1 + movhlps m1, m0 + paddw m0, m1 + pshuflw m1, m0, 0x6E + paddw m0, m1 + pmaddwd m0, [pw_1] + + paddd m0, [pd_32] ; sum = sum + 32 + psrld m0, 6 ; sum = sum / 64 + pshuflw m0, m0, 0 + pshufd m0, m0, 0 + + lea r2, [r1 * 3] + ; store DC 32x32 +%assign x 1 +%rep 8 + movu [r0 + 0], m0 + movu [r0 + 16], m0 + movu [r0 + 32], m0 + movu [r0 + 48], m0 + movu [r0 + r1 + 0], m0 + movu [r0 + r1 + 16], m0 + movu [r0 + r1 + 32], m0 + movu [r0 + r1 + 48], m0 + movu [r0 + r1 * 2 + 0], m0 + movu [r0 + r1 * 2 + 16], m0 + movu [r0 + r1 * 2 + 32], m0 + movu [r0 + r1 * 2 + 48], m0 + movu [r0 + r2 + 0], m0 + movu [r0 + r2 + 16], m0 + movu [r0 + r2 + 32], m0 + movu [r0 + r2 + 48], m0 + %if x < 8 + lea r0, [r0 + r1 * 4] + %endif +%assign x x + 1 +%endrep + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar8, 3,3,5 + movu m1, [r2 + 2] + movu m2, [r2 + 34] + + movd m3, [r2 + 18] ; topRight = above[8]; + movd m4, [r2 + 
50] ; bottomLeft = left[8]; + + pshuflw m3, m3, 0 + pshuflw m4, m4, 0 + pshufd m3, m3, 0 ; v_topRight + pshufd m4, m4, 0 ; v_bottomLeft + + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x] + paddw m3, [pw_8] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 + +%macro INTRA_PRED_PLANAR_8 1 +%if (%1 < 4) + pshuflw m1, m2, 0x55 * %1 + pshufd m1, m1, 0 +%else + pshufhw m1, m2, 0x55 * (%1 - 4) + pshufd m1, m1, 0xAA +%endif + pmullw m1, [pw_planar8_0] + paddw m1, m3 + psraw m1, 4 + movu [r0], m1 +%if (%1 < 7) + paddw m3, m4 + lea r0, [r0 + r1 * 2] +%endif +%endmacro + + INTRA_PRED_PLANAR_8 0 + INTRA_PRED_PLANAR_8 1 + INTRA_PRED_PLANAR_8 2 + INTRA_PRED_PLANAR_8 3 + INTRA_PRED_PLANAR_8 4 + INTRA_PRED_PLANAR_8 5 + INTRA_PRED_PLANAR_8 6 + INTRA_PRED_PLANAR_8 7 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar16, 3,3,8 + movu m2, [r2 + 2] + movu m7, [r2 + 18] + + movd m3, [r2 + 34] ; topRight = above[16] + movd m6, [r2 + 98] ; bottomLeft = left[16] + + pshuflw m3, m3, 0 + pshuflw m6, m6, 0 + pshufd m3, m3, 0 ; v_topRight + pshufd m6, m6, 0 ; v_bottomLeft + + pmullw m4, m3, [multiH] ; (x + 1) * topRight + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x] + pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x] + paddw m4, [pw_16] + paddw m3, [pw_16] + paddw m4, m6 + paddw m3, m6 + paddw m4, m5 + paddw m3, m1 + psubw m1, m6, m7 + psubw m6, m2 + + movu m2, [r2 + 66] + movu m7, [r2 + 82] + +%macro INTRA_PRED_PLANAR_16 1 +%if (%1 < 4) + pshuflw m5, m2, 0x55 * %1 + pshufd m5, m5, 0 +%else +%if (%1 < 8) + pshufhw m5, m2, 0x55 * (%1 - 4) + pshufd m5, m5, 0xAA +%else +%if (%1 < 12) + pshuflw m5, m7, 0x55 * (%1 
- 8) + pshufd m5, m5, 0 +%else + pshufhw m5, m7, 0x55 * (%1 - 12) + pshufd m5, m5, 0xAA +%endif +%endif +%endif +%if (%1 > 0) + paddw m3, m6 + paddw m4, m1 + lea r0, [r0 + r1 * 2] +%endif + pmullw m0, m5, [pw_planar8_0] + pmullw m5, [pw_planar16_0] + paddw m0, m4 + paddw m5, m3 + psraw m5, 5 + psraw m0, 5 + movu [r0], m5 + movu [r0 + 16], m0 +%endmacro + + INTRA_PRED_PLANAR_16 0 + INTRA_PRED_PLANAR_16 1 + INTRA_PRED_PLANAR_16 2 + INTRA_PRED_PLANAR_16 3 + INTRA_PRED_PLANAR_16 4 + INTRA_PRED_PLANAR_16 5 + INTRA_PRED_PLANAR_16 6 + INTRA_PRED_PLANAR_16 7 + INTRA_PRED_PLANAR_16 8 + INTRA_PRED_PLANAR_16 9 + INTRA_PRED_PLANAR_16 10 + INTRA_PRED_PLANAR_16 11 + INTRA_PRED_PLANAR_16 12 + INTRA_PRED_PLANAR_16 13 + INTRA_PRED_PLANAR_16 14 + INTRA_PRED_PLANAR_16 15 + RET + +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar32, 3,3,16 + movd m3, [r2 + 66] ; topRight = above[32] + + pshuflw m3, m3, 0x00 + pshufd m3, m3, 0x44 + + pmullw m0, m3, [multiL] ; (x + 1) * topRight + pmullw m1, m3, [multiH] ; (x + 1) * topRight + pmullw m2, m3, [multiH2] ; (x + 1) * topRight + pmullw m3, [multiH3] ; (x + 1) * topRight + + movd m6, [r2 + 194] ; bottomLeft = left[32] + pshuflw m6, m6, 0x00 + pshufd m6, m6, 0x44 + mova m5, m6 + paddw m5, [pw_32] + + paddw m0, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + mova m8, m6 + mova m9, m6 + mova m10, m6 + + mova m12, [pw_planar32_1] + movu m4, [r2 + 2] + psubw m8, m4 + pmullw m4, m12 + paddw m0, m4 + + movu m5, [r2 + 18] + psubw m9, m5 + pmullw m5, m12 + paddw m1, m5 + + movu m4, [r2 + 34] + psubw m10, m4 + pmullw m4, m12 + paddw m2, m4 + + movu m5, [r2 + 50] + psubw m6, m5 + pmullw m5, m12 + paddw m3, m5 + + mova m12, [pw_planar32_L] + mova m13, [pw_planar32_H] + mova m14, [pw_planar16_0] + 
mova m15, [pw_planar8_0] + add r1, r1 + +%macro PROCESS 1 + pmullw m5, %1, m12 + pmullw m11, %1, m13 + paddw m5, m0 + paddw m11, m1 + psrlw m5, 6 + psrlw m11, 6 + movu [r0], m5 + movu [r0 + 16], m11 + + pmullw m5, %1, m14 + pmullw %1, m15 + paddw m5, m2 + paddw %1, m3 + psrlw m5, 6 + psrlw %1, 6 + movu [r0 + 32], m5 + movu [r0 + 48], %1 +%endmacro + +%macro INCREMENT 0 + paddw m2, m10 + paddw m3, m6 + paddw m0, m8 + paddw m1, m9 + add r0, r1 +%endmacro + + add r2, 130 ;130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel) +%assign x 0 +%rep 4 + movu m4, [r2] + add r2, 16 +%assign y 0 +%rep 8 + %if y < 4 + pshuflw m7, m4, 0x55 * y + pshufd m7, m7, 0x44 + %else + pshufhw m7, m4, 0x55 * (y - 4) + pshufd m7, m7, 0xEE + %endif + PROCESS m7 + %if x + y < 10 + INCREMENT + %endif +%assign y y+1 +%endrep +%assign x x+1 +%endrep + RET + +;----------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) +;----------------------------------------------------------------------------------- INIT_XMM sse4 cglobal intra_pred_dc4, 5,6,2 lea r3, [r2 + 18] @@ -160,6 +754,55 @@ .end: RET +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal intra_pred_planar4, 3,3,5 + movu m1, [r2 + 2] + movu m2, [r2 + 18] + pshufhw m3, m1, 0 ; topRight + pshufd m3, m3, 0xAA + pshufhw m4, m2, 0 ; bottomLeft + pshufd m4, m4, 0xAA + + pmullw m3, [multi_2Row] ; (x + 1) * topRight + pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x] + + paddw m3, [pw_4] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 + + pshuflw m1, m2, 0 + pmullw m1, [pw_planar4_0] + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0], m1 + + pshuflw m1, m2, 01010101b + pmullw m1, [pw_planar4_0] + paddw 
m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0 + r1 * 2], m1 + lea r0, [r0 + 4 * r1] + + pshuflw m1, m2, 10101010b + pmullw m1, [pw_planar4_0] + paddw m1, m3 + paddw m3, m4 + psraw m1, 3 + movh [r0], m1 + + pshuflw m1, m2, 11111111b + pmullw m1, [pw_planar4_0] + paddw m1, m3 + psraw m1, 3 + movh [r0 + r1 * 2], m1 + RET + ;----------------------------------------------------------------------------------- ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel* above, int, int filter) ;----------------------------------------------------------------------------------- @@ -378,7 +1021,7 @@ ;------------------------------------------------------------------------------------------- INIT_XMM sse4 cglobal intra_pred_dc32, 3, 5, 6 - lea r3, [r2 + 130] + lea r3, [r2 + 130] ;130 = 32*sizeof(pixel)*2 + 1*sizeof(pixel) add r2, 2 add r1, r1 movu m0, [r3] diff -Nru x265-1.5/source/common/x86/intrapred8_allangs.asm x265-1.6/source/common/x86/intrapred8_allangs.asm --- x265-1.5/source/common/x86/intrapred8_allangs.asm 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/source/common/x86/intrapred8_allangs.asm 2015-04-02 16:46:36.000000000 +0000 @@ -0,0 +1,23008 @@ +;***************************************************************************** +;* Copyright (C) 2013 x265 project +;* +;* Authors: Min Chen +;* Praveen Tiwari +;* +;* This program is free software; you can redistribute it and/or modify +;* it under the terms of the GNU General Public License as published by +;* the Free Software Foundation; either version 2 of the License, or +;* (at your option) any later version. +;* +;* This program is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;* GNU General Public License for more details. 
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+SECTION .text
+
+; global constant
+cextern pw_1024
+
+; common constant with intrapred8.asm
+cextern ang_table
+cextern tab_S1
+cextern tab_S2
+cextern tab_Si
+
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;
+; Produces the 4x4 prediction of every angular mode 2..34 in a single call.
+; Output layout: mode M occupies the 16 bytes at dest + (M - 2) * 16, written
+; as four 4-byte groups (528 bytes in total).
+; Input: only refPix (r1) is read.  filtPix (r2) is never referenced, and
+; r3 (bLuma) is overwritten with the ang_table address before it is read, so
+; the first-byte adjustment in modes 10 and 26 is applied unconditionally.
+; Interpolated groups are computed as pmaddubsw against the table entry at
+; [r3 + f * 16] followed by pmulhrsw with pw_1024.
+; NOTE(review): pw_1024 and ang_table are defined elsewhere; presumably
+; pw_1024 is 1024 in every word (making pmulhrsw a rounded >> 5) and entry f
+; of ang_table holds the byte weight pairs (32 - f, f) -- confirm.
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal all_angs_pred_4x4, 4, 4, 8
+
+; mode 2
+
+movh        m0, [r1 + 10]
+movd        [r0], m0
+
+palignr     m1, m0, 1
+movd        [r0 + 4], m1
+
+palignr     m1, m0, 2
+movd        [r0 + 8], m1
+
+palignr     m1, m0, 3
+movd        [r0 + 12], m1
+
+; mode 3
+
+mova        m2, [pw_1024]
+
+pslldq      m1, m0, 1
+pinsrb      m1, [r1 + 9], 0
+punpcklbw   m1, m0
+
+; r3 has not been read up to here; from now on it is the ang_table base
+lea         r3, [ang_table]
+
+pmaddubsw   m6, m1, [r3 + 26 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 16], m6
+
+palignr     m0, m1, 2
+
+mova        m7, [r3 + 20 * 16]
+
+pmaddubsw   m3, m0, m7
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 20], m3
+
+; mode 6 [row 3]
+movd        [r0 + 76], m3
+
+palignr     m3, m1, 4
+
+pmaddubsw   m4, m3, [r3 + 14 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 24], m4
+
+palignr     m4, m1, 6
+
+pmaddubsw   m4, [r3 + 8 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 28], m4
+
+; mode 4
+
+pmaddubsw   m5, m1, [r3 + 21 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 32], m5
+
+pmaddubsw   m5, m0, [r3 + 10 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 36], m5
+
+pmaddubsw   m5, m0, [r3 + 31 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 40], m5
+
+pmaddubsw   m4, m3, m7
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 44], m4
+
+; mode 5
+
+pmaddubsw   m5, m1, [r3 + 17 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 48], m5
+
+pmaddubsw   m5, m0, [r3 + 2 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 52], m5
+
+pmaddubsw   m5, m0, [r3 + 19 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 56], m5
+
+pmaddubsw   m4, m3, [r3 + 4 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 60], m4
+
+; mode 6
+
+pmaddubsw   m5, m1, [r3 + 13 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 64], m5
+
+; m6 still holds the first result computed under "mode 3" above
+movd        [r0 + 68], m6
+
+pmaddubsw   m5, m0, [r3 + 7 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 72], m5
+
+; mode 7
+
+pmaddubsw   m5, m1, [r3 + 9 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 80], m5
+
+pmaddubsw   m5, m1, [r3 + 18 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 84], m5
+
+pmaddubsw   m5, m1, [r3 + 27 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 88], m5
+
+pmaddubsw   m5, m0, [r3 + 4 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 92], m5
+
+; mode 8
+
+pmaddubsw   m5, m1, [r3 + 5 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 96], m5
+
+pmaddubsw   m5, m1, [r3 + 10 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 100], m5
+
+pmaddubsw   m5, m1, [r3 + 15 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 104], m5
+
+pmaddubsw   m5, m1, [r3 + 20 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 108], m5
+
+; mode 9
+
+pmaddubsw   m5, m1, [r3 + 2 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 112], m5
+
+pmaddubsw   m5, m1, [r3 + 4 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 116], m5
+
+pmaddubsw   m5, m1, [r3 + 6 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 120], m5
+
+pmaddubsw   m5, m1, [r3 + 8 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 124], m5
+
+; mode 10
+
+movd        m3, [r1 + 9]
+pshufd      m4, m3, 0
+movu        [r0 + 128], m4
+
+pxor        m5, m5
+movd        m7, [r1 + 1]
+pshufd      m4, m7, 0
+punpcklbw   m4, m5
+
+pinsrb      m7, [r1], 0
+pshufb      m6, m7, m5
+punpcklbw   m6, m5
+
+psubw       m4, m6
+psraw       m4, 1
+
+pshufb      m6, m3, m5
+punpcklbw   m6, m5
+
+paddw       m4, m6
+packuswb    m4, m5
+
+; overwrite the first byte of each 4-byte group with the adjusted value
+pextrb      [r0 + 128], m4, 0
+pextrb      [r0 + 132], m4, 1
+pextrb      [r0 + 136], m4, 2
+pextrb      [r0 + 140], m4, 3
+
+; mode 11
+
+pslldq      m1, m1, 2
+pinsrb      m1, [r1], 0
+pinsrb      m1, [r1 + 9], 1
+
+pmaddubsw   m3, m1, [r3 + 30 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 144], m3
+
+pmaddubsw   m3, m1, [r3 + 28 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 148], m3
+
+pmaddubsw   m3, m1, [r3 + 26 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 152], m3
+
+pmaddubsw   m3, m1, [r3 + 24 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 156], m3
+
+; mode 12
+
+pmaddubsw   m3, m1, [r3 + 27 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 160], m3
+
+pmaddubsw   m3, m1, [r3 + 22 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 164], m3
+
+pmaddubsw   m3, m1, [r3 + 17 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 168], m3
+
+pmaddubsw   m3, m1, [r3 + 12 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 172], m3
+
+; mode 13
+
+pmaddubsw   m3, m1, [r3 + 23 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 176], m3
+
+pmaddubsw   m3, m1, [r3 + 14 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 180], m3
+
+pmaddubsw   m3, m1, [r3 + 5 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 184], m3
+
+pslldq      m5, m1, 2
+pinsrb      m5, [r1 + 0], 1
+pinsrb      m5, [r1 + 4], 0
+
+pmaddubsw   m4, m5, [r3 + 28 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 188], m4
+
+; mode 14
+
+pmaddubsw   m4, m1, [r3 + 19 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 192], m4
+
+pmaddubsw   m7, m1, [r3 + 6 * 16]
+pmulhrsw    m7, m2
+packuswb    m7, m7
+movd        [r0 + 196], m7
+
+pinsrb      m5, [r1 + 2], 0
+
+pmaddubsw   m4, m5, [r3 + 25 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 200], m4
+
+pmaddubsw   m4, m5, [r3 + 12 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 204], m4
+
+; mode 15
+
+pmaddubsw   m4, m1, [r3 + 15 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 208], m4
+
+pmaddubsw   m4, m5, [r3 + 30 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 212], m4
+
+pmaddubsw   m4, m5, [r3 + 13 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 216], m4
+
+pslldq      m4, m5, 2
+pinsrb      m4, [r1 + 2], 1
+pinsrb      m4, [r1 + 4], 0
+
+pmaddubsw   m6, m4, [r3 + 28 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 220], m6
+
+; mode 16
+
+pmaddubsw   m6, m1, [r3 + 11 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 224], m6
+
+pmaddubsw   m6, m5, [r3 + 22 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 228], m6
+
+pmaddubsw   m6, m5, [r3 + 1 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 232], m6
+
+pinsrb      m4, [r1 + 3], 0
+
+pmaddubsw   m4, [r3 + 12 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 236], m4
+
+; mode 17
+
+; m7 still holds the result stored at [r0 + 196] under "mode 14"
+movd        [r0 + 240], m7
+
+pslldq      m1, 2
+pinsrb      m1, [r1 + 1], 0
+pinsrb      m1, [r1 + 0], 1
+
+pmaddubsw   m3, m1, [r3 + 12 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 244], m3
+
+pslldq      m1, 2
+pinsrb      m1, [r1 + 1], 1
+pinsrb      m1, [r1 + 2], 0
+
+pmaddubsw   m3, m1, [r3 + 18 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 248], m3
+
+pslldq      m1, 2
+pinsrb      m1, [r1 + 2], 1
+pinsrb      m1, [r1 + 4], 0
+
+pmaddubsw   m1, [r3 + 24 * 16]
+pmulhrsw    m1, m2
+packuswb    m1, m1
+movd        [r0 + 252], m1
+
+; mode 18
+
+movh        m1, [r1]
+movd        [r0 + 256], m1
+
+pslldq      m3, m1, 1
+pinsrb      m3, [r1 + 9], 0
+movd        [r0 + 260], m3
+
+pslldq      m4, m3, 1
+pinsrb      m4, [r1 + 10], 0
+movd        [r0 + 264], m4
+
+pslldq      m4, 1
+pinsrb      m4, [r1 + 11], 0
+movd        [r0 + 268], m4
+
+; mode 19
+
+palignr     m3, m1, 1
+punpcklbw   m1, m3
+
+pmaddubsw   m7, m1, [r3 + 6 * 16]
+pmulhrsw    m7, m2
+packuswb    m7, m7
+movd        [r0 + 272], m7
+
+pslldq      m3, m1, 2
+pinsrb      m3, [r1], 1
+pinsrb      m3, [r1 + 9], 0
+
+pmaddubsw   m4, m3, [r3 + 12 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 276], m4
+
+pslldq      m4, m3, 2
+pinsrb      m4, [r1 + 9], 1
+pinsrb      m4, [r1 + 10], 0
+
+pmaddubsw   m5, m4, [r3 + 18 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 280], m5
+
+pslldq      m4, 2
+pinsrb      m4, [r1 + 10], 1
+pinsrb      m4, [r1 + 12], 0
+
+pmaddubsw   m4, [r3 + 24 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 284], m4
+
+; mode 20
+
+pmaddubsw   m4, m1, [r3 + 11 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 288], m4
+
+pinsrb      m3, [r1 + 10], 0
+
+pmaddubsw   m4, m3, [r3 + 22 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 292], m4
+
+pmaddubsw   m4, m3, [r3 + 1 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 296], m4
+
+pslldq      m6, m3, 2
+pinsrb      m6, [r1 + 10], 1
+pinsrb      m6, [r1 + 11], 0
+
+pmaddubsw   m5, m6, [r3 + 12 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 300], m5
+
+; mode 21
+
+pmaddubsw   m4, m1, [r3 + 15 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 304], m4
+
+pmaddubsw   m4, m3, [r3 + 30 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 308], m4
+
+pmaddubsw   m4, m3, [r3 + 13 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 312], m4
+
+pinsrb      m6, [r1 + 12], 0
+
+pmaddubsw   m6, [r3 + 28 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 316], m6
+
+; mode 22
+
+pmaddubsw   m4, m1, [r3 + 19 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 320], m4
+
+; m7 still holds the result stored at [r0 + 272] under "mode 19"
+movd        [r0 + 324], m7
+
+pmaddubsw   m4, m3, [r3 + 25 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 328], m4
+
+pmaddubsw   m4, m3, [r3 + 12 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 332], m4
+
+; mode 23
+
+pmaddubsw   m4, m1, [r3 + 23 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 336], m4
+
+pmaddubsw   m4, m1, [r3 + 14 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 340], m4
+
+pmaddubsw   m4, m1, [r3 + 5 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 344], m4
+
+pinsrb      m3, [r1 + 12], 0
+
+pmaddubsw   m3, [r3 + 28 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 348], m3
+
+; mode 24
+
+pmaddubsw   m3, m1, [r3 + 27 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 352], m3
+
+pmaddubsw   m3, m1, [r3 + 22 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 356], m3
+
+pmaddubsw   m3, m1, [r3 + 17 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 360], m3
+
+pmaddubsw   m3, m1, [r3 + 12 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 364], m3
+
+; mode 25
+
+pmaddubsw   m3, m1, [r3 + 30 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 368], m3
+
+pmaddubsw   m3, m1, [r3 + 28 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 372], m3
+
+pmaddubsw   m3, m1, [r3 + 26 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 376], m3
+
+pmaddubsw   m1, [r3 + 24 * 16]
+pmulhrsw    m1, m2
+packuswb    m1, m1
+movd        [r0 + 380], m1
+
+; mode 26
+
+movh        m1, [r1 + 1]
+pshufd      m3, m1, 0
+movu        [r0 + 384], m3
+
+pxor        m4, m4
+movd        m5, [r1 + 9]
+pshufd      m5, m5, 0
+punpcklbw   m5, m4
+
+pinsrb      m6, [r1], 0
+pshufb      m6, m4
+punpcklbw   m6, m4
+
+psubw       m5, m6
+psraw       m5, 1
+
+pshufb      m6, m1, m4
+punpcklbw   m6, m4
+
+paddw       m5, m6
+packuswb    m5, m4
+
+; overwrite the first byte of each 4-byte group with the adjusted value
+pextrb      [r0 + 384], m5, 0
+pextrb      [r0 + 388], m5, 1
+pextrb      [r0 + 392], m5, 2
+pextrb      [r0 + 396], m5, 3
+
+; mode 27
+
+palignr     m3, m1, 1
+punpcklbw   m1, m3
+
+pmaddubsw   m3, m1, [r3 + 2 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 400], m3
+
+pmaddubsw   m3, m1, [r3 + 4 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 404], m3
+
+pmaddubsw   m3, m1, [r3 + 6 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 408], m3
+
+pmaddubsw   m3, m1, [r3 + 8 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 412], m3
+
+; mode 28
+
+pmaddubsw   m3, m1, [r3 + 5 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 416], m3
+
+pmaddubsw   m3, m1, [r3 + 10 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 420], m3
+
+pmaddubsw   m3, m1, [r3 + 15 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 424], m3
+
+pmaddubsw   m3, m1, [r3 + 20 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 428], m3
+
+; mode 29
+
+pmaddubsw   m3, m1, [r3 + 9 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 432], m3
+
+pmaddubsw   m3, m1, [r3 + 18 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 436], m3
+
+pmaddubsw   m3, m1, [r3 + 27 * 16]
+pmulhrsw    m3, m2
+packuswb    m3, m3
+movd        [r0 + 440], m3
+
+palignr     m3, m1, 2
+
+pmaddubsw   m4, m3, [r3 + 4 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 444], m4
+
+; mode 30
+
+pmaddubsw   m4, m1, [r3 + 13 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 448], m4
+
+pmaddubsw   m7, m1, [r3 + 26 * 16]
+pmulhrsw    m7, m2
+packuswb    m7, m7
+movd        [r0 + 452], m7
+
+pmaddubsw   m5, m3, [r3 + 7 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 456], m5
+
+pmaddubsw   m6, m3, [r3 + 20 * 16]
+pmulhrsw    m6, m2
+packuswb    m6, m6
+movd        [r0 + 460], m6
+
+; mode 31
+
+pmaddubsw   m4, m1, [r3 + 17 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 464], m4
+
+pmaddubsw   m5, m3, [r3 + 2 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 468], m5
+
+pmaddubsw   m5, m3, [r3 + 19 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 472], m5
+
+palignr     m4, m3, 2
+
+pmaddubsw   m5, m4, [r3 + 4 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 476], m5
+
+; mode 32
+
+pmaddubsw   m5, m1, [r3 + 21 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 480], m5
+
+pmaddubsw   m5, m3, [r3 + 10 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 484], m5
+
+pmaddubsw   m5, m3, [r3 + 31 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 488], m5
+
+pmaddubsw   m5, m4, [r3 + 20 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 492], m5
+
+; mode 33
+
+; m7 and m6 still hold the results stored at [r0 + 452] and [r0 + 460]
+; under "mode 30"
+movd        [r0 + 496], m7
+
+movd        [r0 + 500], m6
+
+pmaddubsw   m5, m4, [r3 + 14 * 16]
+pmulhrsw    m5, m2
+packuswb    m5, m5
+movd        [r0 + 504], m5
+
+psrldq      m4, 2
+
+pmaddubsw   m4, [r3 + 8 * 16]
+pmulhrsw    m4, m2
+packuswb    m4, m4
+movd        [r0 + 508], m4
+
+; mode 34
+
+movh        m7, [r1 + 2]
+movd        [r0 + 512], m7
+
+psrldq      m7, 1
+movd        [r0 + 516], m7
+
+psrldq      m7, 1
+movd        [r0 + 520], m7
+
+psrldq      m7, 1
+movd        [r0 + 524], m7
+
+RET
+
+;------------------------------------------------------------------------------
+; void all_angs_pred_8x8(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;
+; Produces the 8x8 prediction of every angular mode 2..34 in a single call.
+; Output layout: mode M occupies the 64 bytes at dest + (M - 2) * 64, as
+; eight 8-byte rows (2112 bytes in total).  Most stores write two rows at
+; once with a 16-byte movu; movh / movhps write a single row.
+; Input: modes 2, 18 and 34 read filtPix (r2); all other modes read refPix
+; (r1).  cglobal loads only three arguments; r3 is used purely as the
+; ang_table base, so bLuma is not consulted and the first-byte adjustment in
+; modes 10 and 26 is applied unconditionally.
+; Rows are not produced in mode order: a result shared by several modes is
+; stored to each destination while it is still live in a register, which is
+; why the "mode N [row ...]" labels below jump between modes.
+; NOTE(review): pw_1024, ang_table and tab_Si are defined elsewhere;
+; presumably pmulhrsw with pw_1024 acts as a rounded >> 5 -- confirm.
+;------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal all_angs_pred_8x8, 3,4,8
+    ; mode 2
+
+    movu        m0, [r2 + 18]
+    palignr     m1, m0, 1
+    punpcklqdq  m2, m0, m1
+    movu        [r0], m2
+
+    palignr     m1, m0, 2
+    palignr     m2, m0, 3
+    punpcklqdq  m1, m2
+    movu        [r0 + 16], m1
+
+    palignr     m1, m0, 4
+    palignr     m2, m0, 5
+    punpcklqdq  m1, m2
+    movu        [r0 + 32], m1
+
+    palignr     m1, m0, 6
+    palignr     m2, m0, 7
+    punpcklqdq  m1, m2
+    movu        [r0 + 48], m1
+
+    ; mode 3 [row 0, 1]
+
+    mova        m7, [pw_1024]
+    lea         r3, [ang_table]
+
+    movu        m0, [r1 + 17]
+
+    palignr     m1, m0, 1
+    palignr     m2, m0, 2
+
+    punpcklbw   m3, m0, m1
+    pmaddubsw   m4, m3, [r3 + 26 * 16]
+    pmulhrsw    m4, m7
+
+    punpcklbw   m1, m2
+    pmaddubsw   m5, m1, [r3 + 20 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+
+    movu        [r0 + 64], m4
+
+    ; mode 6 [row 1]
+
+    movh        [r0 + 264], m4
+
+    ; mode 6 [row 3]
+
+    movhps      [r0 + 280], m4
+
+    ; mode 4 [row 0, 1]
+
+    pmaddubsw   m4, m3, [r3 + 21 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 10 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 128], m4
+
+    ; mode 5 [row 0, 1]
+
+    pmaddubsw   m4, m3, [r3 + 17 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 2 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 192], m4
+
+    ; mode 6 [row 0]
+
+    pmaddubsw   m4, m3, [r3 + 13 * 16]
+    pmulhrsw    m4, m7
+
+    pxor        m5, m5
+
+    packuswb    m4, m5
+    movh        [r0 + 256], m4
+
+    ; mode 7 [row 0, 1]
+
+    pmaddubsw   m4, m3, [r3 + 9 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 18 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 320], m4
+
+    ; mode 8 [row 0, 1]
+
+    pmaddubsw   m4, m3, [r3 + 5 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 10 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 384], m4
+
+    ; mode 8 [row 2, 3]
+
+    pmaddubsw   m4, m3, [r3 + 15 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 20 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 400], m4
+
+    ; mode 8 [row 4, 5]
+
+    pmaddubsw   m4, m3, [r3 + 25 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 30 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 416], m4
+
+    ; mode 8 [row 6, 7]
+
+    pmaddubsw   m4, m1, [r3 + 3 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 8 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 432], m4
+
+    ; mode 9 [row 0, 1]
+
+    pmaddubsw   m4, m3, [r3 + 2 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 4 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 448], m4
+
+    ; mode 9 [row 2, 3]
+
+    pmaddubsw   m4, m3, [r3 + 6 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 8 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 464], m4
+
+    ; mode 9 [row 4, 5]
+
+    pmaddubsw   m4, m3, [r3 + 10 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 12 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 480], m4
+
+    ; mode 9 [row 6, 7]
+
+    pmaddubsw   m4, m3, [r3 + 14 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 16 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 496], m4
+
+    ; mode 7 [row 2, 3]
+
+    pmaddubsw   m4, m3, [r3 + 27 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 4 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 336], m4
+
+    ; mode 7 [row 4, 5]
+
+    pmaddubsw   m4, m1, [r3 + 13 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 22 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 352], m4
+
+    ; mode 6 [row 2]
+
+    pmaddubsw   m4, m1, [r3 + 7 * 16]
+    pmulhrsw    m4, m7
+
+    pxor        m5, m5
+
+    packuswb    m4, m5
+    movh        [r0 + 272], m4
+
+    ; mode 3 [row 2, 3]
+
+    palignr     m1, m0, 3
+    palignr     m3, m0, 4
+
+    punpcklbw   m2, m1
+    pmaddubsw   m5, m2, [r3 + 14 * 16]
+    pmulhrsw    m5, m7
+
+    punpcklbw   m1, m3
+    pmaddubsw   m6, m1, [r3 + 8 * 16]
+    pmulhrsw    m6, m7
+
+    packuswb    m5, m6
+    movu        [r0 + 80], m5
+
+    ; mode 6 [row 7]
+
+    movhps      [r0 + 312], m5
+
+    ; mode 6 [row 5]
+
+    movh        [r0 + 296], m5
+
+    ; mode 4 [calculate and store row 4, 5]
+
+    pmaddubsw   m4, m1, [r3 + 9 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 30 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 160], m4
+
+    ; mode 5 [row 4, 5]
+
+    pmaddubsw   m4, m2, [r3 + 21 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 6 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 224], m4
+
+    ; mode 6 [row 4] (row 5 was already stored at [r0 + 296] above)
+
+    pmaddubsw   m5, m2, [r3 + 1 * 16]
+    pmulhrsw    m5, m7
+
+    pxor        m6, m6
+
+    packuswb    m5, m6
+    movh        [r0 + 288], m5
+
+    ; mode 6 [row 6] (row 7 was already stored at [r0 + 312] above)
+
+    pmaddubsw   m5, m2, [r3 + 27 * 16]
+    pmulhrsw    m5, m7
+
+    pxor        m6, m6
+
+    packuswb    m5, m6
+    movh        [r0 + 304], m5
+
+    ; mode 5 [calculate row 6]
+
+    pmaddubsw   m6, m1, [r3 + 23 * 16]
+    pmulhrsw    m6, m7
+
+    ; mode 3 [row 4, 5]
+
+    palignr     m1, m0, 5
+
+    punpcklbw   m3, m1
+    pmaddubsw   m4, m3, [r3 + 2 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 28 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 96], m4
+
+    ; mode 4 [calculate row 6] (low half of the store at [r0 + 176] below)
+
+    pmaddubsw   m5, m3, [r3 + 19 * 16]
+    pmulhrsw    m5, m7
+
+    ; mode 5 [calculate row 7, store row 6 and 7]
+
+    pmaddubsw   m4, m3, [r3 + 8 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m6, m4
+    movu        [r0 + 240], m6
+
+    ; mode 3 [row 6, 7]
+
+    palignr     m2, m0, 6
+    palignr     m3, m0, 7
+
+    punpcklbw   m1, m2
+    pmaddubsw   m4, m1, [r3 + 22 * 16]
+    pmulhrsw    m4, m7
+
+    punpcklbw   m2, m3
+    pmaddubsw   m2, [r3 + 16 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m4, m2
+    movu        [r0 + 112], m4
+
+    ; mode 4 [calculate row 7]
+
+    pmaddubsw   m2, m1, [r3 + 8 * 16]
+    pmulhrsw    m2, m7
+
+    ; mode 4 [store row 6 and 7]
+
+    packuswb    m5, m2
+    movu        [r0 + 176], m5
+
+    ; mode 4 [row 2, 3]
+
+    palignr     m1, m0, 1
+    palignr     m2, m0, 2
+    palignr     m3, m0, 3
+
+    punpcklbw   m1, m2
+    pmaddubsw   m4, m1, [r3 + 31 * 16]
+    pmulhrsw    m4, m7
+
+    punpcklbw   m2, m3
+    pmaddubsw   m5, m2, [r3 + 20 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 144], m4
+
+    ; mode 5 [row 2, 3]
+
+    pmaddubsw   m4, m1, [r3 + 19 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m2, [r3 + 4 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 208], m4
+
+    ; mode 7 [row 6, 7]
+
+    pmaddubsw   m4, m1, [r3 + 31 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m2, [r3 + 8 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 368], m4
+
+    ; mode 10
+
+    pshufb      m1, m0, [tab_Si]
+    movu        [r0 + 512], m1
+    movu        [r0 + 528], m1
+    movu        [r0 + 544], m1
+    movu        [r0 + 560], m1
+
+    pxor        m0, m0
+
+    pshufb      m1, m1, m0
+    punpcklbw   m1, m0
+
+    movu        m2, [r1]
+
+    pshufb      m3, m2, m0
+    punpcklbw   m3, m0
+
+    psrldq      m4, m2, 1
+    punpcklbw   m4, m0
+
+    movu        m2, [r1 + 9]
+    punpcklbw   m2, m0
+
+    psubw       m4, m3
+    psubw       m2, m3
+
+    psraw       m4, 1
+    psraw       m2, 1
+
+    paddw       m4, m1
+    paddw       m2, m1
+
+    packuswb    m4, m2
+
+    ; overwrite the first byte of each 8-byte row with the adjusted value
+    pextrb      [r0 + 512], m4, 0
+    pextrb      [r0 + 520], m4, 1
+    pextrb      [r0 + 528], m4, 2
+    pextrb      [r0 + 536], m4, 3
+    pextrb      [r0 + 544], m4, 4
+    pextrb      [r0 + 552], m4, 5
+    pextrb      [r0 + 560], m4, 6
+    pextrb      [r0 + 568], m4, 7
+
+    ; mode 11 [row 0, 1]
+
+    movu        m0, [r1 + 16]
+    pinsrb      m0, [r1], 0
+    palignr     m1, m0, 1
+    punpcklbw   m2, m0, m1
+
+    pmaddubsw   m3, m2, [r3 + 30 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 28 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 576], m3
+
+    ; mode 11 [row 2, 3]
+
+    pmaddubsw   m3, m2, [r3 + 26 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 24 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 592], m3
+
+    ; mode 11 [row 4, 5]
+
+    pmaddubsw   m3, m2, [r3 + 22 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 20 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m5, m3, m4
+    movu        [r0 + 608], m5
+
+    ; mode 12 [row 0, 1]
+
+    pmaddubsw   m4, m2, [r3 + 27 * 16]
+    pmulhrsw    m4, m7
+
+    ; m3 is the unpacked mode 11 row 4 result, reused as the second row here
+    packuswb    m4, m3
+    movu        [r0 + 640], m4
+
+    ; mode 11 [row 6, 7]
+
+    pmaddubsw   m3, m2, [r3 + 18 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 16 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 624], m3
+
+    ; mode 12 [row 2, 3]
+
+    pmaddubsw   m3, m2, [r3 + 17 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 12 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 656], m3
+
+    ; mode 12 [row 4, 5]
+
+    pmaddubsw   m3, m2, [r3 + 7 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 2 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 672], m3
+
+    ; mode 12 [row 6, 7]
+
+    pslldq      m3, m2, 2
+    pinsrb      m3, [r1 + 0], 1
+    pinsrb      m3, [r1 + 6], 0
+
+    pmaddubsw   m4, m3, [r3 + 29 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 24 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 688], m4
+
+    ; mode 13 [row 0, 1]
+
+    pmaddubsw   m4, m2, [r3 + 23 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m2, [r3 + 14 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 704], m4
+
+    ; mode 13 [row 2, 3]
+
+    pmaddubsw   m4, m2, [r3 + 5 * 16]
+    pmulhrsw    m4, m7
+
+    pinsrb      m3, [r1 + 4], 0
+    pmaddubsw   m5, m3, [r3 + 28 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 720], m4
+
+    ; mode 13 [row 4, 5]
+
+    pmaddubsw   m4, m3, [r3 + 19 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 10 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 736], m4
+
+    ; mode 13 [row 6, 7]
+
+    pmaddubsw   m4, m3, [r3 + 1 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m5, m3, 2
+    pinsrb      m5, [r1 + 4], 1
+    pinsrb      m5, [r1 + 7], 0
+
+    pmaddubsw   m5, [r3 + 24 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 752], m4
+
+    ; mode 14 [row 0, 1]
+
+    pmaddubsw   m4, m2, [r3 + 19 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m2, [r3 + 6 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 768], m4
+
+    ; mode 14 [row 2, 3]
+
+    pinsrb      m3, [r1 + 2], 0
+
+    pmaddubsw   m4, m3, [r3 + 25 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 12 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 784], m4
+
+    ; mode 14 [row 4, 5]
+
+    pslldq      m1, m3, 2
+    pinsrb      m1, [r1 + 2], 1
+    pinsrb      m1, [r1 + 5], 0
+
+    pmaddubsw   m4, m1, [r3 + 31 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m1, [r3 + 18 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 800], m4
+
+    ; mode 14 [row 6, 7]
+
+    pmaddubsw   m4, m1, [r3 + 5 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m1, 2
+    pinsrb      m1, [r1 + 5], 1
+    pinsrb      m1, [r1 + 7], 0
+
+    pmaddubsw   m5, m1, [r3 + 24 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 816], m4
+
+    ; mode 15 [row 0, 1]
+
+    pmaddubsw   m4, m2, [r3 + 15 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 30 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 832], m4
+
+    ; mode 15 [row 2, 3]
+
+    pmaddubsw   m4, m3, [r3 + 13 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m1, m3, 2
+    pinsrb      m1, [r1 + 2], 1
+    pinsrb      m1, [r1 + 4], 0
+
+    pmaddubsw   m5, m1, [r3 + 28 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 848], m4
+
+    ; mode 15 [row 4, 5]
+
+    pmaddubsw   m4, m1, [r3 + 11 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m1, 2
+    pinsrb      m1, [r1 + 4], 1
+    pinsrb      m1, [r1 + 6], 0
+
+    pmaddubsw   m5, m1, [r3 + 26 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 864], m4
+
+    ; mode 15 [row 6, 7]
+
+    pmaddubsw   m4, m1, [r3 + 9 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m1, 2
+    pinsrb      m1, [r1 + 6], 1
+    pinsrb      m1, [r1 + 8], 0
+
+    pmaddubsw   m1, [r3 + 24 * 16]
+    pmulhrsw    m1, m7
+
+    packuswb    m4, m1
+    movu        [r0 + 880], m4
+
+    ; mode 16 [row 0, 1]
+
+    pmaddubsw   m4, m2, [r3 + 11 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 22 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 896], m4
+
+    ; mode 16 [row 2, 3]
+
+    pmaddubsw   m4, m3, [r3 + 1 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m3, 2
+    pinsrb      m3, [r1 + 2], 1
+    pinsrb      m3, [r1 + 3], 0
+
+    pmaddubsw   m5, m3, [r3 + 12 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 912], m4
+
+    ; mode 16 [row 4, 5]
+
+    pslldq      m3, 2
+    pinsrb      m3, [r1 + 3], 1
+    pinsrb      m3, [r1 + 5], 0
+
+    pmaddubsw   m4, m3, [r3 + 23 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m3, [r3 + 2 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 928], m4
+
+    ; mode 16 [row 6, 7]
+
+    pslldq      m3, 2
+    pinsrb      m3, [r1 + 5], 1
+    pinsrb      m3, [r1 + 6], 0
+
+    pmaddubsw   m4, m3, [r3 + 13 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m3, 2
+    pinsrb      m3, [r1 + 6], 1
+    pinsrb      m3, [r1 + 8], 0
+
+    pmaddubsw   m3, [r3 + 24 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m4, m3
+    movu        [r0 + 944], m4
+
+    ; mode 17 [row 0, 1]
+
+    pmaddubsw   m4, m2, [r3 + 6 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 0], 1
+    pinsrb      m2, [r1 + 1], 0
+
+    pmaddubsw   m3, m2, [r3 + 12 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m4, m3
+    movu        [r0 + 960], m4
+
+    ; mode 17 [row 2, 3]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 1], 1
+    pinsrb      m2, [r1 + 2], 0
+
+    pmaddubsw   m4, m2, [r3 + 18 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 2], 1
+    pinsrb      m2, [r1 + 4], 0
+
+    pmaddubsw   m3, m2, [r3 + 24 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m4, m3
+    movu        [r0 + 976], m4
+
+    ; mode 17 [row 4, 5]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 4], 1
+    pinsrb      m2, [r1 + 5], 0
+
+    pmaddubsw   m4, m2, [r3 + 30 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m3, m2, [r3 + 4 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m4, m3
+    movu        [r0 + 992], m4
+
+    ; mode 17 [row 6, 7]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 5], 1
+    pinsrb      m2, [r1 + 6], 0
+
+    pmaddubsw   m4, m2, [r3 + 10 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 6], 1
+    pinsrb      m2, [r1 + 7], 0
+
+    pmaddubsw   m3, m2, [r3 + 16 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m4, m3
+    movu        [r0 + 1008], m4
+
+    ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7]
+
+    movh        m1, [r2]
+
+    pslldq      m2, m1, 1
+    pinsrb      m2, [r2 + 1 + 16], 0
+    punpcklqdq  m1, m2
+    movu        [r0 + 1024], m1
+
+    pslldq      m2, 1
+    pinsrb      m2, [r2 + 2 + 16], 0
+
+    pslldq      m0, m2, 1
+    pinsrb      m0, [r2 + 3 + 16], 0
+    punpcklqdq  m2, m0
+    movu        [r0 + 1040], m2
+
+    pslldq      m0, 1
+    pinsrb      m0, [r2 + 4 + 16], 0
+
+    pslldq      m2, m0, 1
+    pinsrb      m2, [r2 + 5 + 16], 0
+    punpcklqdq  m0, m2
+    movu        [r0 + 1056], m0
+
+    pslldq      m2, 1
+    pinsrb      m2, [r2 + 6 + 16], 0
+
+    pslldq      m0, m2, 1
+    pinsrb      m0, [r2 + 7 + 16], 0
+    punpcklqdq  m2, m0
+    movu        [r0 + 1072], m2
+
+    ; mode 19 [row 0, 1]
+
+    movu        m0, [r1]
+    palignr     m1, m0, 1
+    punpcklbw   m0, m1
+
+    pmaddubsw   m1, m0, [r3 + 6 * 16]
+    pmulhrsw    m1, m7
+
+    pslldq      m2, m0, 2
+    pinsrb      m2, [r1], 1
+    pinsrb      m2, [r1 + 1 + 16], 0
+
+    pmaddubsw   m3, m2, [r3 + 12 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m1, m3
+    movu        [r0 + 1088], m1
+
+    ; mode 19 [row 2, 3]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 1 + 16], 1
+    pinsrb      m2, [r1 + 2 + 16], 0
+
+    pmaddubsw   m4, m2, [r3 + 18 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 2 + 16], 1
+    pinsrb      m2, [r1 + 4 + 16], 0
+
+    pmaddubsw   m5, m2, [r3 + 24 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 1104], m4
+
+    ; mode 19 [row 4, 5]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 4 + 16], 1
+    pinsrb      m2, [r1 + 5 + 16], 0
+
+    pmaddubsw   m4, m2, [r3 + 30 * 16]
+    pmulhrsw    m4, m7
+
+    pmaddubsw   m5, m2, [r3 + 4 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m4, m5
+    movu        [r0 + 1120], m4
+
+    ; mode 19 [row 6, 7]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 5 + 16], 1
+    pinsrb      m2, [r1 + 6 + 16], 0
+
+    pmaddubsw   m4, m2, [r3 + 10 * 16]
+    pmulhrsw    m4, m7
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 6 + 16], 1
+    pinsrb      m2, [r1 + 7 + 16], 0
+
+    pmaddubsw   m2, [r3 + 16 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m4, m2
+    movu        [r0 + 1136], m4
+
+    ; mode 20 [row 0, 1]
+
+    pmaddubsw   m3, m0, [r3 + 11 * 16]
+    pmulhrsw    m3, m7
+
+    pslldq      m1, m0, 2
+    pinsrb      m1, [r1 + 0], 1
+    pinsrb      m1, [r1 + 2 + 16], 0
+
+    pmaddubsw   m4, m1, [r3 + 22 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 1152], m3
+
+    ; mode 20 [row 2, 3]
+
+    pmaddubsw   m3, m1, [r3 + 1 * 16]
+    pmulhrsw    m3, m7
+
+    pslldq      m2, m1, 2
+    pinsrb      m2, [r1 + 2 + 16], 1
+    pinsrb      m2, [r1 + 3 + 16], 0
+
+    pmaddubsw   m4, m2, [r3 + 12 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 1168], m3
+
+    ; mode 20 [row 4, 5]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 3 + 16], 1
+    pinsrb      m2, [r1 + 5 + 16], 0
+
+    pmaddubsw   m3, m2, [r3 + 23 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m4, m2, [r3 + 2 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 1184], m3
+
+    ; mode 20 [row 6, 7]
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 5 + 16], 1
+    pinsrb      m2, [r1 + 6 + 16], 0
+
+    pmaddubsw   m3, m2, [r3 + 13 * 16]
+    pmulhrsw    m3, m7
+
+    pslldq      m2, 2
+    pinsrb      m2, [r1 + 6 + 16], 1
+    pinsrb      m2, [r1 + 8 + 16], 0
+
+    pmaddubsw   m4, m2, [r3 + 24 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m3, m4
+    movu        [r0 + 1200], m3
+
+    ; mode 21 [row 0, 1]
+
+    pmaddubsw   m2, m0, [r3 + 15 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m1, [r3 + 30 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1216], m2
+
+    ; mode 21 [row 2, 3]
+
+    pmaddubsw   m2, m1, [r3 + 13 * 16]
+    pmulhrsw    m2, m7
+
+    pslldq      m3, m1, 2
+    pinsrb      m3, [r1 + 2 + 16], 1
+    pinsrb      m3, [r1 + 4 + 16], 0
+
+    pmaddubsw   m4, m3, [r3 + 28 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m2, m4
+    movu        [r0 + 1232], m2
+
+    ; mode 21 [row 4, 5]
+
+    pmaddubsw   m2, m3, [r3 + 11 * 16]
+    pmulhrsw    m2, m7
+
+    pslldq      m3, 2
+    pinsrb      m3, [r1 + 4 + 16], 1
+    pinsrb      m3, [r1 + 6 + 16], 0
+
+    pmaddubsw   m4, m3, [r3 + 26 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m2, m4
+    movu        [r0 + 1248], m2
+
+    ; mode 21 [row 6, 7]
+
+    pmaddubsw   m2, m3, [r3 + 9 * 16]
+    pmulhrsw    m2, m7
+
+    pslldq      m3, 2
+    pinsrb      m3, [r1 + 6 + 16], 1
+    pinsrb      m3, [r1 + 8 + 16], 0
+
+    pmaddubsw   m4, m3, [r3 + 24 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m2, m4
+    movu        [r0 + 1264], m2
+
+    ; mode 22 [row 0, 1]
+
+    pmaddubsw   m2, m0, [r3 + 19 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m4, m0, [r3 + 6 * 16]
+    pmulhrsw    m4, m7
+
+    packuswb    m2, m4
+    movu        [r0 + 1280], m2
+
+    ; mode 22 [row 2, 3]
+
+    pmaddubsw   m2, m1, [r3 + 25 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m1, [r3 + 12 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1296], m2
+
+    ; mode 22 [row 4, 5]
+
+    pslldq      m1, 2
+    pinsrb      m1, [r1 + 5 + 16], 0
+    pinsrb      m1, [r1 + 2 + 16], 1
+
+    pmaddubsw   m2, m1, [r3 + 31 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m1, [r3 + 18 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1312], m2
+
+    ; mode 22 [row 6, 7]
+
+    pmaddubsw   m2, m1, [r3 + 5 * 16]
+    pmulhrsw    m2, m7
+
+    pslldq      m1, 2
+    pinsrb      m1, [r1 + 5 + 16], 1
+    pinsrb      m1, [r1 + 7 + 16], 0
+
+    pmaddubsw   m1, [r3 + 24 * 16]
+    pmulhrsw    m1, m7
+
+    packuswb    m2, m1
+    movu        [r0 + 1328], m2
+
+    ; mode 23 [row 0, 1]
+
+    pmaddubsw   m2, m0, [r3 + 23 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m0, [r3 + 14 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1344], m2
+
+    ; mode 23 [row 2, 3]
+
+    pmaddubsw   m2, m0, [r3 + 5 * 16]
+    pmulhrsw    m2, m7
+
+    pslldq      m1, m0, 2
+    pinsrb      m1, [r1], 1
+    pinsrb      m1, [r1 + 4 + 16], 0
+
+    pmaddubsw   m3, m1, [r3 + 28 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1360], m2
+
+    ; mode 23 [row 4, 5]
+
+    pmaddubsw   m2, m1, [r3 + 19 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m1, [r3 + 10 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1376], m2
+
+    ; mode 23 [row 6, 7]
+
+    pmaddubsw   m2, m1, [r3 + 1 * 16]
+    pmulhrsw    m2, m7
+
+    pslldq      m3, m1, 2
+    pinsrb      m3, [r1 + 4 + 16], 1
+    pinsrb      m3, [r1 + 7 + 16], 0
+
+    pmaddubsw   m3, [r3 + 24 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1392], m2
+
+    ; mode 24 [row 0, 1]
+
+    pmaddubsw   m2, m0, [r3 + 27 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m5, m0, [r3 + 22 * 16]
+    pmulhrsw    m5, m7
+
+    packuswb    m2, m5
+    movu        [r0 + 1408], m2
+
+    ; mode 24 [row 2, 3]
+
+    pmaddubsw   m2, m0, [r3 + 17 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m0, [r3 + 12 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1424], m2
+
+    ; mode 24 [row 4, 5]
+
+    pmaddubsw   m2, m0, [r3 + 7 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m0, [r3 + 2 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1440], m2
+
+    ; mode 24 [row 6, 7]
+
+    pinsrb      m1, [r1 + 6 + 16], 0
+
+    pmaddubsw   m2, m1, [r3 + 29 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m1, [r3 + 24 * 16]
+    pmulhrsw    m1, m7
+
+    packuswb    m2, m1
+    movu        [r0 + 1456], m2
+
+    ; mode 25 [row 0, 1]
+
+    pmaddubsw   m2, m0, [r3 + 30 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m1, m0, [r3 + 28 * 16]
+    pmulhrsw    m1, m7
+
+    packuswb    m2, m1
+    movu        [r0 + 1472], m2
+
+    ; mode 25 [row 2, 3]
+
+    pmaddubsw   m2, m0, [r3 + 26 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m1, m0, [r3 + 24 * 16]
+    pmulhrsw    m1, m7
+
+    packuswb    m2, m1
+    movu        [r0 + 1488], m2
+
+    ; mode 25 [row 4, 5]
+
+    pmaddubsw   m1, m0, [r3 + 20 * 16]
+    pmulhrsw    m1, m7
+
+    ; m5 is the unpacked mode 24 row 1 result, reused as the first row here
+    packuswb    m5, m1
+    movu        [r0 + 1504], m5
+
+    ; mode 25 [row 6, 7]
+
+    pmaddubsw   m2, m0, [r3 + 18 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m1, m0, [r3 + 16 * 16]
+    pmulhrsw    m1, m7
+
+    packuswb    m2, m1
+    movu        [r0 + 1520], m2
+
+    ; mode 26
+
+    movu        m0, [r1 + 1]
+
+    pshufb      m1, m0, [tab_Si]
+    movu        [r0 + 1536], m1
+    movu        [r0 + 1552], m1
+    movu        [r0 + 1568], m1
+    movu        [r0 + 1584], m1
+
+    pxor        m5, m5
+
+    pshufb      m1, m1, m5
+    punpcklbw   m1, m5
+
+    movu        m2, [r1 + 16]
+    pinsrb      m2, [r1], 0
+
+    pshufb      m3, m2, m5
+    punpcklbw   m3, m5
+
+    psrldq      m4, m2, 1
+    punpcklbw   m4, m5
+
+    movu        m2, [r1 + 9 + 16]
+    punpcklbw   m2, m5
+
+    psubw       m4, m3
+    psubw       m2, m3
+
+    psraw       m4, 1
+    psraw       m2, 1
+
+    paddw       m4, m1
+    paddw       m2, m1
+
+    packuswb    m4, m2
+
+    ; overwrite the first byte of each 8-byte row with the adjusted value
+    pextrb      [r0 + 1536], m4, 0
+    pextrb      [r0 + 1544], m4, 1
+    pextrb      [r0 + 1552], m4, 2
+    pextrb      [r0 + 1560], m4, 3
+    pextrb      [r0 + 1568], m4, 4
+    pextrb      [r0 + 1576], m4, 5
+    pextrb      [r0 + 1584], m4, 6
+    pextrb      [r0 + 1592], m4, 7
+
+    ; mode 27 [row 0, 1]
+
+    palignr     m6, m0, 1
+    punpcklbw   m4, m0, m6
+
+    pmaddubsw   m1, m4, [r3 + 2 * 16]
+    pmulhrsw    m1, m7
+
+    pmaddubsw   m2, m4, [r3 + 4 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m1, m2
+    movu        [r0 + 1600], m1
+
+    ; mode 27 [row 2, 3]
+
+    pmaddubsw   m1, m4, [r3 + 6 * 16]
+    pmulhrsw    m1, m7
+
+    pmaddubsw   m2, m4, [r3 + 8 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m1, m2
+    movu        [r0 + 1616], m1
+
+    ; mode 27 [row 4, 5]
+
+    pmaddubsw   m3, m4, [r3 + 10 * 16]
+    pmulhrsw    m3, m7
+
+    pmaddubsw   m2, m4, [r3 + 12 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m1, m3, m2
+    movu        [r0 + 1632], m1
+
+    ; mode 27 [row 6, 7]
+
+    pmaddubsw   m1, m4, [r3 + 14 * 16]
+    pmulhrsw    m1, m7
+
+    pmaddubsw   m2, m4, [r3 + 16 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m1, m2
+    movu        [r0 + 1648], m1
+
+    ; mode 28 [row 0, 1]
+
+    pmaddubsw   m1, m4, [r3 + 5 * 16]
+    pmulhrsw    m1, m7
+
+    ; m3 is the unpacked mode 27 row 4 result, reused as the second row here
+    packuswb    m1, m3
+    movu        [r0 + 1664], m1
+
+    ; mode 28 [row 2, 3]
+
+    pmaddubsw   m1, m4, [r3 + 15 * 16]
+    pmulhrsw    m1, m7
+
+    pmaddubsw   m2, m4, [r3 + 20 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m1, m2
+    movu        [r0 + 1680], m1
+
+    ; mode 28 [row 4, 5]
+
+    pmaddubsw   m1, m4, [r3 + 25 * 16]
+    pmulhrsw    m1, m7
+
+    pmaddubsw   m2, m4, [r3 + 30 * 16]
+    pmulhrsw    m2, m7
+
+    packuswb    m1, m2
+    movu        [r0 + 1696], m1
+
+    ; mode 28 [row 6, 7]
+
+    palignr     m1, m0, 2
+    punpcklbw   m5, m6, m1
+
+    pmaddubsw   m2, m5, [r3 + 3 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m5, [r3 + 8 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1712], m2
+
+    ; mode 29 [row 0, 1]
+
+    pmaddubsw   m2, m4, [r3 + 9 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m4, [r3 + 18 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1728], m2
+
+    ; mode 29 [row 2, 3]
+
+    pmaddubsw   m2, m4, [r3 + 27 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m5, [r3 + 4 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1744], m2
+
+    ; mode 29 [row 4, 5]
+
+    pmaddubsw   m2, m5, [r3 + 13 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m5, [r3 + 22 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1760], m2
+
+    ; mode 29 [row 6, 7]
+
+    pmaddubsw   m2, m5, [r3 + 31 * 16]
+    pmulhrsw    m2, m7
+
+    palignr     m6, m0, 3
+    punpcklbw   m1, m6
+
+    pmaddubsw   m3, m1, [r3 + 8 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1776], m2
+
+    ; mode 32 [row 2]
+
+    movh        [r0 + 1936], m2
+
+    ; mode 30 [row 0, 1]
+
+    pmaddubsw   m2, m4, [r3 + 13 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m4, [r3 + 26 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1792], m2
+
+    ; mode 30 [row 2, 3]
+
+    pmaddubsw   m2, m5, [r3 + 7 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m5, [r3 + 20 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1808], m2
+
+    ; mode 33 [row 1]
+
+    movhps      [r0 + 1992], m2
+
+    ; mode 30 [row 4, 5]
+
+    pmaddubsw   m2, m1, [r3 + 1 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m1, [r3 + 14 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1824], m2
+
+    ; mode 33 [row 2]
+
+    movhps      [r0 + 2000], m2
+
+    ; mode 30 [row 6, 7]
+
+    pmaddubsw   m2, m1, [r3 + 27 * 16]
+    pmulhrsw    m2, m7
+
+    psrldq      m0, 4
+    punpcklbw   m6, m0
+
+    pmaddubsw   m3, m6, [r3 + 8 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1840], m2
+
+    ; mode 33 [row 3]
+
+    movhps      [r0 + 2008], m2
+
+    ; mode 31 [row 0, 1]
+
+    pmaddubsw   m2, m4, [r3 + 17 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m5, [r3 + 2 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1856], m2
+
+    ; mode 31 [row 2, 3]
+
+    pmaddubsw   m2, m5, [r3 + 19 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m1, [r3 + 4 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1872], m2
+
+    ; mode 31 [row 4, 5]
+
+    pmaddubsw   m2, m1, [r3 + 21 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m6, [r3 + 6 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1888], m2
+
+    ; mode 31 [row 6, 7]
+
+    pmaddubsw   m2, m6, [r3 + 23 * 16]
+    pmulhrsw    m2, m7
+
+    movu        m3, [r1 + 6]
+    punpcklbw   m0, m3
+
+    pmaddubsw   m3, m0, [r3 + 8 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1904], m2
+
+    ; mode 32 [row 0, 1]
+
+    pmaddubsw   m2, m4, [r3 + 21 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m5, [r3 + 10 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1920], m2
+
+    ; mode 32 [row 3]
+
+    pmaddubsw   m2, m1, [r3 + 20 * 16]
+    pmulhrsw    m2, m7
+
+    pxor        m3, m3
+
+    packuswb    m2, m3
+    movh        [r0 + 1944], m2
+
+    ; mode 32 [row 4, 5]
+
+    pmaddubsw   m2, m6, [r3 + 9 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m6, [r3 + 30 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1952], m2
+
+    ; mode 33 [row 4, 5]
+
+    pmaddubsw   m2, m0, [r3 + 2 * 16]
+    pmulhrsw    m2, m7
+
+    pmaddubsw   m3, m0, [r3 + 28 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 2016], m2
+
+    ; mode 32 [row 6]
+
+    pmaddubsw   m2, m0, [r3 + 19 * 16]
+    pmulhrsw    m2, m7
+
+    ; mode 32 [row 7]
+
+    movu        m0, [r1 + 6]
+    palignr     m3, m0, 1
+    punpcklbw   m0, m3
+
+    pmaddubsw   m3, m0, [r3 + 8 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 1968], m2
+
+    ; mode 33 [row 6, 7]
+
+    pmaddubsw   m2, m0, [r3 + 22 * 16]
+    pmulhrsw    m2, m7
+
+    movu        m0, [r1 + 7]
+    palignr     m3, m0, 1
+    punpcklbw   m0, m3
+
+    pmaddubsw   m3, m0, [r3 + 16 * 16]
+    pmulhrsw    m3, m7
+
+    packuswb    m2, m3
+    movu        [r0 + 2032], m2
+
+    ; mode 33 [row 0]
+
+    pmaddubsw   m2, m4, [r3 + 26 * 16]
+    pmulhrsw    m2, m7
+
+    pxor        m3, m3
+
+    packuswb    m2, m3
+    movh        [r0 + 1984], m2
+
+    ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7]
+
+    movu        m0, [r2 + 2]
+    palignr     m1, m0, 1
+    punpcklqdq  m2, m0, m1
+    movu        [r0 + 2048], m2
+
+    palignr     m1, m0, 2
+    palignr     m2, m0, 3
+    punpcklqdq  m1, m2
+    movu        [r0 + 2064], m1
+
+    palignr     m1, m0, 4
+    palignr     m2, m0, 5
+    punpcklqdq  m1, m2
+    movu        [r0 + 2080], m1
+
+    palignr     m1, m0, 6
+    palignr     m2, m0, 7
+    punpcklqdq  m1, m2
+    movu        [r0 + 2096], m1
+RET
+
+;--------------------------------------------------------------------------------
+; void all_angs_pred_16x16(pixel *dest, pixel *refPix, pixel *filtPix, int
bLuma) +;-------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal all_angs_pred_16x16, 3,4,8 + ; mode 2 + + movu m0, [r2 + 2 + 32] + movu [r0 + 0 * 16], m0 + + movu m1, m0 + + movu m6, [r2 + 18 + 32] + palignr m5, m6, m0, 1 + movu [r0 + 1 * 16], m5 + + movu m4, m5 + + palignr m5, m6, m0, 2 + movu [r0 + 2 * 16], m5 + palignr m5, m6, m0, 3 + movu [r0 + 3 * 16], m5 + palignr m5, m6, m0, 4 + movu [r0 + 4 * 16], m5 + palignr m5, m6, m0, 5 + movu [r0 + 5 * 16], m5 + palignr m5, m6, m0, 6 + movu [r0 + 6 * 16], m5 + palignr m5, m6, m0, 7 + movu [r0 + 7 * 16], m5 + + movu m7, m5 + + palignr m5, m6, m0, 8 + movu [r0 + 8 * 16], m5 + + movu m2, m5 + + palignr m5, m6, m0, 9 + movu [r0 + 9 * 16], m5 + + palignr m3, m6, m0, 10 + movu [r0 + 10 * 16], m3 + palignr m3, m6, m0, 11 + movu [r0 + 11 * 16], m3 + palignr m3, m6, m0, 12 + movu [r0 + 12 * 16], m3 + + ; mode 3 [row 15] + movu [r0 + (3-2)*16*16 + 15 * 16], m3 + + palignr m3, m6, m0, 13 + movu [r0 + 13 * 16], m3 + palignr m3, m6, m0, 14 + movu [r0 + 14 * 16], m3 + palignr m3, m6, m0, 15 + movu [r0 + 15 * 16], m3 + + ; mode 3 [row 0] + lea r3, [ang_table] + movu m3, [pw_1024] + movu m0, [r2 + 1 + 32] + punpcklbw m0, m1 + + ; mode 17 [row 8 - second half] + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 248 * 16 + 8], m1 + ; mode 17 [row 8 - second half] end + + pmaddubsw m1, m0, [r3 + 26 * 16] + pmulhrsw m1, m3 + punpcklbw m7, m2 + pmaddubsw m2, m7, [r3 + 26 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 16 * 16], m1 + + ;mode 6 [row 1] + movu [r0 + 65 * 16], m1 + + ; mode 4 [row 0] + pmaddubsw m1, m0, [r3 + 21 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 21 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 32 * 16], m1 + + ; mode 5 [row 0] + pmaddubsw m1, m0, [r3 + 17 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 17 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 48 * 16], m1 + + ; mode 6 [row 0] + pmaddubsw m1, m0, 
[r3 + 13 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 13 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 64 * 16], m1 + + ; mode 7 [row 0] + pmaddubsw m1, m0, [r3 + 9 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 9 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 80 * 16], m1 + + ; mode 7 [row 1] + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 18 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 81 * 16], m1 + + ; mode 7 [row 2] + pmaddubsw m1, m0, [r3 + 27 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 27 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 82 * 16], m1 + + ; mode 8 [row 0] + pmaddubsw m1, m0, [r3 + 5 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 5 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 96 * 16], m1 + + ; mode 8 [row 1] + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 10 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 97 * 16], m1 + + ; mode 8 [row 2] + pmaddubsw m1, m0, [r3 + 15 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 15 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 98 * 16], m1 + + ; mode 8 [row 3] + pmaddubsw m1, m0, [r3 + 20 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 20 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 99 * 16], m1 + + ; mode 8 [row 4] + pmaddubsw m1, m0, [r3 + 25 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 25 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 100 * 16], m1 + + ; mode 8 [row 5] + pmaddubsw m1, m0, [r3 + 30 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 30 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 101 * 16], m1 + + ; mode 15 [row 13 - second half] + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 221 * 16 + 8], m1 + ; mode 15 [row 13 - second half] end + + ; mode 15 [row 14 - second half] + pmaddubsw m1, m0, [r3 + 1 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 222 * 16 + 8], m1 + ; mode 15 [row 
14 - second half] end + + ; mode 16 [row 10 - second half] + pmaddubsw m1, m0, [r3 + 25 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 234 * 16 + 8], m1 + ; mode 16 [row 10 - second half] end + + ; mode 16 [row 11 - second half] + pmaddubsw m1, m0, [r3 + 4 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 235 * 16 + 8], m1 + ; mode 16 [row 11 - second half] end + + ; mode 3 [row 1] + movu m6, [r3 + 20 * 16] + movu m0, [r2 + 2 + 32] + punpcklbw m0, m4 + + ; mode 17 [row 7 - second half] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 247 * 16 + 8], m1 + + ; mode 17 [row 7 - second half] end + pmaddubsw m1, m0, m6 + pmulhrsw m1, m3 + movu m2, [r2 + 10 + 32] + punpcklbw m2, m5 + pmaddubsw m4, m2, m6 + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 17 * 16], m1 + + ;mode 6 [row 3] + movu [r0 + 67 * 16], m1 + + ; mode 4 row [row 1] + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 33 * 16], m1 + + ; mode 4 row [row 2] + pmaddubsw m1, m0, [r3 + 31 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 31 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 34 * 16], m1 + + ; mode 7 [row 6] + movu [r0 + 86 * 16], m1 + + ; mode 5 row [row 1] + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 2 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 49 * 16], m1 + + ; mode 5 row [row 2] + pmaddubsw m1, m0, [r3 + 19 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 50 * 16], m1 + + ; mode 6 [row 2] + pmaddubsw m1, m0, [r3 + 7 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 7 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 66 * 16], m1 + + ; mode 7 [row 3] + pmaddubsw m1, m0, [r3 + 4 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 83 * 16], m1 + + ; mode 7 [row 4] + pmaddubsw m1, m0, [r3 + 13 * 
16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 13 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 84 * 16], m1 + + ; mode 8 [row 8] + movu [r0 + 104 * 16], m1 + + ; mode 7 [row 5] + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 22 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 85 * 16], m1 + + ; mode 8 [row 6] + pmaddubsw m1, m0, [r3 + 3 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 102 * 16], m1 + + ; mode 8 [row 7] + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 8 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 103 * 16], m1 + + ; mode 8 [row 9] + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 105 * 16], m1 + + ; mode 8 [row 10] + pmaddubsw m1, m0, [r3 + 23 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 23 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 106 * 16], m1 + + ; mode 8 [row 11] + pmaddubsw m1, m0, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 107 * 16], m1 + + ; mode 3 [row 2] + movu m0, [r2 + 3 + 32] + movd m1, [r2 + 19 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + + ; mode 17 [row 6 - second half] + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 246 * 16 + 8], m1 + ; mode 17 [row 6 - second half] end + + pmaddubsw m1, m0, [r3 + 14 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 11 + 32] + movd m4, [r2 + 27 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 18 * 16], m1 + + ; mode 6 [row 5] + movu [r0 + 69 * 16], m1 + + ; mode 4 row [row 3] + pmaddubsw m1, m0, [r3 + 20 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 20 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 35 * 16], m1 + + ; mode 5 row [row 3] + pmaddubsw m1, m0, [r3 + 
4 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 51 * 16], m1 + + ; mode 5 row [row 4] + pmaddubsw m1, m0, [r3 + 21 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 21 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 52 * 16], m1 + + ; mode 6 [row 4] + pmaddubsw m1, m0, [r3 + 1 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 68 * 16], m1 + + ; mode 6 [row 6] + pmaddubsw m1, m0, [r3 + 27 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 27 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 70 * 16], m1 + + ; mode 7 [row 7] + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 8 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 87 * 16], m1 + + ; mode 7 [row 8] + pmaddubsw m1, m0, [r3 + 17 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 17 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 88 * 16], m1 + + ; mode 7 [row 9] + pmaddubsw m1, m0, [r3 + 26 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 26 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 89 * 16], m1 + + ; mode 8 [row 12] + pmaddubsw m1, m0, [r3 + 1 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 108 * 16], m1 + + ; mode 8 [row 13] + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 6 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 109 * 16], m1 + + ; mode 8 [row 14] + pmaddubsw m1, m0, [r3 + 11 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 110 * 16], m1 + + ; mode 8 [row 15] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 111 * 16], m1 + + ; mode 3 [row 3] + movu m0, [r2 + 4 + 32] + movd m1, [r2 + 20 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + + ; mode 17 [row 4 - second half] + pmaddubsw m1, 
m0, [r3 + 30 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 244 * 16 + 8], m1 + ; mode 17 [row 4 - second half] end + + ; mode 17 [row 5 - second half] + pmaddubsw m1, m0, [r3 + 4 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 245 * 16 + 8], m1 + ; mode 17 [row 5 - second half] end + + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 12 + 32] + movd m4, [r2 + 28 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 8 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 19 * 16], m1 + + ; mode 6 [row 7] + movu [r0 + 71 * 16], m1 + + ; mode 4 row [row 4] + pmaddubsw m1, m0, [r3 + 9 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 36 * 16], m1 + + ; mode 4 row [row 5] + pmaddubsw m1, m0, [r3 + 30 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 37 * 16], m1 + + ; mode 7 row [row 13] + movu [r0 + 93 * 16], m1 + + ; mode 5 row [row 5] + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 6 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 53 * 16], m1 + + ; mode 5 row [row 6] + pmaddubsw m1, m0, [r3 + 23 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 23 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 54 * 16], m1 + + ; mode 6 [row 8] + pmaddubsw m1, m0, [r3 + 21 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 21 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 72 * 16], m1 + + ; mode 7 [row 12] + movu [r0 + 92 * 16], m1 + + ; mode 7 [row 10] + pmaddubsw m1, m0, [r3 + 3 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 90 * 16], m1 + + ; mode 7 [row 11] + pmaddubsw m1, m0, [r3 + 12 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 91 * 16], m1 + + ; mode 3 [row 4] + movu m0, [r2 + 5 + 32] + movd m1, [r2 + 20 + 32] + palignr m1, m0, 1 + punpcklbw 
m0, m1 + + ; mode 17 [row 3 - second half] + pmaddubsw m1, m0, [r3 + 24 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 243 * 16 + 8], m1 + + ; mode 17 [row 3 - second half] end + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 13 + 32] + movd m4, [r2 + 29 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 2 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 20 * 16], m1 + + ;mode 6 [row 9] + movu [r0 + 73 * 16], m1 + + ; mode 4 row [row 6] + movu m6, [r3 + 19 * 16] + pmaddubsw m1, m0, m6 + pmulhrsw m1, m3 + pmaddubsw m4, m2, m6 + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 38 * 16], m1 + + ; mode 3 [row 5] + pmaddubsw m1, m0, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 21 * 16], m1 + + ;mode 6 [row 11] + movu [r0 + 75 * 16], m1 + + ; mode 5 row [row 7] + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 8 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 55 * 16], m1 + + ; mode 5 row [row 8] + pmaddubsw m1, m0, [r3 + 25 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 56 * 16], m1 + + ; mode 6 [row 10] + pmaddubsw m1, m0, [r3 + 15 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 15 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 74 * 16], m1 + + ; mode 7 [row 14] + pmaddubsw m1, m0, [r3 + 7 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 7 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 94 * 16], m1 + + ; mode 7 [row 15] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 95 * 16], m1 + + ; mode 3 [row 6] + movu m0, [r2 + 6 + 32] + movd m1, [r2 + 22 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + + ; mode 17 [row 2 - second half] + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 242 * 16 + 8], m1 + ; mode 17 [row 2 - 
second half] end + + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 14 + 32] + movd m4, [r2 + 30 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 22 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 22 * 16], m1 + + ; mode 6 [row 13] + movu [r0 + 77 * 16], m1 + + ; mode 4 row [row 7] + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 8 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 39 * 16], m1 + + ; mode 4 row [row 8] + pmaddubsw m1, m0, [r3 + 29 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 40 * 16], m1 + + ; mode 5 row [row 9] + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 57 * 16], m1 + + ; mode 5 row [row 10] + pmaddubsw m1, m0, [r3 + 27 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 27 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 58 * 16], m1 + + ; mode 6 [row 12] + pmaddubsw m1, m0, [r3 + 9 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 76 * 16], m1 + + ; mode 3 [row 7] + movu m0, [r2 + 7 + 32] + movd m1, [r2 + 27 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + + ; mode 17 [row 1 - second half] + pmaddubsw m1, m0, [r3 + 12 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 241 * 16 + 8], m1 + ; mode 17 [row 1 - second half] end + + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 15 + 32] + movd m4, [r2 + 25 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 23 * 16], m1 + + ; mode 6 [row 15] + movu [r0 + 79 * 16], m1 + + ; mode 4 row [row 9] + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 41 * 16], m1 + + ; mode 5 row [row 11] + pmaddubsw m1, m0, [r3 + 12 * 16] + 
pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 59 * 16], m1 + + ; mode 5 row [row 12] + pmaddubsw m1, m0, [r3 + 29 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 60 * 16], m1 + + ; mode 6 [row 14] + pmaddubsw m1, m0, [r3 + 3 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 78 * 16], m1 + + ; mode 3 [row 8] + movu m0, [r2 + 8 + 32] + movd m1, [r2 + 24 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 16 + 32] + psrldq m4, m2, 1 + pinsrb m4, [r2 + 32], 15 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 24 * 16], m1 + + ; mode 4 row [row 10] + pmaddubsw m1, m0, [r3 + 7 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 7 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 42 * 16], m1 + + ; mode 4 row [row 11] + pmaddubsw m1, m0, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 43 * 16], m1 + + ; mode 5 row [row 13] + pmaddubsw m1, m0, [r3 + 14 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 61 * 16], m1 + + ; mode 5 row [row 14] + pmaddubsw m1, m0, [r3 + 31 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 31 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 62 * 16], m1 + + ; mode 3 [row 9] + movu m0, [r2 + 9 + 32] + movd m1, [r2 + 16 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 4 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 17 + 32] + movd m4, [r2 + 33 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 25 * 16], m1 + + ; mode 4 row [row 12] + pmaddubsw m1, m0, [r3 + 17 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 17 * 16] + pmulhrsw m4, m3 + 
packuswb m1, m4 + movu [r0 + 44 * 16], m1 + + ; mode 3 [row 10] + pmaddubsw m1, m0, [r3 + 30 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 26 * 16], m1 + + ; mode 5 row [row 15] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 63 * 16], m1 + + ; mode 3 [row 11] + movu m0, [r2 + 10 + 32] + movd m1, [r2 + 26 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 24 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 18 + 32] + movd m4, [r2 + 34 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 27 * 16], m1 + + ; mode 4 row [row 13] + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 6 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 45 * 16], m1 + + ; mode 4 row [row 14] + pmaddubsw m1, m0, [r3 + 27 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 27 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 46 * 16], m1 + + ; mode 3 [row 12] + movu m0, [r2 + 11 + 32] + movd m1, [r2 + 27 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 19 + 32] + movd m4, [r2 + 35 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 28 * 16], m1 + + ; mode 4 row [row 15] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m2, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 47 * 16], m1 + + ; mode 3 [row 13] + movu m0, [r2 + 12 + 32] + movd m1, [r2 + 28 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 12 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 20 + 32] + movd m4, [r2 + 36 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 29 * 16], m1 + + ; mode 3 [row 14] 
+ movu m0, [r2 + 13 + 32] + movd m1, [r2 + 29 + 32] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + movu m2, [r2 + 21 + 32] + movd m4, [r2 + 37 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m2, [r3 + 6 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 30 * 16], m1 + + ; mode 9 + movu m0, [r1 + 1 + 32] + movd m1, [r1 + 17 + 32] + palignr m1, m0, 1 + + ; mode 9 [row 15] + movu [r0 + 127 * 16], m1 + + ; mode 9 [row 0] + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + movu m7, [r1 + 9 + 32] + movd m4, [r2 + 25 + 32] + palignr m2, m7, 1 + punpcklbw m7, m2 + pmaddubsw m2, m7, [r3 + 2 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 112 * 16], m1 + + ; mode 9 [row 1] + pmaddubsw m1, m0, [r3 + 4 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 4 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 113 * 16], m1 + + ; mode 9 [row 2] + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 6 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 114 * 16], m1 + + ; mode 9 [row 3] + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 8 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 115 * 16], m1 + + ; mode 9 [row 4] + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 10 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 116 * 16], m1 + + ; mode 9 [row 5] + pmaddubsw m1, m0, [r3 + 12 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 12 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 117 * 16], m1 + + ; mode 9 [row 6] + pmaddubsw m1, m0, [r3 + 14 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 14 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 118 * 16], m1 + + ; mode 9 [row 7] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 16 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 119 * 16], m1 + + ; mode 9 [row 8] + pmaddubsw m1, m0, [r3 + 18 * 16] + 
pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 18 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 120 * 16], m1 + + ; mode 9 [row 9] + pmaddubsw m1, m0, [r3 + 20 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 20 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 121 * 16], m1 + + ; mode 9 [row 10] + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 22 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 122 * 16], m1 + + ; mode 9 [row 11] + pmaddubsw m1, m0, [r3 + 24 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 24 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 123 * 16], m1 + + ; mode 9 [row 12] + pmaddubsw m1, m0, [r3 + 26 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 26 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 124 * 16], m1 + + ; mode 9 [row 13] + pmaddubsw m1, m0, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 28 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 125 * 16], m1 + + ; mode 9 [row 14] + pmaddubsw m1, m0, [r3 + 30 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 30 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 126 * 16], m1 + + ; mode 10 + movu m1, [r1 + 1 + 32] + movu [r0 + 128 * 16], m1 + movu [r0 + 129 * 16], m1 + movu [r0 + 130 * 16], m1 + movu [r0 + 131 * 16], m1 + movu [r0 + 132 * 16], m1 + movu [r0 + 133 * 16], m1 + movu [r0 + 134 * 16], m1 + movu [r0 + 135 * 16], m1 + movu [r0 + 136 * 16], m1 + movu [r0 + 137 * 16], m1 + movu [r0 + 138 * 16], m1 + movu [r0 + 139 * 16], m1 + movu [r0 + 140 * 16], m1 + movu [r0 + 141 * 16], m1 + movu [r0 + 142 * 16], m1 + movu [r0 + 143 * 16], m1 + + pxor m0, m0 + pshufb m1, m1, m0 + punpcklbw m1, m0 + pinsrb m2, [r1], 0 + pshufb m2, m2, m0 + punpcklbw m2, m0 + movu m4, [r1 + 1] + punpcklbw m5, m4, m0 + punpckhbw m4, m0 + psubw m5, m2 + psubw m4, m2 + psraw m5, 1 + psraw m4, 1 + paddw m5, m1 + paddw m4, m1 + packuswb m5, m4 + + pextrb [r0 + 128 * 16], m5, 0 + pextrb [r0 + 129 * 16], m5, 1 + pextrb [r0 + 130 * 16], 
m5, 2 + pextrb [r0 + 131 * 16], m5, 3 + pextrb [r0 + 132 * 16], m5, 4 + pextrb [r0 + 133 * 16], m5, 5 + pextrb [r0 + 134 * 16], m5, 6 + pextrb [r0 + 135 * 16], m5, 7 + pextrb [r0 + 136 * 16], m5, 8 + pextrb [r0 + 137 * 16], m5, 9 + pextrb [r0 + 138 * 16], m5, 10 + pextrb [r0 + 139 * 16], m5, 11 + pextrb [r0 + 140 * 16], m5, 12 + pextrb [r0 + 141 * 16], m5, 13 + pextrb [r0 + 142 * 16], m5, 14 + pextrb [r0 + 143 * 16], m5, 15 + + ; mode 11 + movu m0, [r1 + 32] + pinsrb m0, [r1], 0 + + ; mode 11 [row 15] + movu [r0 + 159 * 16], m0 + + ; mode 11 [row 0] + movu m1, [r1 + 1 + 32] + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 30 * 16] + pmulhrsw m1, m3 + movu m7, [r1 + 8 + 32] + movu m2, [r1 + 9 + 32] + punpcklbw m7, m2 + pmaddubsw m2, m7, [r3 + 30 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 144 * 16], m1 + + ; mode 11 [row 1] + pmaddubsw m1, m0, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 28 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 145 * 16], m1 + + ; mode 11 [row 2] + pmaddubsw m1, m0, [r3 + 26 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 26 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 146 * 16], m1 + + ; mode 11 [row 3] + pmaddubsw m1, m0, [r3 + 24 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 24 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 147 * 16], m1 + + ; mode 11 [row 4] + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 22 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 148 * 16], m1 + + ; mode 11 [row 5] + pmaddubsw m1, m0, [r3 + 20 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 20 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 149 * 16], m1 + + ; mode 11 [row 6] + pmaddubsw m1, m0, [r3 + 18 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 18 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 150 * 16], m1 + + ; mode 11 [row 7] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 16 * 16] + pmulhrsw m2, m3 + packuswb 
m1, m2 + movu [r0 + 151 * 16], m1 + + ; mode 11 [row 8] + pmaddubsw m1, m0, [r3 + 14 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 14 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 152 * 16], m1 + + ; mode 11 [row 9] + pmaddubsw m1, m0, [r3 + 12 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 12 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 153 * 16], m1 + + ; mode 11 [row 10] + pmaddubsw m1, m0, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 10 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 154 * 16], m1 + + ; mode 11 [row 11] + pmaddubsw m1, m0, [r3 + 8 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 8 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 155 * 16], m1 + + ; mode 11 [row 12] + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 6 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 156 * 16], m1 + + ; mode 11 [row 13] + pmaddubsw m1, m0, [r3 + 4 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 4 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 157 * 16], m1 + + ; mode 11 [row 14] + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 2 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 158 * 16], m1 + + ; mode 12 [row 0] + movu m0, [r2 + 32] + pinsrb m0, [r2], 0 + movu m1, [r2 + 1 + 32] + punpcklbw m0, m1 + pmaddubsw m1, m0, [r3 + 27 * 16] + pmulhrsw m1, m3 + movu m7, [r2 + 8 + 32] + movd m2, [r2 + 24 + 32] + palignr m2, m7, 1 + punpcklbw m7, m2 + pmaddubsw m2, m7, [r3 + 27 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 160 * 16], m1 + + ; mode 12 [row 1] + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 22 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 161 * 16], m1 + + ; mode 12 [row 2] + pmaddubsw m1, m0, [r3 + 17 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 17 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 162 * 16], m1 + + ; mode 12 [row 3] + pmaddubsw m1, m0, [r3 + 12 * 16] + 
pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 12 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 163 * 16], m1 + + ; mode 12 [row 4] + pmaddubsw m1, m0, [r3 + 7 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 7 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 164 * 16], m1 + + ; mode 12 [row 5] + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 2 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 165 * 16], m1 + + ; mode 13 [row 0] + pmaddubsw m1, m0, [r3 + 23 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 23 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 176 * 16], m1 + + ; mode 13 [row 1] + pmaddubsw m1, m0, [r3 + 14 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 14 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 177 * 16], m1 + + ; mode 13 [row 2] + pmaddubsw m1, m0, [r3 + 5 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 5 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 178 * 16], m1 + + ; mode 14 [row 0] + pmaddubsw m1, m0, [r3 + 19 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 19 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 192 * 16], m1 + + ; mode 14 [row 1] + pmaddubsw m1, m0, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 6 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 193 * 16], m1 + + ; mode 17 [row 0] + movu [r0 + 240 * 16], m1 + + ; mode 15 [row 0] + pmaddubsw m1, m0, [r3 + 15 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 15 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 208 * 16], m1 + + ; mode 15 [row 15 - second half] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 223 * 16 + 8], m1 + ; mode 15 [row 15 - second half] end + + ; mode 16 [row 0] + pmaddubsw m1, m0, [r3 + 11 * 16] + pmulhrsw m1, m3 + pmaddubsw m2, m7, [r3 + 11 * 16] + pmulhrsw m2, m3 + packuswb m1, m2 + movu [r0 + 224 * 16], m1 + + ; mode 17 [row 9 - second half] + pmaddubsw m1, m0, [r3 + 28 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + 
movh [r0 + 249 * 16 + 8], m1 + ; mode 17 [row 9 - second half] end + + ; mode 17 [row 10 - second half] + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 250 * 16 + 8], m1 + ; mode 17 [row 10 - second half] end + + ; mode 17 [row 1 - first half] + pslldq m6, m0, 2 + pinsrb m6, [r2], 1 + pinsrb m6, [r2 + 1], 0 + pmaddubsw m1, m6, [r3 + 12 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 241 * 16], m1 + + ; mode 17 [row 11 - second half] + pmaddubsw m1, m6, [r3 + 8 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 251 * 16 + 8], m1 + ; mode 17 [row 11 - second half] end + + ; mode 17 [row 2 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 1], 1 + pinsrb m6, [r2 + 2], 0 + pmaddubsw m1, m6, [r3 + 18 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 242 * 16], m1 + + ; mode 17 [row 12 - second half] + pmaddubsw m1, m6, [r3 + 14 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 252 * 16 + 8], m1 + ; mode 17 [row 12 - second half] end + + ; mode 17 [row 3 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 2], 1 + pinsrb m6, [r2 + 4], 0 + pmaddubsw m1, m6, [r3 + 24 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 243 * 16], m1 + + ; mode 17 [row 13 - first half] + pmaddubsw m1, m6, [r3 + 20 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 253 * 16 + 8], m1 + + ; mode 17 [row 4 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 4], 1 + pinsrb m6, [r2 + 5], 0 + pmaddubsw m1, m6, [r3 + 30 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 244 * 16], m1 + + ; mode 17 [row 5 - first half] + pmaddubsw m1, m6, [r3 + 4 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 245 * 16], m1 + + ; mode 17 [row 14 - second half] + pmaddubsw m1, m6, [r3 + 26 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 254 * 16 + 8], m1 + ; mode 17 [row 14 - second half] end + + ; mode 17 [row 6 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 5], 1 + pinsrb m6, [r2 + 6], 0 + pmaddubsw m1, m6, [r3 + 10 * 16] + pmulhrsw m1, m3 + 
packuswb m1, m1 + movh [r0 + 246 * 16], m1 + + ; mode 17 [row 7 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 6], 1 + pinsrb m6, [r2 + 7], 0 + pmaddubsw m1, m6, [r3 + 16 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 247 * 16], m1 + + ; mode 17 [row 8 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 7], 1 + pinsrb m6, [r2 + 9], 0 + pmaddubsw m1, m6, [r3 + 22 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 248 * 16], m1 + + ; mode 17 [row 9 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 9], 1 + pinsrb m6, [r2 + 10], 0 + pmaddubsw m1, m6, [r3 + 28 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 249 * 16], m1 + + ; mode 17 [row 10 - first half] + pmaddubsw m1, m6, [r3 + 2 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 250 * 16], m1 + + ; mode 17 [row 11 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 10], 1 + pinsrb m6, [r2 + 11], 0 + pmaddubsw m1, m6, [r3 + 8 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 251 * 16], m1 + + ; mode 17 [row 12 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 11], 1 + pinsrb m6, [r2 + 12], 0 + pmaddubsw m1, m6, [r3 + 14 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 252 * 16], m1 + + ; mode 17 [row 13 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 12], 1 + pinsrb m6, [r2 + 14], 0 + pmaddubsw m1, m6, [r3 + 20 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 253 * 16], m1 + + ; mode 17 [row 14 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 14], 1 + pinsrb m6, [r2 + 15], 0 + pmaddubsw m1, m6, [r3 + 26 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 254 * 16], m1 + + ; mode 16 [row 12 - second half] + pmaddubsw m1, m0, [r3 + 15 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 236 * 16 + 8], m1 + ; mode 16 [row 12 - second half] + + ; mode 12 [row 6] + pslldq m2, m0, 2 + pinsrb m2, [r2], 1 + pinsrb m2, [r2 + 6], 0 + pmaddubsw m1, m2, [r3 + 29 * 16] + pmulhrsw m1, m3 + movu m0, [r2 + 7 + 32] + psrldq m4, m0, 1 + punpcklbw m0, m4 + pmaddubsw m4, m0, [r3 + 29 * 16] + pmulhrsw 
m4, m3 + packuswb m1, m4 + movu [r0 + 166 * 16], m1 + + ; mode 12 [row 7] + pmaddubsw m1, m2, [r3 + 24 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 167 * 16], m1 + + ; mode 12 [row 8] + pmaddubsw m1, m2, [r3 + 19 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 168 * 16], m1 + + ; mode 12 [row 9] + pmaddubsw m1, m2, [r3 + 14 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 169 * 16], m1 + + ; mode 12 [row 10] + pmaddubsw m1, m2, [r3 + 9 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 170 * 16], m1 + + ; mode 12 [row 11] + pmaddubsw m1, m2, [r3 + 4 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 171 * 16], m1 + + ; mode 13 [row 3] + pinsrb m7, m2, [r2 + 4], 0 + pmaddubsw m1, m7, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 179 * 16], m1 + + ; mode 13 [row 4] + pmaddubsw m1, m7, [r3 + 19 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 180 * 16], m1 + + ; mode 13 [row 5] + pmaddubsw m1, m7, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 181 * 16], m1 + + ; mode 13 [row 6] + pmaddubsw m1, m7, [r3 + 1 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 182 * 16], m1 + + ; mode 14 [row 2] + pinsrb m5, m7, [r2 + 2], 0 + pmaddubsw m1, m5, [r3 + 25 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 194 * 16], m1 + + ; mode 14 [row 3] + pmaddubsw m1, m5, [r3 + 12 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 195 
* 16], m1 + + ; mode 15 [row 1] + pmaddubsw m1, m5, [r3 + 30 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 209 * 16], m1 + + ; mode 15 [row 2] + pmaddubsw m1, m5, [r3 + 13 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 13 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 210 * 16], m1 + + ; mode 16 [row 1] + pmaddubsw m1, m5, [r3 + 22 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 22 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 225 * 16], m1 + + ; mode 16 [row 2] + pmaddubsw m1, m5, [r3 + 1 * 16] + pmulhrsw m1, m3 + pmaddubsw m4, m0, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m1, m4 + movu [r0 + 226 * 16], m1 + + ; mode 16 [row 13 - second half] + pmaddubsw m1, m5, [r3 + 26 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 237 * 16 + 8], m1 + ; mode 16 [row 13 - second half] + + ; mode 16 [row 14 - second half] + pmaddubsw m1, m5, [r3 + 5 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 238 * 16 + 8], m1 + ; mode 16 [row 14 - second half] + + ; mode 16 [row 3] + pslldq m6, m5, 2 + pinsrb m6, [r2 + 2], 1 + pinsrb m6, [r2 + 3], 0 + pmaddubsw m1, m6, [r3 + 12 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 227 * 16], m1 + + ; mode 16 [row 15 - second half] + pmaddubsw m1, m6, [r3 + 16 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 239 * 16 + 8], m1 + ; mode 16 [row 15 - second half] end + + ; mode 16 [row 4- first half] + pslldq m6, 2 + pinsrb m6, [r2 + 3], 1 + pinsrb m6, [r2 + 5], 0 + pmaddubsw m1, m6, [r3 + 23 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 228 * 16], m1 + + ; mode 16 [row 5- first half] + pmaddubsw m1, m6, [r3 + 2 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 229 * 16], m1 + + ; mode 16 [row 6- first half] + pslldq m6, 2 + pinsrb m6, [r2 + 5], 1 + pinsrb m6, [r2 + 6], 0 + pmaddubsw m1, m6, [r3 + 13 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 230 * 16], m1 + + ; mode 16 [row 7- first half] + pslldq m6, 2 + pinsrb 
m6, [r2 + 6], 1 + pinsrb m6, [r2 + 8], 0 + pmaddubsw m1, m6, [r3 + 24 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 231 * 16], m1 + + ; mode 16 [row 8- first half] + pmaddubsw m1, m6, [r3 + 3 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 232 * 16], m1 + ; mode 19 [row 0 - second half] end + + ; mode 16 [row 9- first half] + pslldq m6, 2 + pinsrb m6, [r2 + 8], 1 + pinsrb m6, [r2 + 9], 0 + pmaddubsw m1, m6, [r3 + 14 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 233 * 16], m1 + + ; mode 16 [row 10 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 9], 1 + pinsrb m6, [r2 + 11], 0 + pmaddubsw m1, m6, [r3 + 25 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 234 * 16], m1 + + ; mode 16 [row 11 - first half] + pmaddubsw m1, m6, [r3 + 4 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 235 * 16], m1 + + ; mode 16 [row 12 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 11], 1 + pinsrb m6, [r2 + 12], 0 + pmaddubsw m1, m6, [r3 + 15 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 236 * 16], m1 + + ; mode 16 [row 13 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 12], 1 + pinsrb m6, [r2 + 14], 0 + pmaddubsw m1, m6, [r3 + 26 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 237 * 16], m1 + + ; mode 16 [row 14 - first half] + pmaddubsw m1, m6, [r3 + 5 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 238 * 16], m1 + + ; mode 16 [row 15 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 14], 1 + pinsrb m6, [r2 + 15], 0 + pmaddubsw m1, m6, [r3 + 16 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 239 * 16], m1 + + ; mode 14 [row 4] + pslldq m5, 2 + pinsrb m5, [r2 + 2], 1 + pinsrb m5, [r2 + 5], 0 + movu m4, [r2 + 6 + 32] + psrldq m0, m4, 1 + punpcklbw m4, m0 + + ; mode 16 [row 3 - second half] + pmaddubsw m1, m4, [r3 + 12 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 227 * 16 + 8], m1 + + ; mode 16 [row 3 - second half] end + pmaddubsw m1, m5, [r3 + 31 * 16] + pmulhrsw m1, m3 + pmaddubsw m0, m4, [r3 + 31 * 16] + 
pmulhrsw m0, m3 + packuswb m1, m0 + movu [r0 + 196 * 16], m1 + + ; mode 14 [row 5] + pmaddubsw m1, m5, [r3 + 18 * 16] + pmulhrsw m1, m3 + pmaddubsw m0, m4, [r3 + 18 * 16] + pmulhrsw m0, m3 + packuswb m1, m0 + movu [r0 + 197 * 16], m1 + + ; mode 14 [row 6] + pmaddubsw m1, m5, [r3 + 5 * 16] + pmulhrsw m1, m3 + pmaddubsw m0, m4, [r3 + 5 * 16] + pmulhrsw m0, m3 + packuswb m1, m0 + movu [r0 + 198 * 16], m1 + + ; mode 15 [row 3] + movu m6, m5 + pinsrb m6, [r2 + 4], 0 + pmaddubsw m1, m6, [r3 + 28 * 16] + pmulhrsw m1, m3 + pmaddubsw m0, m4, [r3 + 28 * 16] + pmulhrsw m0, m3 + packuswb m1, m0 + movu [r0 + 211 * 16], m1 + + ; mode 15 [row 4] + pmaddubsw m1, m6, [r3 + 11 * 16] + pmulhrsw m1, m3 + pmaddubsw m0, m4, [r3 + 11 * 16] + pmulhrsw m0, m3 + packuswb m1, m0 + movu [r0 + 212 * 16], m1 + + ; mode 15 [row 5 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 4], 1 + pinsrb m6, [r2 + 6], 0 + pmaddubsw m1, m6, [r3 + 26 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 213 * 16], m1 + + ; mode 15 [row 6 - first half] + pmaddubsw m1, m6, [r3 + 9 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 214 * 16], m1 + + ; mode 15 [row 7 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 6], 1 + pinsrb m6, [r2 + 8], 0 + pmaddubsw m1, m6, [r3 + 24 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 215 * 16], m1 + + ; mode 15 [row 8 - first half] + pmaddubsw m1, m6, [r3 + 7 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 216 * 16], m1 + + ; mode 15 [row 9 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 8], 1 + pinsrb m6, [r2 + 9], 0 + pmaddubsw m1, m6, [r3 + 22 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 217 * 16], m1 + + ; mode 15 [row 10 - first half] + pmaddubsw m1, m6, [r3 + 5 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 218 * 16], m1 + + ; mode 15 [row 11 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 9], 1 + pinsrb m6, [r2 + 11], 0 + pmaddubsw m1, m6, [r3 + 20 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 219 * 16], m1 + + ; mode 15 [row 
12 - first half] + pmaddubsw m1, m6, [r3 + 3 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 220 * 16], m1 + + ; mode 15 [row 13 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 11], 1 + pinsrb m6, [r2 + 13], 0 + pmaddubsw m1, m6, [r3 + 18 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 221 * 16], m1 + + ; mode 15 [row 14 - first half] + pmaddubsw m1, m6, [r3 + 1 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 222 * 16], m1 + + ; mode 15 [row 15 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 13], 1 + pinsrb m6, [r2 + 15], 0 + pmaddubsw m1, m6, [r3 + 16 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 223 * 16], m1 + + ; mode 14 [row 7] + pslldq m5, 2 + pinsrb m5, [r2 + 5], 1 + pinsrb m5, [r2 + 7], 0 + movu m0, [r2 + 5 + 32] + psrldq m6, m0, 1 + punpcklbw m0, m6 + + ; mode 15 [row 5 - second half] + pmaddubsw m1, m0, [r3 + 26 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 213 * 16 + 8], m1 + ; mode 15 [row 5 - second half] end + + ; mode 15 [row 6 - second half] + pmaddubsw m1, m0, [r3 + 9 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 214 * 16 + 8], m1 + ; mode 15 [row 6 - second half] end + + ; mode 16 [row 4 - second half] + pmaddubsw m1, m0, [r3 + 23 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 228 * 16 + 8], m1 + ; mode 16 [row 4 - second half] end + + ; mode 16 [row 5 - second half] + pmaddubsw m1, m0, [r3 + 2 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 229 * 16 + 8], m1 + + ; mode 16 [row 5 - second half] end + pmaddubsw m1, m5, [r3 + 24 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 24 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 199 * 16], m1 + + ; mode 14 [row 8] + pmaddubsw m1, m5, [r3 + 11 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 11 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 200 * 16], m1 + + ; mode 14 [row 9] + pslldq m5, 2 + pinsrb m5, [r2 + 7], 1 + pinsrb m5, [r2 + 10], 0 + movu m0, [r2 + 4 + 32] + psrldq m6, m0, 1 + punpcklbw m0, m6 + + ; mode 15 [row 7 
- second half] + pmaddubsw m1, m0, [r3 + 24 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 215 * 16 + 8], m1 + ; mode 15 [row 7 - second half] end + + ; mode 15 [row 8 - second half] + pmaddubsw m1, m0, [r3 + 7 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 216 * 16 + 8], m1 + ; mode 15 [row 8 - second half] end + + ; mode 16 [row 6 - second half] + pmaddubsw m1, m0, [r3 + 13 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 230 * 16 + 8], m1 + ; mode 16 [row 6 - second half] end + + ; mode 15 [row 6 - second half] end + pmaddubsw m1, m5, [r3 + 30 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 30 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 201 * 16], m1 + + ; mode 14 [row 10] + pmaddubsw m1, m5, [r3 + 17 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 17 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 202 * 16], m1 + + ; mode 14 [row 11] + pmaddubsw m1, m5, [r3 + 4 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 4 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 203 * 16], m1 + + ; mode 14 [row 12] + pslldq m5, 2 + pinsrb m5, [r2 + 10], 1 + pinsrb m5, [r2 + 12], 0 + movu m0, [r2 + 3 + 32] + psrldq m6, m0, 1 + punpcklbw m0, m6 + + ; mode 15 [row 9 - second half] + pmaddubsw m1, m0, [r3 + 22 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 217 * 16 + 8], m1 + ; mode 15 [row 9 - second half] end + + ; mode 15 [row 10 - second half] + pmaddubsw m1, m0, [r3 + 5 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 218 * 16 + 8], m1 + ; mode 15 [row 10 - second half] end + + ; mode 16 [row 7 - second half] + pmaddubsw m1, m0, [r3 + 24 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 231 * 16 + 8], m1 + ; mode 16 [row 7 - second half] end + + ; mode 16 [row 8 - second half] + pmaddubsw m1, m0, [r3 + 3 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 232 * 16 + 8], m1 + ; mode 16 [row 8 - second half] end + + pmaddubsw m1, m5, [r3 + 23 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 23 * 16] + 
pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 204 * 16], m1 + + ; mode 14 [row 13] + pmaddubsw m1, m5, [r3 + 10 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 10 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 205 * 16], m1 + + ; mode 14 [row 14] + pslldq m5, 2 + pinsrb m5, [r2 + 12], 1 + pinsrb m5, [r2 + 15], 0 + movu m0, [r2 + 2 + 32] + psrldq m6, m0, 1 + punpcklbw m0, m6 + + ; mode 15 [row 11 - second half] + pmaddubsw m1, m0, [r3 + 20 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 219 * 16 + 8], m1 + ; mode 15 [row 11 - second half] end + + ; mode 15 [row 12 - second half] + pmaddubsw m1, m0, [r3 + 3 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 220 * 16 + 8], m1 + ; mode 15 [row 12 - second half] end + + ; mode 16 [row 9 - second half] + pmaddubsw m1, m0, [r3 + 14 * 16] + pmulhrsw m1, m3 + packuswb m1, m1 + movh [r0 + 233 * 16 + 8], m1 + + ; mode 16 [row 9 - second half] end + pmaddubsw m1, m5, [r3 + 29 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 29 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 206 * 16], m1 + + ; mode 14 [row 15] + pmaddubsw m1, m5, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m6, m0, [r3 + 16 * 16] + pmulhrsw m6, m3 + packuswb m1, m6 + movu [r0 + 207 * 16], m1 + + ; mode 12 [row 12] + pslldq m0, m2, 2 + pinsrb m0, [r2 + 6], 1 + pinsrb m0, [r2 + 13], 0 + pmaddubsw m1, m0, [r3 + 31 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 31 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 172 * 16], m1 + + ; mode 12 [row 13] + pmaddubsw m1, m0, [r3 + 26 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 26 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 173 * 16], m1 + + ; mode 12 [row 14] + pmaddubsw m1, m0, [r3 + 21 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 21 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 174 * 16], m1 + + ; mode 12 [row 15] + pmaddubsw m1, m0, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 
175 * 16], m1 + + ; mode 13 [row 7] + pslldq m7, 2 + pinsrb m7, [r2 + 4], 1 + pinsrb m7, [r2 + 7], 0 + pmaddubsw m1, m7, [r3 + 24 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 24 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 183 * 16], m1 + + ; mode 13 [row 8] + pmaddubsw m1, m7, [r3 + 15 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 15 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 184 * 16], m1 + + ; mode 13 [row 9] + pmaddubsw m1, m7, [r3 + 6 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 185 * 16], m1 + + ; mode 13 [row 10] + pslldq m7, 2 + pinsrb m7, [r2 + 7], 1 + pinsrb m7, [r2 + 11], 0 + pmaddubsw m1, m7, [r3 + 29 * 16] + pmulhrsw m1, m3 + movu m4, [r2 + 5 + 32] + psrldq m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, [r3 + 29 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 186 * 16], m1 + + ; mode 13 [row 11] + pmaddubsw m1, m7, [r3 + 20 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 20 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 187 * 16], m1 + + ; mode 13 [row 12] + pmaddubsw m1, m7, [r3 + 11 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 11 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 188 * 16], m1 + + ; mode 13 [row 13] + pmaddubsw m1, m7, [r3 + 2 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 2 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 189 * 16], m1 + + ; mode 13 [row 14] + pslldq m7, 2 + pinsrb m7, [r2 + 11], 1 + pinsrb m7, [r2 + 14], 0 + pmaddubsw m1, m7, [r3 + 25 * 16] + pmulhrsw m1, m3 + movu m4, [r2 + 4 + 32] + psrldq m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, [r3 + 25 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 190 * 16], m1 + + ; mode 13 [row 15] + pmaddubsw m1, m7, [r3 + 16 * 16] + pmulhrsw m1, m3 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m1, m5 + movu [r0 + 191 * 16], m1 + + ; mode 17 [row 15] + movu m0, [r2] + pshufb m1, m0, [tab_S1] + movu [r0 + 255 * 16], m1 + movu m2, 
[r2 + 32] + pinsrb m2, [r2], 0 + movd [r0 + 255 * 16 + 12], m2 + + ; mode 18 [row 0] + movu [r0 + 256 * 16], m0 + + ; mode 18 [row 1] + pslldq m4, m0, 1 + pinsrb m4, [r2 + 1 + 32], 0 + movu [r0 + 257 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 2 + 32], 0 + movu [r0 + 258 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 3 + 32], 0 + movu [r0 + 259 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 4 + 32], 0 + movu [r0 + 260 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 5 + 32], 0 + movu [r0 + 261 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 6 + 32], 0 + movu [r0 + 262 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 7 + 32], 0 + movu [r0 + 263 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 8 + 32], 0 + movu [r0 + 264 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 9 + 32], 0 + movu [r0 + 265 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 10 + 32], 0 + movu [r0 + 266 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 11 + 32], 0 + movu [r0 + 267 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 12 + 32], 0 + movu [r0 + 268 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 13 + 32], 0 + movu [r0 + 269 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 14 + 32], 0 + movu [r0 + 270 * 16], m4 + pslldq m4, 1 + pinsrb m4, [r2 + 15 + 32], 0 + movu [r0 + 271 * 16], m4 + + ; mode 19 [row 0] + psrldq m2, m0, 1 + punpcklbw m0, m2 + movu m5, [r2 + 8] + psrldq m6, m5, 1 + punpcklbw m5, m6 + pmaddubsw m4, m0, [r3 + 6 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 6 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 272 * 16], m4 + + ; mode 20 [row 0] + pmaddubsw m4, m0, [r3 + 11 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 11 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 288 * 16], m4 + + ; mode 21 [row 0] + pmaddubsw m4, m0, [r3 + 15 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 15 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 304 * 16], m4 + + ; mode 22 [row 0] + pmaddubsw m4, m0, [r3 + 19 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 19 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 320 
* 16], m4 + + ; mode 22 [row 1] + pmaddubsw m4, m0, [r3 + 6 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 6 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 321 * 16], m4 + + ; mode 23 [row 0] + pmaddubsw m4, m0, [r3 + 23 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 23 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 336 * 16], m4 + + ; mode 23 [row 1] + pmaddubsw m4, m0, [r3 + 14 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 14 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 337 * 16], m4 + + ; mode 23 [row 2] + pmaddubsw m4, m0, [r3 + 5 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 5 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 338 * 16], m4 + + ; mode 24 [row 0] + pmaddubsw m4, m0, [r3 + 27 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 27 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 352 * 16], m4 + + ; mode 24 [row 1] + pmaddubsw m4, m0, [r3 + 22 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 22 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 353 * 16], m4 + + ; mode 24 [row 2] + pmaddubsw m4, m0, [r3 + 17 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 17 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 354 * 16], m4 + + ; mode 24 [row 3] + pmaddubsw m4, m0, [r3 + 12 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 12 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 355 * 16], m4 + + ; mode 24 [row 4] + pmaddubsw m4, m0, [r3 + 7 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 7 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 356 * 16], m4 + + ; mode 24 [row 5] + pmaddubsw m4, m0, [r3 + 2 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 2 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 357 * 16], m4 + + ; mode 24 [row 6 - first half] + pslldq m7, m0, 2 + pinsrb m7, [r2 + 0], 1 + pinsrb m7, [r2 + 6 + 32], 0 + pmaddubsw m4, m7, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 358 * 16], m4 + + ; mode 24 [row 7 - first half] + pmaddubsw m4, m7, [r3 + 
24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 359 * 16], m4 + + ; mode 24 [row 8 - first half] + pmaddubsw m4, m7, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 360 * 16], m4 + + ; mode 24 [row 9 - first half] + pmaddubsw m4, m7, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 361 * 16], m4 + + ; mode 24 [row 10 - first half] + pmaddubsw m4, m7, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 362 * 16], m4 + + ; mode 24 [row 11 - first half] + pmaddubsw m4, m7, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 363 * 16], m4 + + ; mode 24 [row 12 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 6 + 32], 1 + pinsrb m7, [r2 + 13 + 32], 0 + pmaddubsw m4, m7, [r3 + 31 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 364 * 16], m4 + + ; mode 24 [row 13 - first half] + pmaddubsw m4, m7, [r3 + 26 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 365 * 16], m4 + + ; mode 24 [row 14 - first half] + pmaddubsw m4, m7, [r3 + 21 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 366 * 16], m4 + + ; mode 24 [row 15 - first half] + pmaddubsw m4, m7, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 367 * 16], m4 + + ; mode 23 [row 3 - first half] + pslldq m7, m0, 2 + pinsrb m7, [r2 + 0], 1 + pinsrb m7, [r2 + 4 + 32], 0 + pmaddubsw m4, m7, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 339 * 16], m4 + + ; mode 23 [row 4 - first half] + pmaddubsw m4, m7, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 340 * 16], m4 + + ; mode 23 [row 5 - first half] + pmaddubsw m4, m7, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 341 * 16], m4 + + ; mode 23 [row 6 - first half] + pmaddubsw m4, m7, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 342 * 16], m4 + + ; mode 23 [row 7 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 4 + 32], 1 + pinsrb m7, [r2 + 7 + 32], 0 + pmaddubsw m4, m7, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, 
m4 + movh [r0 + 343 * 16], m4 + + ; mode 23 [row 8 - first half] + pmaddubsw m4, m7, [r3 + 15 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 344 * 16], m4 + + ; mode 23 [row 9 - first half] + pmaddubsw m4, m7, [r3 + 6 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 345 * 16], m4 + + ; mode 23 [row 10 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 7 + 32], 1 + pinsrb m7, [r2 + 11 + 32], 0 + pmaddubsw m4, m7, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 346 * 16], m4 + + ; mode 23 [row 11 - first half] + pmaddubsw m4, m7, [r3 + 20 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 347 * 16], m4 + + ; mode 23 [row 12 - first half] + pmaddubsw m4, m7, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 348 * 16], m4 + + ; mode 23 [row 13 - first half] + pmaddubsw m4, m7, [r3 + 2 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 349 * 16], m4 + + ; mode 23 [row 14 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 11 + 32], 1 + pinsrb m7, [r2 + 14 + 32], 0 + pmaddubsw m4, m7, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 350 * 16], m4 + + ; mode 23 [row 15 - first half] + pmaddubsw m4, m7, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 351 * 16], m4 + + ; mode 21 [row 15 - first half] + pmaddubsw m4, m0, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 319 * 16 + 8], m4 + ; mode 21 [row 15 - second half] end + + ; mode 20 [row 1 - first half] + pslldq m7, m0, 2 + pinsrb m7, [r2 + 0], 1 + pinsrb m7, [r2 + 2 + 32], 0 + pmaddubsw m4, m7, [r3 + 22 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 289 * 16], m4 + + ; mode 20 [row 2 - first half] + pmaddubsw m4, m7, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 290 * 16], m4 + + ; mode 21 [row 1 - first half] + pmaddubsw m4, m7, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 305 * 16], m4 + + ; mode 21 [row 2 - first half] + pmaddubsw m4, m7, [r3 + 13 * 16] + pmulhrsw m4, m3 + packuswb 
m4, m4 + movh [r0 + 306 * 16], m4 + + ; mode 22 [row 2 - first half] + pmaddubsw m4, m7, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 322 * 16], m4 + + ; mode 22 [row 3 - first half] + pmaddubsw m4, m7, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 323 * 16], m4 + + ; mode 22 [row 4 - first half] + pslldq m1, m7, 2 + pinsrb m1, [r2 + 2 + 32], 1 + pinsrb m1, [r2 + 5 + 32], 0 + pmaddubsw m4, m1, [r3 + 31 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 324 * 16], m4 + + ; mode 22 [row 5 - first half] + pmaddubsw m4, m1, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 325 * 16], m4 + + ; mode 22 [row 6 - first half] + pmaddubsw m4, m1, [r3 + 5 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 326 * 16], m4 + + ; mode 22 [row 7 - first half] + pslldq m1, 2 + pinsrb m1, [r2 + 5 + 32], 1 + pinsrb m1, [r2 + 7 + 32], 0 + pmaddubsw m4, m1, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 327 * 16], m4 + + ; mode 22 [row 8 - first half] + pmaddubsw m4, m1, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 328 * 16], m4 + + ; mode 22 [row 9 - first half] + pslldq m1, 2 + pinsrb m1, [r2 + 7 + 32], 1 + pinsrb m1, [r2 + 10 + 32], 0 + pmaddubsw m4, m1, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 329 * 16], m4 + + ; mode 22 [row 10 - first half] + pmaddubsw m4, m1, [r3 + 17 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 330 * 16], m4 + + ; mode 22 [row 11 - first half] + pmaddubsw m4, m1, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 331 * 16], m4 + + ; mode 22 [row 12 - first half] + pslldq m1, 2 + pinsrb m1, [r2 + 10 + 32], 1 + pinsrb m1, [r2 + 12 + 32], 0 + pmaddubsw m4, m1, [r3 + 23 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 332 * 16], m4 + + ; mode 22 [row 13 - first half] + pmaddubsw m4, m1, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 333 * 16], m4 + + ; mode 22 [row 14 - first half] + pslldq m1, 2 + pinsrb 
m1, [r2 + 12 + 32], 1 + pinsrb m1, [r2 + 15 + 32], 0 + pmaddubsw m4, m1, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 334 * 16], m4 + + ; mode 22 [row 15 - first half] + pmaddubsw m4, m1, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 335 * 16], m4 + + ; mode 21 [row 3 - first half] + pslldq m6, m7, 2 + pinsrb m6, [r2 + 2 + 32], 1 + pinsrb m6, [r2 + 4 + 32], 0 + pmaddubsw m4, m6, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 307 * 16], m4 + + ; mode 21 [row 4 - first half] + pmaddubsw m4, m6, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 308 * 16], m4 + + ; mode 21 [row 5 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 4 + 32], 1 + pinsrb m6, [r2 + 6 + 32], 0 + pmaddubsw m4, m6, [r3 + 26 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 309 * 16], m4 + + ; mode 21 [row 6 - first half] + pmaddubsw m4, m6, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 310 * 16], m4 + + ; mode 21 [row 7 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 6 + 32], 1 + pinsrb m6, [r2 + 8 + 32], 0 + pmaddubsw m4, m6, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 311 * 16], m4 + + ; mode 21 [row 8 - first half] + pmaddubsw m4, m6, [r3 + 7 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 312 * 16], m4 + + ; mode 21 [row 9 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 8 + 32], 1 + pinsrb m6, [r2 + 9 + 32], 0 + pmaddubsw m4, m6, [r3 + 22 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 313 * 16], m4 + + ; mode 21 [row 10 - first half] + pmaddubsw m4, m6, [r3 + 5 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 314 * 16], m4 + + ; mode 21 [row 11 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 9 + 32], 1 + pinsrb m6, [r2 + 11 + 32], 0 + pmaddubsw m4, m6, [r3 + 20 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 315 * 16], m4 + + ; mode 21 [row 12 - first half] + pmaddubsw m4, m6, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 316 * 16], m4 + + ; 
mode 21 [row 13 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 11 + 32], 1 + pinsrb m6, [r2 + 13 + 32], 0 + pmaddubsw m4, m6, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 317 * 16], m4 + + ; mode 21 [row 14 - first half] + pmaddubsw m4, m6, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 318 * 16], m4 + + ; mode 21 [row 15 - first half] + pslldq m6, 2 + pinsrb m6, [r2 + 32 + 13], 1 + pinsrb m6, [r2 + 32 + 15], 0 + pmaddubsw m4, m6, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 319 * 16], m4 + + ; mode 20 [row 13 - second half] + pmaddubsw m4, m7, [r3 + 26 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 301 * 16 + 8], m4 + ; mode 20 [row 13 - second half] + + ; mode 20 [row 14 - second half] + pmaddubsw m4, m7, [r3 + 5 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 302 * 16 + 8], m4 + ; mode 20 [row 14 - second half] + + ; mode 20 [row 3 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 2], 1 + pinsrb m7, [r2 + 32 + 3], 0 + pmaddubsw m4, m7, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 291 * 16], m4 + + ; mode 20 [row 15 - second half] + pmaddubsw m4, m7, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 303 * 16 + 8], m4 + ; mode 20 [row 15 - second half] + + ; mode 20 [row 4 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 3], 1 + pinsrb m7, [r2 + 32 + 5], 0 + pmaddubsw m4, m7, [r3 + 23 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 292 * 16], m4 + + ; mode 20 [row 5 - first half] + pmaddubsw m4, m7, [r3 + 2 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 293 * 16], m4 + + ; mode 20 [row 6 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 5], 1 + pinsrb m7, [r2 + 32 + 6], 0 + pmaddubsw m4, m7, [r3 + 13 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 294 * 16], m4 + + ; mode 20 [row 7 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 6], 1 + pinsrb m7, [r2 + 32 + 8], 0 + pmaddubsw m4, m7, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb 
m4, m4 + movh [r0 + 295 * 16], m4 + + ; mode 20 [row 8 - first half] + pmaddubsw m4, m7, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 296 * 16], m4 + + ; mode 20 [row 9 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 8], 1 + pinsrb m7, [r2 + 32 + 9], 0 + pmaddubsw m4, m7, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 297 * 16], m4 + + ; mode 20 [row 10 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 9], 1 + pinsrb m7, [r2 + 32 + 11], 0 + pmaddubsw m4, m7, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 298 * 16], m4 + + ; mode 20 [row 11 - first half] + pmaddubsw m4, m7, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 299 * 16], m4 + + ; mode 20 [row 12 - first half] + movu m1, [r3 + 15 * 16] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 11], 1 + pinsrb m7, [r2 + 32 + 12], 0 + pmaddubsw m4, m7, [r3 + 15 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 300 * 16], m4 + + ; mode 20 [row 13 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 12], 1 + pinsrb m7, [r2 + 32 + 14], 0 + pmaddubsw m4, m7, [r3 + 26 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 301 * 16], m4 + + ; mode 20 [row 14 - first half] + pmaddubsw m4, m7, [r3 + 5 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 302 * 16], m4 + + ; mode 20 [row 15 - first half] + pslldq m7, 2 + pinsrb m7, [r2 + 32 + 14], 1 + pinsrb m7, [r2 + 32 + 15], 0 + pmaddubsw m4, m7, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 303 * 16], m4 + + ; mode 19 [row 1] + pslldq m0, 2 + pinsrb m0, [r2], 1 + pinsrb m0, [r2 + 32 + 1], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 8], 1 + pinsrb m5, [r2 + 7], 0 + + ; mode 20 [row 1 - second half] + pmaddubsw m4, m5, [r3 + 22 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 289 * 16 + 8], m4 + ; mode 20 [row 1 - second half] end + + ; mode 20 [row 2 - second half] + pmaddubsw m4, m5, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 290 * 16 + 8], m4 + ; mode 20 [row 2 - 
second half] end + + ; mode 21 [row 2 - second half] + pmaddubsw m4, m5, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 305 * 16 + 8], m4 + ; mode 21 [row 2 - second half] end + + ; mode 21 [row 3 - second half] + pmaddubsw m4, m5, [r3 + 13 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 306 * 16 + 8], m4 + ; mode 21 [row 3 - second half] end + + ; mode 21 [row 4 - second half] + pmaddubsw m4, m5, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 307 * 16 + 8], m4 + ; mode 21 [row 4 - second half] end + + ; mode 22 [row 2 - second half] + pmaddubsw m4, m5, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 322 * 16 + 8], m4 + ; mode 22 [row 2 - second half] end + + ; mode 22 [row 3 - second half] + pmaddubsw m4, m5, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 323 * 16 + 8], m4 + ; mode 22 [row 3 - second half] end + + ; mode 23 [row 3 - second half] + pmaddubsw m4, m5, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 339 * 16 + 8], m4 + ; mode 23 [row 3 - second half] end + + ; mode 23 [row 4 - second half] + pmaddubsw m4, m5, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 340 * 16 + 8], m4 + ; mode 23 [row 4 - second half] end + + ; mode 23 [row 5 - second half] + pmaddubsw m4, m5, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 341 * 16 + 8], m4 + ; mode 23 [row 5 - second half] end + + ; mode 23 [row 6 - second half] + pmaddubsw m4, m5, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 342 * 16 + 8], m4 + ; mode 23 [row 6 - second half] end + + ; mode 24 [row 6 - second half] + pmaddubsw m4, m5, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 358 * 16 + 8], m4 + ; mode 24 [row 6 - second half] end + + ; mode 24 [row 7 - second half] + pmaddubsw m4, m5, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 359 * 16 + 8], m4 + ; mode 24 [row 7 - second half] end + + ; mode 24 [row 8 - second half] + pmaddubsw 
m4, m5, [r3 + 19 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 360 * 16 + 8], m4 + ; mode 24 [row 8 - second half] end + + ; mode 24 [row 9 - second half] + pmaddubsw m4, m5, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 361 * 16 + 8], m4 + ; mode 24 [row 9 - second half] end + + ; mode 24 [row 10 - second half] + pmaddubsw m4, m5, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 362 * 16 + 8], m4 + ; mode 24 [row 10 - second half] end + + ; mode 24 [row 11 - second half] + pmaddubsw m4, m5, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 363 * 16 + 8], m4 + ; mode 24 [row 11 - second half] end + + pmaddubsw m4, m0, [r3 + 12 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 12 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 273 * 16], m4 + + ; mode 19 [row 2] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 1], 1 + pinsrb m0, [r2 + 32 + 2], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 7], 1 + pinsrb m5, [r2 + 6], 0 + + ; mode 20 [row 3 - second half] + pmaddubsw m4, m5, [r3 + 12 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 291 * 16 + 8], m4 + ; mode 20 [row 3 - second half] end + + ; mode 21 [row 3 - second half] + pmaddubsw m4, m5, [r3 + 28 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 307 * 16 + 8], m4 + ; mode 21 [row 3 - second half] end + + ; mode 21 [row 4 - second half] + pmaddubsw m4, m5, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 308 * 16 + 8], m4 + ; mode 21 [row 4 - second half] end + + ; mode 22 [row 4 - second half] + pmaddubsw m4, m5, [r3 + 31 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 324 * 16 + 8], m4 + ; mode 22 [row 4 - second half] end + + ; mode 22 [row 5 - second half] + pmaddubsw m4, m5, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 325 * 16 + 8], m4 + ; mode 22 [row 5 - second half] end + + ; mode 22 [row 6 - second half] + pmaddubsw m4, m5, [r3 + 5 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 326 * 16 + 8], m4 + ; mode 
22 [row 6 - second half] end + + ; mode 23 [row 7 - second half] + pmaddubsw m4, m5, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 343 * 16 + 8], m4 + ; mode 23 [row 7 - second half] end + + ; mode 23 [row 8 - second half] + pmaddubsw m4, m5, [r3 + 15 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 344 * 16 + 8], m4 + ; mode 23 [row 8 - second half] end + + ; mode 23 [row 9 - second half] + pmaddubsw m4, m5, [r3 + 6 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 345 * 16 + 8], m4 + ; mode 23 [row 9 - second half] end + + ; mode 24 [row 12 - second half] + pmaddubsw m4, m5, [r3 + 31 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 364 * 16 + 8], m4 + ; mode 24 [row 12 - second half] end + + ; mode 24 [row 13 - second half] + pmaddubsw m4, m5, [r3 + 26 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 365 * 16 + 8], m4 + ; mode 24 [row 13 - second half] end + + ; mode 24 [row 14 - second half] + pmaddubsw m4, m5, [r3 + 21 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 366 * 16 + 8], m4 + ; mode 24 [row 14 - second half] end + + ; mode 24 [row 15 - second half] + pmaddubsw m4, m5, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 367 * 16 + 8], m4 + ; mode 24 [row 15 - second half] end + + pmaddubsw m4, m0, [r3 + 18 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 18 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 274 * 16], m4 + + ; mode 19 [row 3] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 2], 1 + pinsrb m0, [r2 + 32 + 4], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 6], 1 + pinsrb m5, [r2 + 5], 0 + + ; mode 20 [row 4 - second half] + pmaddubsw m4, m5, [r3 + 23 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 292 * 16 + 8], m4 + ; mode 20 [row 4 - second half] end + + ; mode 20 [row 5 - second half] + pmaddubsw m4, m5, [r3 + 2 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 293 * 16 + 8], m4 + ; mode 20 [row 5 - second half] end + + ; mode 21 [row 5 - second half] + pmaddubsw m4, m5, [r3 + 26 
* 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 309 * 16 + 8], m4 + ; mode 21 [row 5 - second half] end + + ; mode 21 [row 6 - second half] + pmaddubsw m4, m5, [r3 + 9 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 310 * 16 + 8], m4 + ; mode 21 [row 6 - second half] end + + ; mode 22 [row 7 - second half] + pmaddubsw m4, m5, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 327 * 16 + 8], m4 + ; mode 22 [row 7 - second half] end + + ; mode 22 [row 8 - second half] + pmaddubsw m4, m5, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 328 * 16 + 8], m4 + ; mode 22 [row 7 - second half] end + + ; mode 23 [row 10 - second half] + pmaddubsw m4, m5, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 346 * 16 + 8], m4 + ; mode 23 [row 10 - second half] end + + ; mode 23 [row 11 - second half] + pmaddubsw m4, m5, [r3 + 20 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 347 * 16 + 8], m4 + ; mode 23 [row 11 - second half] end + + ; mode 23 [row 12 - second half] + pmaddubsw m4, m5, [r3 + 11 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 348 * 16 + 8], m4 + ; mode 23 [row 12 - second half] end + + ; mode 23 [row 13 - second half] + pmaddubsw m4, m5, [r3 + 2 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 349 * 16 + 8], m4 + ; mode 23 [row 13 - second half] end + + pmaddubsw m4, m0, [r3 + 24 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 24 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 275 * 16], m4 + + ; mode 19 [row 4] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 4], 1 + pinsrb m0, [r2 + 32 + 5], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 5], 1 + pinsrb m5, [r2 + 4], 0 + + ; mode 20 [row 6 - second half] + pmaddubsw m4, m5, [r3 + 13 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 294 * 16 + 8], m4 + ; mode 20 [row 6 - second half] end + + ; mode 21 [row 7 - second half] + pmaddubsw m4, m5, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 311 * 16 + 8], m4 + ; mode 21 [row 7 - 
second half] end + + ; mode 21 [row 8 - second half] + pmaddubsw m4, m5, [r3 + 7 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 312 * 16 + 8], m4 + ; mode 21 [row 8 - second half] end + + ; mode 22 [row 9 - second half] + pmaddubsw m4, m5, [r3 + 30 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 329 * 16 + 8], m4 + ; mode 22 [row 9 - second half] end + + ; mode 22 [row 10 - second half] + pmaddubsw m4, m5, [r3 + 17 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 330 * 16 + 8], m4 + ; mode 22 [row 10 - second half] end + + ; mode 22 [row 11 - second half] + pmaddubsw m4, m5, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 331 * 16 + 8], m4 + ; mode 22 [row 11 - second half] end + + ; mode 23 [row 14 - second half] + pmaddubsw m4, m5, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 350 * 16 + 8], m4 + ; mode 23 [row 14 - second half] end + + ; mode 23 [row 15 - second half] + pmaddubsw m4, m5, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 351 * 16 + 8], m4 + + ; mode 23 [row 15 - second half] end + pmaddubsw m4, m0, [r3 + 30 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 30 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 276 * 16], m4 + + ; mode 19 [row 5] + pmaddubsw m4, m0, [r3 + 4 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 4 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 277 * 16], m4 + + ; mode 19 [row 6] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 5], 1 + pinsrb m0, [r2 + 32 + 6], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 4], 1 + pinsrb m5, [r2 + 3], 0 + + ; mode 20 [row 7 - second half] + pmaddubsw m4, m5, [r3 + 24 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 295 * 16 + 8], m4 + ; mode 20 [row 7 - second half] end + + ; mode 20 [row 8 - second half] + pmaddubsw m4, m5, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 296 * 16 + 8], m4 + ; mode 20 [row 8 - second half] end + + ; mode 21 [row 9 - second half] + pmaddubsw m4, m5, [r3 + 22 * 16] + pmulhrsw 
m4, m3 + packuswb m4, m4 + movh [r0 + 313 * 16 + 8], m4 + ; mode 21 [row 9 - second half] end + + ; mode 21 [row 10 - second half] + pmaddubsw m4, m5, [r3 + 5 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 314 * 16 + 8], m4 + ; mode 21 [row 10 - second half] end + + ; mode 22 [row 12 - second half] + pmaddubsw m4, m5, [r3 + 23 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 332 * 16 + 8], m4 + ; mode 22 [row 12 - second half] end + + ; mode 22 [row 12 - second half] + pmaddubsw m4, m5, [r3 + 10 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 333 * 16 + 8], m4 + ; mode 22 [row 12 - second half] end + + pmaddubsw m4, m0, [r3 + 10 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 10 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 278 * 16], m4 + + ; mode 19 [row 7] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 6], 1 + pinsrb m0, [r2 + 32 + 7], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 3], 1 + pinsrb m5, [r2 + 2], 0 + + ; mode 20 [row 9 - second half] + pmaddubsw m4, m5, [r3 + 14 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 297 * 16 + 8], m4 + ; mode 20 [row 9 - second half] + + ; mode 21 [row 11 - second half] + pmaddubsw m4, m5, [r3 + 20 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 315 * 16 + 8], m4 + ; mode 21 [row 11 - second half] end + + ; mode 21 [row 12 - second half] + pmaddubsw m4, m5, [r3 + 3 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 316 * 16 + 8], m4 + ; mode 21 [row 12 - second half] end + + ; mode 22 [row 14 - second half] + pmaddubsw m4, m5, [r3 + 29 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 334 * 16 + 8], m4 + ; mode 22 [row 14 - second half] end + + ; mode 22 [row 15 - second half] + pmaddubsw m4, m5, [r3 + 16 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 335 * 16 + 8], m4 + ; mode 22 [row 15 - second half] end + + pmaddubsw m4, m0, [r3 + 16 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 16 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 279 * 16], m4 + + ; mode 19 [row 8] 
+ pslldq m0, 2 + pinsrb m0, [r2 + 32 + 7], 1 + pinsrb m0, [r2 + 32 + 9], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 2], 1 + pinsrb m5, [r2 + 1], 0 + + ; mode 20 [row 10 - second half] + pmaddubsw m4, m5, [r3 + 25 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 298 * 16 + 8], m4 + ; mode 20 [row 10 - second half] end + + ; mode 20 [row 11 - second half] + pmaddubsw m4, m5, [r3 + 4 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 299 * 16 + 8], m4 + ; mode 20 [row 11 - second half] end + + ; mode 21 [row 13 - second half] + pmaddubsw m4, m5, [r3 + 18 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 317 * 16 + 8], m4 + ; mode 21 [row 13 - second half] end + + ; mode 21 [row 14 - second half] + pmaddubsw m4, m5, [r3 + 1 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 318 * 16 + 8], m4 + ; mode 21 [row 14 - second half] end + + pmaddubsw m4, m0, [r3 + 22 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 22 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 280 * 16], m4 + + ; mode 19 [row 9] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 9], 1 + pinsrb m0, [r2 + 32 + 10], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 1], 1 + pinsrb m5, [r2 + 0], 0 + + ; mode 20 [row 12 - second half] + pmaddubsw m4, m5, [r3 + 15 * 16] + pmulhrsw m4, m3 + packuswb m4, m4 + movh [r0 + 300 * 16 + 8], m4 + + ; mode 20 [row 12 - second half] end + pmaddubsw m4, m0, [r3 + 28 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 28 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 281 * 16], m4 + + ; mode 19 [row 10] + pmaddubsw m4, m0, [r3 + 2 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 2 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 282 * 16], m4 + + ; mode 19 [row 11] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 10], 1 + pinsrb m0, [r2 + 32 + 11], 0 + pmaddubsw m4, m0, [r3 + 8 * 16] + pmulhrsw m4, m3 + pslldq m5, 2 + pinsrb m5, [r2], 1 + pinsrb m5, [r2 + 32 + 1], 0 + pmaddubsw m6, m5, [r3 + 8 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 283 * 16], m4 + + ; mode 19 
[row 12] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 11], 1 + pinsrb m0, [r2 + 32 + 12], 0 + pslldq m5, 2 + pinsrb m5, [r2 + 32 + 1], 1 + pinsrb m5, [r2 + 32 + 2], 0 + pmaddubsw m4, m0, [r3 + 14 * 16] + pmulhrsw m4, m3 + pmaddubsw m6, m5, [r3 + 14 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 284 * 16], m4 + + ; mode 19 [row 13] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 12], 1 + pinsrb m0, [r2 + 32 + 14], 0 + pmaddubsw m4, m0, [r3 + 20 * 16] + pmulhrsw m4, m3 + pslldq m5, 2 + pinsrb m5, [r2 + 32 + 2], 1 + pinsrb m5, [r2 + 32 + 4], 0 + pmaddubsw m6, m5, [r3 + 20 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 285 * 16], m4 + + ; mode 19 [row 14] + pslldq m0, 2 + pinsrb m0, [r2 + 32 + 14], 1 + pinsrb m0, [r2 + 32 + 15], 0 + pmaddubsw m4, m0, [r3 + 26 * 16] + pmulhrsw m4, m3 + pslldq m5, 2 + pinsrb m5, [r2 + 32 + 4], 1 + pinsrb m5, [r2 + 32 + 5], 0 + pmaddubsw m6, m5, [r3 + 26 * 16] + pmulhrsw m6, m3 + packuswb m4, m6 + movu [r0 + 286 * 16], m4 + + ; mode 19 [row 15] + movu m0, [r2 + 32] + pshufb m0, [tab_S1] + movu [r0 + 287 * 16], m0 + movd m1, [r2] + movd [r0 + 287 * 16 + 12], m1 + + ; mode 25 + movu m1, [r1] + + ; mode 26 [all rows] + psrldq m6, m1, 1 + pinsrb m6, [r1 + 16], 15 + movu m7, m6 + movu [r0 + 384 * 16], m6 + movu [r0 + 385 * 16], m6 + movu [r0 + 386 * 16], m6 + movu [r0 + 387 * 16], m6 + movu [r0 + 388 * 16], m6 + movu [r0 + 389 * 16], m6 + movu [r0 + 390 * 16], m6 + movu [r0 + 391 * 16], m6 + movu [r0 + 392 * 16], m6 + movu [r0 + 393 * 16], m6 + movu [r0 + 394 * 16], m6 + movu [r0 + 395 * 16], m6 + movu [r0 + 396 * 16], m6 + movu [r0 + 397 * 16], m6 + movu [r0 + 398 * 16], m6 + movu [r0 + 399 * 16], m6 + + pxor m0, m0 + pshufb m6, m6, m0 + punpcklbw m6, m0 + pinsrb m2, [r1], 0 + pshufb m2, m2, m0 + punpcklbw m2, m0 + movu m4, [r1 + 1 + 32] + punpcklbw m5, m4, m0 + punpckhbw m4, m0 + psubw m5, m2 + psubw m4, m2 + psraw m5, 1 + psraw m4, 1 + paddw m5, m6 + paddw m4, m6 + packuswb m5, m4 + + pextrb [r0 + 384 * 16], m5, 0 + pextrb [r0 + 385 * 16], 
m5, 1 + pextrb [r0 + 386 * 16], m5, 2 + pextrb [r0 + 387 * 16], m5, 3 + pextrb [r0 + 388 * 16], m5, 4 + pextrb [r0 + 389 * 16], m5, 5 + pextrb [r0 + 390 * 16], m5, 6 + pextrb [r0 + 391 * 16], m5, 7 + pextrb [r0 + 392 * 16], m5, 8 + pextrb [r0 + 393 * 16], m5, 9 + pextrb [r0 + 394 * 16], m5, 10 + pextrb [r0 + 395 * 16], m5, 11 + pextrb [r0 + 396 * 16], m5, 12 + pextrb [r0 + 397 * 16], m5, 13 + pextrb [r0 + 398 * 16], m5, 14 + pextrb [r0 + 399 * 16], m5, 15 + + ; mode 25 [row 15] + movu [r0 + 383 * 16], m1 + + ; mode 25 [row 0] + psrldq m2, m1, 1 + punpcklbw m1, m2 + movu m2, [r1 + 8] + psrldq m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m4, m1, [r3 + 30 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 30 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 368 * 16], m4 + + ; mode 25 [row 1] + pmaddubsw m4, m1, [r3 + 28 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 28 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 369 * 16], m4 + + ; mode 25 [row 2] + pmaddubsw m4, m1, [r3 + 26 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 26 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 370 * 16], m4 + + ; mode 25 [row 3] + pmaddubsw m4, m1, [r3 + 24 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 24 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 371 * 16], m4 + + ; mode 25 [row 4] + pmaddubsw m4, m1, [r3 + 22 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 22 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 372 * 16], m4 + + ; mode 25 [row 5] + pmaddubsw m4, m1, [r3 + 20 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 20 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 373 * 16], m4 + + ; mode 25 [row 6] + pmaddubsw m4, m1, [r3 + 18 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 18 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 374 * 16], m4 + + ; mode 25 [row 7] + pmaddubsw m4, m1, [r3 + 16 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 375 * 16], m4 + + ; 
mode 25 [row 8] + pmaddubsw m4, m1, [r3 + 14 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 14 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 376 * 16], m4 + + ; mode 25 [row 9] + pmaddubsw m4, m1, [r3 + 12 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 12 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 377 * 16], m4 + + ; mode 25 [row 10] + pmaddubsw m4, m1, [r3 + 10 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 10 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 378 * 16], m4 + + ; mode 25 [row 11] + pmaddubsw m4, m1, [r3 + 8 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 8 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 379 * 16], m4 + + ; mode 25 [row 12] + pmaddubsw m4, m1, [r3 + 6 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 380 * 16], m4 + + ; mode 25 [row 13] + pmaddubsw m4, m1, [r3 + 4 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 4 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 381 * 16], m4 + + ; mode 25 [row 14] + pmaddubsw m4, m1, [r3 + 2 * 16] + pmulhrsw m4, m3 + pmaddubsw m5, m2, [r3 + 2 * 16] + pmulhrsw m5, m3 + packuswb m4, m5 + movu [r0 + 382 * 16], m4 + + ; mode 27 [row 15] + psrldq m6, m7, 1 + punpcklbw m7, m6 + pinsrb m6, [r1 + 17], 15 + movu [r0 + 415 * 16], m6 + + ; mode 27 [row 0] + movu m4, [r1 + 9] + psrldq m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m6, m7, [r3 + 2 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 2 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 400 * 16], m6 + + ; mode 27 [row 1] + pmaddubsw m6, m7, [r3 + 4 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 4 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 401 * 16], m6 + + ; mode 27 [row 2] + pmaddubsw m6, m7, [r3 + 6 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 402 * 16], m6 + + ; mode 27 [row 3] + pmaddubsw m6, m7, [r3 + 8 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 8 * 16] + 
pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 403 * 16], m6 + + ; mode 27 [row 4] + pmaddubsw m6, m7, [r3 + 10 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 10 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 404 * 16], m6 + + ; mode 27 [row 5] + pmaddubsw m6, m7, [r3 + 12 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 12 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 405 * 16], m6 + + ; mode 27 [row 6] + pmaddubsw m6, m7, [r3 + 14 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 14 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 406 * 16], m6 + + ; mode 27 [row 7] + pmaddubsw m6, m7, [r3 + 16 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 407 * 16], m6 + + ; mode 27 [row 8] + pmaddubsw m6, m7, [r3 + 18 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 18 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 408 * 16], m6 + + ; mode 27 [row 9] + pmaddubsw m6, m7, [r3 + 20 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 20 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 409 * 16], m6 + + ; mode 27 [row 10] + pmaddubsw m6, m7, [r3 + 22 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 22 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 410 * 16], m6 + + ; mode 27 [row 11] + pmaddubsw m6, m7, [r3 + 24 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 24 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 411 * 16], m6 + + ; mode 27 [row 12] + pmaddubsw m6, m7, [r3 + 26 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 26 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 412 * 16], m6 + + ; mode 27 [row 13] + pmaddubsw m6, m7, [r3 + 28 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 28 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 413 * 16], m6 + + ; mode 27 [row 14] + pmaddubsw m6, m7, [r3 + 30 * 16] + pmulhrsw m6, m3 + pmaddubsw m5, m4, [r3 + 30 * 16] + pmulhrsw m5, m3 + packuswb m6, m5 + movu [r0 + 414 * 16], m6 + + ; mode 28 [row 0] + movu 
m1, [r2 + 1] + psrldq m2, m1, 1 + punpcklbw m1, m2 + movu m4, [r2 + 9] + psrldq m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m2, m1, [r3 + 5 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 5 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 416 * 16], m2 + + ; mode 28 [row 0] + pmaddubsw m2, m1, [r3 + 5 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 5 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 416 * 16], m2 + + ; mode 28 [row 1] + pmaddubsw m2, m1, [r3 + 10 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 10 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 417 * 16], m2 + + ; mode 28 [row 2] + pmaddubsw m2, m1, [r3 + 15 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 15 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 418 * 16], m2 + + ; mode 28 [row 3] + pmaddubsw m2, m1, [r3 + 20 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 20 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 419 * 16], m2 + + ; mode 28 [row 4] + pmaddubsw m2, m1, [r3 + 25 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 25 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 420 * 16], m2 + + ; mode 28 [row 5] + pmaddubsw m2, m1, [r3 + 30 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 30 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 421 * 16], m2 + + ; mode 29 [row 0] + pmaddubsw m2, m1, [r3 + 9 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 9 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 432 * 16], m2 + + ; mode 29 [row 1] + pmaddubsw m2, m1, [r3 + 18 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 18 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 433 * 16], m2 + + ; mode 29 [row 2] + pmaddubsw m2, m1, [r3 + 27 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 27 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 434 * 16], m2 + + ; mode 30 [row 0] + pmaddubsw m2, m1, [r3 + 13 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 13 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 448 * 16], m2 + + ; mode 30 
[row 1] + pmaddubsw m2, m1, [r3 + 26 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 26 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 449 * 16], m2 + + ; mode 33 [row 0] + movu [r0 + 496 * 16], m2 + + ; mode 31 [row 0] + pmaddubsw m2, m1, [r3 + 17 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 17 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 464 * 16], m2 + + ; mode 32 [row 0] + pmaddubsw m2, m1, [r3 + 21 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 21 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 480 * 16], m2 + + ; mode 28 [row 6] + movd m7, [r2 + 9] + palignr m7, m1, 2 + pmaddubsw m2, m7, [r3 + 3 * 16] + pmulhrsw m2, m3 + movd m6, [r2 + 17] + palignr m6, m4, 2 + pmaddubsw m5, m6, [r3 + 3 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 422 * 16], m2 + + ; mode 28 [row 7] + pmaddubsw m2, m7, [r3 + 8 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 8 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 423 * 16], m2 + + ; mode 28 [row 8] + pmaddubsw m2, m7, [r3 + 13 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 13 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 424 * 16], m2 + + ; mode 28 [row 9] + pmaddubsw m2, m7, [r3 + 18 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 18 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 425 * 16], m2 + + ; mode 28 [row 10] + pmaddubsw m2, m7, [r3 + 23 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 23 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 426 * 16], m2 + + ; mode 29 [row 3] + pmaddubsw m2, m7, [r3 + 4 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 4 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 435 * 16], m2 + + ; mode 29 [row 4] + pmaddubsw m2, m7, [r3 + 13 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 13 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 436 * 16], m2 + + ; mode 29 [row 5] + pmaddubsw m2, m7, [r3 + 22 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 22 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 
437 * 16], m2 + + ; mode 29 [row 6] + pmaddubsw m2, m7, [r3 + 31 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 31 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 438 * 16], m2 + + ; mode 32 [row 2] + movu [r0 + 482 * 16], m2 + + ; mode 30 [row 2] + pmaddubsw m2, m7, [r3 + 7 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 7 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 450 * 16], m2 + + ; mode 30 [row 3] + pmaddubsw m2, m7, [r3 + 20 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 20 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 451 * 16], m2 + + ; mode 33 [row 1] + movu [r0 + 497 * 16], m2 + + ; mode 31 [row 1] + pmaddubsw m2, m7, [r3 + 2 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 2 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 465 * 16], m2 + + ; mode 31 [row 2] + pmaddubsw m2, m7, [r3 + 19 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 19 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 466 * 16], m2 + + ; mode 32 [row 1] + pmaddubsw m2, m7, [r3 + 10 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 10 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 481 * 16], m2 + + ; mode 28 [row 11] + pmaddubsw m2, m7, [r3 + 28 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 28 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 427 * 16], m2 + + ; mode 28 [row 12] + movd m1, [r2 + 10] + palignr m1, m7, 2 + pmaddubsw m2, m1, [r3 + 1 * 16] + pmulhrsw m2, m3 + movd m4, [r2 + 18] + palignr m4, m6, 2 + pmaddubsw m5, m4, [r3 + 1 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 428 * 16], m2 + + ; mode 30 [row 4] + movu [r0 + 452 * 16], m2 + + ; mode 28 [row 13] + pmaddubsw m2, m1, [r3 + 6 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 429 * 16], m2 + + ; mode 28 [row 14] + pmaddubsw m2, m1, [r3 + 11 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 11 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 430 * 16], m2 + + ; mode 28 [row 15] + 
pmaddubsw m2, m1, [r3 + 16 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 431 * 16], m2 + + ; mode 29 [row 7] + pmaddubsw m2, m1, [r3 + 8 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 8 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 439 * 16], m2 + + ; mode 29 [row 8] + pmaddubsw m2, m1, [r3 + 17 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 17 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 440 * 16], m2 + + ; mode 29 [row 9] + pmaddubsw m2, m1, [r3 + 26 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 26 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 441 * 16], m2 + + ; mode 30 [row 5] + pmaddubsw m2, m1, [r3 + 14 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 14 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 453 * 16], m2 + + ; mode 33 [row 2] + movu [r0 + 498 * 16], m2 + + ; mode 30 [row 6] + pmaddubsw m2, m1, [r3 + 27 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 27 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 454 * 16], m2 + + ; mode 31 [row 3] + pmaddubsw m2, m1, [r3 + 4 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 4 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 467 * 16], m2 + + ; mode 31 [row 4] + pmaddubsw m2, m1, [r3 + 21 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 21 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 468 * 16], m2 + + ; mode 32 [row 3] + pmaddubsw m2, m1, [r3 + 20 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 20 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 483 * 16], m2 + + ; mode 29 [row 10] + movd m7, [r2 + 11] + palignr m7, m1, 2 + pmaddubsw m2, m7, [r3 + 3 * 16] + pmulhrsw m2, m3 + movd m6, [r2 + 19] + palignr m6, m4, 2 + pmaddubsw m5, m6, [r3 + 3 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 442 * 16], m2 + + ; mode 29 [row 11] + pmaddubsw m2, m7, [r3 + 12 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 12 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 443 * 
16], m2 + + ; mode 29 [row 12] + pmaddubsw m2, m7, [r3 + 21 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 21 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 444 * 16], m2 + + ; mode 30 [row 8] + movu [r0 + 456 * 16], m2 + + ; mode 29 [row 13] + pmaddubsw m2, m7, [r3 + 30 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 30 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 445 * 16], m2 + + ; mode 32 [row 5] + movu [r0 + 485 * 16], m2 + + ; mode 30 [row 7] + pmaddubsw m2, m7, [r3 + 8 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 8 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 455 * 16], m2 + + ; mode 33 [row 3] + movu [r0 + 499 * 16], m2 + + ; mode 31 [row 5] + pmaddubsw m2, m7, [r3 + 6 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 469 * 16], m2 + + ; mode 31 [row 6] + pmaddubsw m2, m7, [r3 + 23 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 23 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 470 * 16], m2 + + ; mode 32 [row 4] + pmaddubsw m2, m7, [r3 + 9 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 9 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 484 * 16], m2 + + movu m1, m7 + movu m4, m6 + + ; mode 29 [row 14] + movu m1, [r2 + 12] + palignr m1, m7, 2 + pmaddubsw m2, m1, [r3 + 7 * 16] + pmulhrsw m2, m3 + movd m4, [r2 + 20] + palignr m4, m6, 2 + pmaddubsw m5, m4, [r3 + 7 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 446 * 16], m2 + + ; mode 29 [row 15] + pmaddubsw m2, m1, [r3 + 16 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 447 * 16], m2 + + ; mode 30 [row 9] + pmaddubsw m2, m1, [r3 + 2 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 2 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 457 * 16], m2 + + ; mode 33 [row 4] + movu [r0 + 500 * 16], m2 + + ; mode 30 [row 10] + pmaddubsw m2, m1, [r3 + 15 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 15 * 16] + pmulhrsw m5, m3 + 
packuswb m2, m5 + movu [r0 + 458 * 16], m2 + + ; mode 30 [row 11] + pmaddubsw m2, m1, [r3 + 28 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 28 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 459 * 16], m2 + + ; mode 33 [row 5] + movu [r0 + 501 * 16], m2 + + ; mode 31 [row 7] + pmaddubsw m2, m1, [r3 + 8 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 8 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 471 * 16], m2 + + ; mode 31 [row 8] + pmaddubsw m2, m1, [r3 + 25 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 25 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 472 * 16], m2 + + ; mode 32 [row 6] + pmaddubsw m2, m1, [r3 + 19 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 19 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 486 * 16], m2 + + ; mode 30 [row 12] + movd m7, [r2 + 13] + palignr m7, m1, 2 + pmaddubsw m2, m7, [r3 + 9 * 16] + pmulhrsw m2, m3 + movd m6, [r2 + 21] + palignr m6, m4, 2 + pmaddubsw m5, m6, [r3 + 9 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 460 * 16], m2 + + ; mode 30 [row 13] + pmaddubsw m2, m7, [r3 + 22 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 22 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 461 * 16], m2 + + ; mode 33 [row 6] + movu [r0 + 502 * 16], m2 + + ; mode 31 [row 9] + pmaddubsw m2, m7, [r3 + 10 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 10 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 473 * 16], m2 + + ; mode 31 [row 10] + pmaddubsw m2, m7, [r3 + 27 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 27 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 474 * 16], m2 + + ; mode 32 [row 7] + pmaddubsw m2, m7, [r3 + 8 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 8 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 487 * 16], m2 + + ; mode 32 [row 8] + pmaddubsw m2, m7, [r3 + 29 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 29 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 488 * 16], m2 + + + movu m1, m7 + movu m4, m6 + + ; mode 30 
[row 14] + movd m1, [r2 + 14] + palignr m1, m7, 2 + pmaddubsw m2, m1, [r3 + 3 * 16] + pmulhrsw m2, m3 + movd m4, [r2 + 22] + palignr m4, m6, 2 + pmaddubsw m5, m4, [r3 + 3 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 462 * 16], m2 + + ; mode 30 [row 15] + pmaddubsw m2, m1, [r3 + 16 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 463 * 16], m2 + + ; mode 33 [row 7] + movu [r0 + 503 * 16], m2 + + ; mode 31 [row 11] + pmaddubsw m2, m1, [r3 + 12 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 12 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 475 * 16], m2 + + ; mode 31 [row 12] + pmaddubsw m2, m1, [r3 + 29 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 29 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 476 * 16], m2 + + ; mode 32 [row 9] + pmaddubsw m2, m1, [r3 + 18 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 18 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 489 * 16], m2 + + ; mode 31 [row 13] + movd m7, [r2 + 15] + palignr m7, m1, 2 + pmaddubsw m2, m7, [r3 + 14 * 16] + pmulhrsw m2, m3 + movd m6, [r2 + 23] + palignr m6, m4, 2 + pmaddubsw m5, m6, [r3 + 14 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 477 * 16], m2 + + ; mode 31 [row 14] + pmaddubsw m2, m7, [r3 + 31 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 31 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 478 * 16], m2 + + ; mode 32 [row 10] + pmaddubsw m2, m7, [r3 + 7 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 7 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 490 * 16], m2 + + ; mode 32 [row 11] + pmaddubsw m2, m7, [r3 + 28 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 28 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 491 * 16], m2 + + ; mode 33 [row 8] + pmaddubsw m2, m7, [r3 + 10 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 10 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 504 * 16], m2 + + ; mode 31 [row 15] + movd m1, [r2 + 16] + palignr m1, m7, 2 + 
pmaddubsw m2, m1, [r3 + 16 * 16] + pmulhrsw m2, m3 + movd m4, [r2 + 24] + palignr m4, m6, 2 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 479 * 16], m2 + + ; mode 32 [row 12] + pmaddubsw m2, m1, [r3 + 17 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 17 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 492 * 16], m2 + + ; mode 33 [row 9] + pmaddubsw m2, m1, [r3 + 4 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 4 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 505 * 16], m2 + + ; mode 33 [row 10] + pmaddubsw m2, m1, [r3 + 30 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 30 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 506 * 16], m2 + + ; mode 33 [row 10] + pmaddubsw m2, m1, [r3 + 4 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 4 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 505 * 16], m2 + + ; mode 32 [row 13] + movd m7, [r2 + 17] + palignr m7, m1, 2 + pmaddubsw m2, m7, [r3 + 6 * 16] + pmulhrsw m2, m3 + + movd m6, [r2 + 25] + palignr m6, m4, 2 + pmaddubsw m5, m6, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 493 * 16], m2 + + ; mode 32 [row 14] + pmaddubsw m2, m7, [r3 + 27 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 27 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 494 * 16], m2 + + ; mode 33 [row 11] + pmaddubsw m2, m7, [r3 + 24 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m6, [r3 + 24 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 507 * 16], m2 + + ; mode 32 [row 15] + movd m1, [r2 + 18] + palignr m1, m7, 2 + pmaddubsw m2, m1, [r3 + 16 * 16] + pmulhrsw m2, m3 + psrldq m4, 2 + pinsrb m4, [r2 + 26], 14 + pinsrb m4, [r2 + 27], 15 + movd m4, [r2 + 26] + palignr m4, m6, 2 + pmaddubsw m5, m4, [r3 + 16 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 495 * 16], m2 + + ; mode 33 [row 12] + pmaddubsw m2, m1, [r3 + 18 * 16] + pmulhrsw m2, m3 + pmaddubsw m5, m4, [r3 + 18 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 508 * 16], m2 + + ; mode 33 
[row 13] + movd m7, [r2 + 19] + palignr m7, m1, 2 + pmaddubsw m2, m7, [r3 + 12 * 16] + pmulhrsw m2, m3 + movd m6, [r2 + 27] + palignr m6, m4, 2 + pmaddubsw m5, m6, [r3 + 12 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 509 * 16], m2 + + ; mode 33 [row 14] + movd m1, [r2 + 20] + palignr m1, m7, 2 + pmaddubsw m2, m1, [r3 + 6 * 16] + pmulhrsw m2, m3 + movd m4, [r2 + 28] + palignr m4, m6, 2 + pmaddubsw m5, m4, [r3 + 6 * 16] + pmulhrsw m5, m3 + packuswb m2, m5 + movu [r0 + 510 * 16], m2 + + ; mode 34 [row 0] + movu m1, [r2 + 2] + movu [r0 + 512 * 16], m1 + movu m2, [r2 + 18] + palignr m3, m2, m1, 1 + movu [r0 + 513 * 16], m3 + palignr m3, m2, m1, 2 + movu [r0 + 514 * 16], m3 + palignr m3, m2, m1, 3 + movu [r0 + 515 * 16], m3 + palignr m3, m2, m1, 4 + movu [r0 + 516 * 16], m3 + palignr m3, m2, m1, 5 + movu [r0 + 517 * 16], m3 + palignr m3, m2, m1, 6 + movu [r0 + 518 * 16], m3 + palignr m3, m2, m1, 7 + movu [r0 + 519 * 16], m3 + palignr m3, m2, m1, 8 + movu [r0 + 520 * 16], m3 + palignr m3, m2, m1, 9 + movu [r0 + 521 * 16], m3 + palignr m3, m2, m1, 10 + movu [r0 + 522 * 16], m3 + palignr m3, m2, m1, 11 + movu [r0 + 523 * 16], m3 + palignr m3, m2, m1, 12 + movu [r0 + 524 * 16], m3 + + ; mode 33 [row 15] + movu [r0 + 511 * 16], m3 + + ; mode 34 + palignr m3, m2, m1, 13 + movu [r0 + 525 * 16], m3 + palignr m3, m2, m1, 14 + movu [r0 + 526 * 16], m3 + palignr m3, m2, m1, 15 + movu [r0 + 527 * 16], m3 + RET + +;-------------------------------------------------------------------------------- +; void all_angs_pred_32x32(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) +;-------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal all_angs_pred_32x32, 3,7,8, 0-4 + mov r6d, [r1 + 64] + mov r3d, [r1] + mov [rsp], r6d + mov [r1 + 64], r3b + mov r3d, [r2] + mov r6d, [r2 + 64] + mov [r2 + 64], r3b + + lea r3, [r2] + lea r4, [r2 + 64] + lea r2, [r1 + 64] + + ;mode 2[row 0] + movu m0, [r4 + 2] + movu [r0 + 0 * 16], m0 + movu m1, 
[r4 + 18] + movu [r0 + 1 * 16], m1 + + ;mode 9 [row 15] + movu [r0 + 478 * 16], m0 + movu [r0 + 479 * 16], m1 + + ;mode 2[row 1] + movu m2, [r4 + 34] + palignr m3, m1, m0, 1 + movu [r0 + 2 * 16], m3 + palignr m4, m2, m1, 1 + movu [r0 + 3 * 16], m4 + + ; mode 9 [row 31] + movu [r0 + 510 * 16], m3 + movu [r0 + 511 * 16], m4 + + ;mode 2[row 17] + movu [r0 + 34 * 16], m4 + movu m5, [r4 + 35] + movu [r0 + 35 * 16], m5 + + ;mode 2[row 2] + palignr m3, m1, m0, 2 + movu [r0 + 4 * 16], m3 + palignr m4, m2, m1, 2 + movu [r0 + 5 * 16], m4 + + ;mode 2[row 18] + movu [r0 + 36 * 16], m4 + movu m6, [r4 + 51] + palignr m7, m6, m5, 1 + movu [r0 + 37 * 16], m7 + + ;mode 2[row 3] + palignr m3, m1, m0, 3 + movu [r0 + 6 * 16], m3 + palignr m4, m2, m1, 3 + movu [r0 + 7 * 16], m4 + + ;mode 2[row 19] + movu [r0 + 38 * 16], m4 + palignr m7, m6, m5, 2 + movu [r0 + 39 * 16], m7 + + ;mode 2[row 4] + palignr m3, m1, m0, 4 + movu [r0 + 8 * 16], m3 + palignr m4, m2, m1, 4 + movu [r0 + 9 * 16], m4 + + ; mode 8 [row 31] + movu [r0 + 446 * 16], m3 + movu [r0 + 447 * 16], m4 + + ;mode 2[row 20] + movu [r0 + 40 * 16], m4 + palignr m7, m6, m5, 3 + movu [r0 + 41 * 16], m7 + + ; mode 4 [row 31] + movu [r0 + 190 * 16], m4 + movu [r0 + 191 * 16], m7 + + ;mode 2[row 5] + palignr m3, m1, m0, 5 + movu [r0 + 10 * 16], m3 + palignr m4, m2, m1, 5 + movu [r0 + 11 * 16], m4 + + ;mode 2[row 21] + movu [r0 + 42 * 16], m4 + palignr m7, m6, m5, 4 + movu [r0 + 43 * 16], m7 + + ;mode 2[row 6] + palignr m3, m1, m0, 6 + movu [r0 + 12 * 16], m3 + palignr m4, m2, m1, 6 + movu [r0 + 13 * 16], m4 + + ;mode 2[row 22] + movu [r0 + 44 * 16], m4 + palignr m7, m6, m5, 5 + movu [r0 + 45 * 16], m7 + + ;mode 2[row 7] + palignr m3, m1, m0, 7 + movu [r0 + 14 * 16], m3 + palignr m4, m2, m1, 7 + movu [r0 + 15 * 16], m4 + + ;mode 2[row 23] + movu [r0 + 46 * 16], m4 + palignr m7, m6, m5, 6 + movu [r0 + 47 * 16], m7 + + ;mode 2[row 8] + palignr m3, m1, m0, 8 + movu [r0 + 16 * 16], m3 + palignr m4, m2, m1, 8 + movu [r0 + 17 * 16], m4 + + 
;mode 7[row 31] + movu [r0 + 382 * 16], m3 + movu [r0 + 383 * 16], m4 + + ;mode 2[row 24] + movu [r0 + 48 * 16], m4 + palignr m7, m6, m5, 7 + movu [r0 + 49 * 16], m7 + + ;mode 2[row 9] + palignr m3, m1, m0, 9 + movu [r0 + 18 * 16], m3 + palignr m4, m2, m1, 9 + movu [r0 + 19 * 16], m4 + + ;mode 2[row 25] + movu [r0 + 50 * 16], m4 + palignr m7, m6, m5, 8 + movu [r0 + 51 * 16], m7 + + ; mode 3 [row 31] + movu [r0 + 126 * 16], m4 + movu [r0 + 127 * 16], m7 + + ;mode 2[row 10] + palignr m3, m1, m0, 10 + movu [r0 + 20 * 16], m3 + palignr m4, m2, m1, 10 + movu [r0 + 21 * 16], m4 + + ;mode 2[row 26] + movu [r0 + 52 * 16], m4 + palignr m7, m6, m5, 9 + movu [r0 + 53 * 16], m7 + + ;mode 2[row 11] + palignr m3, m1, m0, 11 + movu [r0 + 22 * 16], m3 + palignr m4, m2, m1, 11 + movu [r0 + 23 * 16], m4 + + ;mode 2[row 27] + movu [r0 + 54 * 16], m4 + palignr m7, m6, m5, 10 + movu [r0 + 55 * 16], m7 + + ;mode 2[row 12] + palignr m3, m1, m0, 12 + movu [r0 + 24 * 16], m3 + palignr m4, m2, m1, 12 + movu [r0 + 25 * 16], m4 + + ; mode 6 [row 31] + movu [r0 + 318 * 16], m3 + movu [r0 + 319 * 16], m4 + + ; mode 3 [row 15] + movu [r0 + 94 * 16], m3 + movu [r0 + 95 * 16], m4 + + ;mode 2[row 28] + movu [r0 + 56 * 16], m4 + palignr m7, m6, m5, 11 + movu [r0 + 57 * 16], m7 + + ;mode 2[row 13] + palignr m3, m1, m0, 13 + movu [r0 + 26 * 16], m3 + palignr m4, m2, m1, 13 + movu [r0 + 27 * 16], m4 + + ;mode 2[row 29] + movu [r0 + 58 * 16], m4 + palignr m7, m6, m5, 12 + movu [r0 + 59 * 16], m7 + + ;mode 2[row 14] + palignr m3, m1, m0, 14 + movu [r0 + 28 * 16], m3 + palignr m4, m2, m1, 14 + movu [r0 + 29 * 16], m4 + + ;mode 2[row 30] + movu [r0 + 60 * 16], m4 + palignr m7, m6, m5, 13 + movu [r0 + 61 * 16], m7 + + ;mode 2[row 15] + palignr m3, m1, m0, 15 + movu [r0 + 30 * 16], m3 + palignr m4, m2, m1, 15 + movu [r0 + 31 * 16], m4 + + ;mode 2[row 31] + movu [r0 + 62 * 16], m4 + palignr m7, m6, m5, 14 + movu [r0 + 63 * 16], m7 + + ;mode 2[row 16] + movu [r0 + 32 * 16], m1 + movu [r0 + 33 * 16], m2 + + ; 
mode 5[row 31] + movu [r0 + 254 * 16], m1 + movu [r0 + 255 * 16], m2 + + ; mode 3 [row 0] + lea r5, [ang_table] + movu m6, [r5 + 26 * 16] + movu m7, [pw_1024 ] + movu m1, [r4 + 1 ] + punpcklbw m1, m0 + pmaddubsw m0, m1, m6 + pmulhrsw m0, m7 + movu m2, [r4 + 9] + movd m3, [r4 + 10] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m0, m3 + movu [r0 + 64 * 16], m0 + + ; mode 6 [row 1 - first half] + movu [r0 + 258 * 16], m0 + + ; mode 9 [row 12 - first half] + movu [r0 + 472 * 16], m0 + + movu m0, [r4 + 17] + movd m3, [r4 + 18] + palignr m3, m0, 1 + punpcklbw m0, m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 25] + movd m5, [r4 + 26] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 65 * 16], m3 + + ; mode 6 [row 1 - second half] + movu [r0 + 259 * 16], m3 + + ; mode 9 [row 12 - second half] + movu [r0 + 473 * 16], m3 + + ; mode 4 [row 0] + movu m6, [r5 + 21 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 128 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 129 * 16], m3 + + ; mode 5 [row 0] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 192 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 193 * 16], m3 + + ; mode 6 [row 0] + movu m6, [r5 + 13 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 256 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 257 * 16], m3 + + ; mode 7 [row 0] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu 
[r0 + 320 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 321 * 16], m3 + + ; mode 7 [row 1] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 322 * 16], m3 + + ; mode 9 [row 8 - first half] + movu [r0 + 464 * 16], m3 + + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 323 * 16], m3 + + ; mode 9 [row 8 - second half] + movu [r0 + 465 * 16], m3 + + ; mode 7 [row 2] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 324 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 325 * 16], m3 + + ; mode 8 [row 0] + movu m6, [r5 + 5 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 384 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 385 * 16], m3 + + ; mode 8 [row 1] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 386 * 16], m3 + + ; mode 9 [row 4 - first half] + movu [r0 + 456 * 16], m3 + + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 387 * 16], m3 + + ; mode 9 [row 4 - second half] + movu [r0 + 457 * 16], m3 + + ; mode 8 [row 2] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 388 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 389 * 16], m3 + + ; mode 8 [row 3] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + 
pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 390 * 16], m3 + + ; mode 9 [row 9 - first half] + movu [r0 + 466 * 16], m3 + + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 391 * 16], m3 + + ; mode 9 [row 9 - second half] + movu [r0 + 467 * 16], m3 + + ; mode 8 [row 4] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 392 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 393 * 16], m3 + + ; mode 8 [row 5] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 394 * 16], m3 + + ; mode 9 [row 14 - first half] + movu [r0 + 476 * 16], m3 + + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 395 * 16], m3 + + ; mode 9 [row 14 - second half] + movu [r0 + 477 * 16], m3 + + ; mode 9 [row 0] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 448 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 449 * 16], m3 + + ; mode 9 [row 1] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 450 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 451 * 16], m3 + + ; mode 9 [row 2] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 452 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 453 * 16], m3 + + ; mode 9 [row 3] + movu m6, 
[r5 + 8 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 454 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 455 * 16], m3 + + ; mode 9 [row 5] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 458 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 459 * 16], m3 + + ; mode 9 [row 6] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 460 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 461 * 16], m3 + + ; mode 9 [row 7] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 462 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 463 * 16], m3 + + ; mode 9 [row 10] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 468 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 469 * 16], m3 + + ; mode 9 [row 11] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 470 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 471 * 16], m3 + + ; mode 9 [row 13] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 474 * 16], m3 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + 
pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 475 * 16], m3 + + ; mode 3 [row 1] + movu m6, [r5 + 20 * 16] + movu m0, [r4 + 2] + movd m1, [r4 + 3] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 10] + movd m3, [r4 + 11] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 66 * 16], m1 + + ; mode 6 [row 3 - first half] + movu [r0 + 262 * 16], m1 + + ; mode 9 [row 25 - first half] + movu [r0 + 498 * 16], m1 + + movu m1, [r4 + 18] + movd m3, [r4 + 19] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 26] + movd m5, [r4 + 27] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 67 * 16], m3 + + ; mode 6 [row 3 - second half] + movu [r0 + 263 * 16], m3 + + ; mode 9 [row 25 - second half] + movu [r0 + 499 * 16], m3 + + ; mode 4 [row 1] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 130 * 16], m3 + + ; mode 9 [row 20 - first half] + movu [r0 + 488 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 131 * 16], m3 + + ; mode 9 [row 20 - second half] + movu [r0 + 489 * 16], m3 + + ; mode 4 [row 2] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 132 * 16], m3 + + ; mode 7 [row 6 - first half] + movu [r0 + 332 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 133 * 16], m3 + + ; mode 7 [row 6 - second half] + movu [r0 + 333 * 16], m3 + + ; mode 5 [row 1] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 194 * 16], m3 + + ; mode 5 [row 
1 - first half] + movu [r0 + 480 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 195 * 16], m3 + + ; mode 5 [row 1 - second half] + movu [r0 + 481 * 16], m3 + + ; mode 5 [row 2] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 196 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 197 * 16], m3 + + ; mode 6 [row 2] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 260 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 261 * 16], m3 + + ; mode 7 [row 3] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 326 * 16], m3 + + ; mode 9 [row 17 - first half] + movu [r0 + 482 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 327 * 16], m3 + + ; mode 9 [row 17 - second half] + movu [r0 + 483 * 16], m3 + + ; mode 7 [row 4] + movu m6, [r5 + 13 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 328 * 16], m3 + + ; mode 8 [row 8 - first half] + movu [r0 + 400 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 329 * 16], m3 + + ; mode 8 [row 8 - second half] + movu [r0 + 401 * 16], m3 + + ; mode 7 [row 5] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 330 * 16], m3 + + ; mode 9 [row 26 - first half] + movu [r0 + 500 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 331 * 16], m3 + + ; mode 9 [row 26 - second half] + movu [r0 + 501 * 16], m3 + + ; mode 8 [row 6] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 396 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 397 * 16], m3 + + ; mode 9 [row 18] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 484 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 485 * 16], m3 + + ; mode 9 [row 21] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 490 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 491 * 16], m3 + + ; mode 9 [row 22] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 492 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 493 * 16], m3 + + ; mode 9 [row 23] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 494 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 495 * 16], m3 + + ; mode 9 [row 27] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 502 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 503 * 16], m3 + + ; mode 9 [row 28] + movu m6, [r5 + 26 * 16] + pmaddubsw m3, 
m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 504 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 505 * 16], m3 + + ; mode 9 [row 30] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 508 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 509 * 16], m3 + + ; mode 8 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 398 * 16], m3 + + ; mode 9 [row 19 - first half] + movu [r0 + 486 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 399 * 16], m3 + + ; mode 9 [row 19 - second half] + movu [r0 + 487 * 16], m3 + + ; mode 8 [row 9] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 402 * 16], m3 + + ; mode 9 [row 24 - first half] + movu [r0 + 496 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 403 * 16], m3 + + ; mode 9 [row 24 - second half] + movu [r0 + 497 * 16], m3 + + ; mode 8 [row 10] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 404 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 405 * 16], m3 + + ; mode 8 [row 11] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 406 * 16], m3 + + ; mode 9 [row 29 - first half] + movu [r0 + 506 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 
+ pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 407 * 16], m3 + + ; mode 9 [row 29 - second half] + movu [r0 + 507 * 16], m3 + + ; mode 3 [row 2] + movu m6, [r5 + 14 * 16] + movu m0, [r4 + 3] + movd m1, [r4 + 4] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 11] + movd m3, [r4 + 12] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 68 * 16], m1 + + ; mode 3 [row 2 - first half] + movu [r0 + 266 * 16], m1 + + movu m1, [r4 + 19] + movd m3, [r4 + 20] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 27] + movd m5, [r4 + 28] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 69 * 16], m3 + + ; mode 3 [row 2 - second half] + movu [r0 + 267 * 16], m3 + + ; mode 4 [row 3] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 134 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 135 * 16], m3 + + ; mode 5 [row 3] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 198 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 199 * 16], m3 + + ; mode 5 [row 4] + movu m6, [r5 + 21 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 200 * 16], m3 + + ; mode 8 [row 16 - first half] + movu [r0 + 416 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 201 * 16], m3 + + ; mode 8 [row 16 - second half] + movu [r0 + 417 * 16], m3 + + ; mode 6 [row 4] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw 
m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 264 * 16], m3 + + ; mode 6 [row 4 - first half] + movu [r0 + 408 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 265 * 16], m3 + + ; mode 6 [row 4 - second half] + movu [r0 + 409 * 16], m3 + + ; mode 6 [row 6] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 268 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 269 * 16], m3 + + ; mode 7 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 334 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 335 * 16], m3 + + ; mode 7 [row 8] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 336 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 337 * 16], m3 + + ; mode 7 [row 9] + movu m6, [r5 + 26 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 338 * 16], m3 + + ; mode 8 [row 17 - first half] + movu [r0 + 418 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 339 * 16], m3 + + ; mode 8 [row 17 - second half] + movu [r0 + 419 * 16], m3 + + ; mode 8 [row 13] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 410 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 411 * 16], m3 + + ; mode 8 [row 14] + movu m6, [r5 + 11 
* 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 412 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 413 * 16], m3 + + ; mode 8 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 414 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 415 * 16], m3 + + ; mode 8 [row 18] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 420 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 421 * 16], m3 + + ; mode 3 [row 3] + movu m6, [r5 + 8 * 16] + movu m0, [r4 + 4] + movd m1, [r4 + 5] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 12] + movd m3, [r4 + 13] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 70 * 16], m1 + + ; mode 6 [row 7 - first half] + movu [r0 + 270 * 16], m1 + + movu m1, [r4 + 20] + movd m3, [r4 + 21] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 28] + movd m5, [r4 + 29] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 71 * 16], m3 + + ; mode 6 [row 7 - second half] + movu [r0 + 271 * 16], m3 + + ; mode 4 [row 4] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 136 * 16], m3 + + ; mode 4 [row 4 - first half] + movu [r0 + 424 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 137 * 16], m3 + + ; mode 4 [row 4 - 
second half] + movu [r0 + 425 * 16], m3 + + ; mode 4 [row 5] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 138 * 16], m3 + + ; mode 7 [row 13 - first half] + movu [r0 + 346 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 139 * 16], m3 + + ; mode 7 [row 13 - second half] + movu [r0 + 347 * 16], m3 + + ; mode 5 [row 5] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 202 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 203 * 16], m3 + + ; mode 5 [row 6] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 204 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 205 * 16], m3 + + ; mode 6 [row 8] + movu m6, [r5 + 21 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 272 * 16], m3 + + ; mode 7 [row 12 - first half] + movu [r0 + 344 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 273 * 16], m3 + + ; mode 7 [row 12 - second half] + movu [r0 + 345 * 16], m3 + + ; mode 7 [row 10] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 340 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 341 * 16], m3 + + ; mode 7 [row 11] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 342 * 16], m3 + pmaddubsw m3, m1, m6 + 
pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 343 * 16], m3 + + ; mode 8 [row 19] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 422 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 423 * 16], m3 + + ; mode 8 [row 21] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 426 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 427 * 16], m3 + + ; mode 8 [row 22] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 428 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 429 * 16], m3 + + ; mode 8 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 430 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 431 * 16], m3 + + ; mode 8 [row 24] + movu m6, [r5 + 29 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 432 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 433 * 16], m3 + + ; mode 3 [row 4] + movu m6, [r5 + 2 * 16] + movu m0, [r4 + 5] + movd m1, [r4 + 6] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 13] + movd m3, [r4 + 14] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 72 * 16], m1 + + ; mode 3 [row 4 - first half] + movu [r0 + 274 * 
16], m1 + + ; mode 8 [row 25 - first half] + movu [r0 + 434 * 16], m1 + + movu m1, [r4 + 21] + movd m3, [r4 + 22] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 29] + movd m5, [r4 + 30] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 73 * 16], m3 + + ; mode 3 [row 4 - second half] + movu [r0 + 275 * 16], m3 + + ; mode 8 [row 25 - second half] + movu [r0 + 435 * 16], m3 + + ; mode 3 [row 5] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 74 * 16], m3 + + ; mode 3 [row 5 - first half] + movu [r0 + 278 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 75 * 16], m3 + + ; mode 3 [row 5 - second half] + movu [r0 + 279 * 16], m3 + + ; mode 4 [row 6] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 140 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 141 * 16], m3 + + ; mode 5 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 206 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 207 * 16], m3 + + ; mode 5 [row 8] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 208 * 16], m3 + + ; mode 7 [row 16 - first half] + movu [r0 + 352 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 209 * 16], m3 + + ; mode 7 [row 16 - second half] + movu [r0 + 353 * 16], m3 + + ; mode 6 [row 10] + movu m6, [r5 + 15 * 16] + pmaddubsw 
m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 276 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 277 * 16], m3 + + ; mode 7 [row 14] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 348 * 16], m3 + + ; mode 8 [row 26 - first half] + movu [r0 + 436 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 349 * 16], m3 + + ; mode 8 [row 26 - second half] + movu [r0 + 437 * 16], m3 + + ; mode 7 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 350 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 351 * 16], m3 + + ; mode 8 [row 27] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 438 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 439 * 16], m3 + + ; mode 8 [row 28] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 440 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 441 * 16], m3 + + ; mode 8 [row 29] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 442 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 443 * 16], m3 + + ; mode 8 [row 30] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 444 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 445 * 16], m3 + + ; mode 3 [row 6] + movu m6, [r5 + 22 * 16] + movu m0, [r4 + 6] + movd m1, [r4 + 7] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 14] + movd m3, [r4 + 15] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 76 * 16], m1 + + ; mode 6 [row 13 - first half] + movu [r0 + 282 * 16], m1 + + movu m1, [r4 + 22] + movd m3, [r4 + 23] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 30] + movd m5, [r4 + 31] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 77 * 16], m3 + + ; mode 6 [row 13 - second half] + movu [r0 + 283 * 16], m3 + + ; mode 4 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 142 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 143 * 16], m3 + + ; mode 4 [row 8] + movu m6, [r5 + 29 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 144 * 16], m3 + + ; mode 4 [row 8 - first half] + movu [r0 + 360 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 145 * 16], m3 + + ; mode 4 [row 8 - second half] + movu [r0 + 361 * 16], m3 + + ; mode 5 [row 9] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 210 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 211 * 16], m3 + + ; mode 5 [row 10] + movu m6, [r5 
+ 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 212 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 213 * 16], m3 + + ; mode 7 [row 17] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 354 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 355 * 16], m3 + + ; mode 7 [row 18] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 356 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 357 * 16], m3 + + ; mode 7 [row 19] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 358 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 359 * 16], m3 + + ; mode 6 [row 12] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 280 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 281 * 16], m3 + + ; mode 3 [row 7] + movu m6, [r5 + 16 * 16] + movu m0, [r4 + 7] + movd m1, [r4 + 8] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 15] + movd m3, [r4 + 16] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 78 * 16], m1 + + ; mode 6 [row 15 - first half] + movu [r0 + 286 * 16], m1 + + movu m1, [r4 + 23] + movd m3, [r4 + 24] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu 
m4, [r4 + 31] + movd m5, [r4 + 32] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 79 * 16], m3 + + ; mode 6 [row 15 - second half] + movu [r0 + 287 * 16], m3 + + ; mode 4 [row 9] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 146 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 147 * 16], m3 + + ; mode 5 [row 11] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 214 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 215 * 16], m3 + + ; mode 5 [row 12] + movu m6, [r5 + 29 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 216 * 16], m3 + + ; mode 6 [row 16 - first half] + movu [r0 + 288 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 217 * 16], m3 + + ; mode 6 [row 16 - second half] + movu [r0 + 289 * 16], m3 + + ; mode 6 [row 14] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 284 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 285 * 16], m3 + + ; mode 7 [row 21] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 362 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 363 * 16], m3 + + ; mode 7 [row 22] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb 
m3, m5 + movu [r0 + 364 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 365 * 16], m3 + + ; mode 7 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 366 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 367 * 16], m3 + + ; mode 3 [row 8] + movu m6, [r5 + 10 * 16] + movu m0, [r4 + 8] + movd m1, [r4 + 9] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 16] + movd m3, [r4 + 17] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 80 * 16], m1 + + ; mode 7 [row 25 - first half] + movu [r0 + 290 * 16], m1 + + ; mode 6 [row 17 - first half] + movu [r0 + 370 * 16], m1 + + movu m1, [r4 + 24] + movd m3, [r4 + 25] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 32] + movd m5, [r4 + 33] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 81 * 16], m3 + + ; mode 7 [row 25 - second half] + movu [r0 + 291 * 16], m3 + + ; mode 6 [row 17 - second half] + movu [r0 + 371 * 16], m3 + + ; mode 4 [row 10] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 148 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 149 * 16], m3 + + ; mode 4 [row 11] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 150 * 16], m3 + + ; mode 7 [row 27 - first half] + movu [r0 + 374 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 151 * 16], 
m3 + + ; mode 7 [row 27 - second half] + movu [r0 + 375 * 16], m3 + + ; mode 5 [row 13] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 218 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 219 * 16], m3 + + ; mode 5 [row 14] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 220 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 221 * 16], m3 + + ; mode 6 [row 18] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 292 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 293 * 16], m3 + + ; mode 7 [row 24] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 368 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 369 * 16], m3 + + ; mode 7 [row 26] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 372 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 373 * 16], m3 + + ; mode 3 [row 9] + movu m6, [r5 + 4 * 16] + movu m0, [r4 + 9] + movd m1, [r4 + 10] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 17] + movd m3, [r4 + 18] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 82 * 16], m1 + + ; mode 6 [row 19 - first half] + movu [r0 + 294 * 16], m1 + + movu m1, [r4 + 25] + movd 
m3, [r4 + 26] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 33] + movd m5, [r4 + 34] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 83 * 16], m3 + + ; mode 6 [row 19 - second half] + movu [r0 + 295 * 16], m3 + + ; mode 4 [row 12] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 152 * 16], m3 + + ; mode 4 [row 12 - first half] + movu [r0 + 296 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 153 * 16], m3 + + ; mode 4 [row 12 - second half] + movu [r0 + 297 * 16], m3 + + ; mode 3 [row 10] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 84 * 16], m3 + + ; mode 6 [row 21 - first half] + movu [r0 + 298 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 85 * 16], m3 + + ; mode 6 [row 21 - second half] + movu [r0 + 299 * 16], m3 + + ; mode 5 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 222 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 223 * 16], m3 + + ; mode 7 [row 28] + movu m6, [r5 + 5 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 376 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 377 * 16], m3 + + ; mode 7 [row 29] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 378 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + 
pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 379 * 16], m3 + + ; mode 7 [row 30] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 380 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 381 * 16], m3 + + ; mode 3 [row 11] + movu m6, [r5 + 24 * 16] + movu m0, [r4 + 10] + movd m1, [r4 + 11] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 18] + movd m3, [r4 + 19] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 86 * 16], m1 + + ; mode 6 [row 23 - first half] + movu [r0 + 302 * 16], m1 + + movu m1, [r4 + 26] + movd m3, [r4 + 27] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 34] + movd m5, [r4 + 35] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 87 * 16], m3 + + ; mode 6 [row 23 - second half] + movu [r0 + 303 * 16], m3 + + ; mode 4 [row 13] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 154 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 155 * 16], m3 + + ; mode 4 [row 14] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 156 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 157 * 16], m3 + + ; mode 5 [row 16] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 224 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + 
packuswb m3, m5 + movu [r0 + 225 * 16], m3 + + ; mode 5 [row 17] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 226 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 227 * 16], m3 + + ; mode 6 [row 22] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 300 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 301 * 16], m3 + + ; mode 3 [row 12] + movu m6, [r5 + 18 * 16] + movu m0, [r4 + 11] + movd m1, [r4 + 12] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 19] + movd m3, [r4 + 20] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 88 * 16], m1 + + ; mode 6 [row 25 - first half] + movu [r0 + 306 * 16], m1 + + movu m1, [r4 + 27] + movd m3, [r4 + 28] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 35] + movd m5, [r4 + 36] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 89 * 16], m3 + + ; mode 6 [row 25 - second half] + movu [r0 + 307 * 16], m3 + + ; mode 4 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 158 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 159 * 16], m3 + + ; mode 5 [row 18] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 228 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 229 * 16], 
m3 + + ; mode 5 [row 19] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 230 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 231 * 16], m3 + + ; mode 6 [row 24] + movu m6, [r5 + 5 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 304 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 305 * 16], m3 + + ; mode 6 [row 26] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 308 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 309 * 16], m3 + + ; mode 3 [row 13] + movu m6, [r5 + 12 * 16] + movu m0, [r4 + 12] + movd m1, [r4 + 13] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 20] + movd m3, [r4 + 21] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 90 * 16], m1 + + movu m1, [r4 + 28] + movd m3, [r4 + 29] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 36] + movd m5, [r4 + 37] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 91 * 16], m3 + + ; mode 4 [row 16] + movu m6, [r5 + 5 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 160 * 16], m3 + + ; mode 5 [row 20 - first half] + movu [r0 + 232 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 161 * 16], m3 + + ; mode 5 [row 20 - second half] + movu [r0 + 233 * 16], m3 + + ; mode 4 [row 17] + movu m6, [r5 
+ 26 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 162 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 163 * 16], m3 + + ; mode 5 [row 21] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 234 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 235 * 16], m3 + + ; mode 6 [row 27] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 310 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 311 * 16], m3 + + ; mode 6 [row 28] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 312 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 313 * 16], m3 + + ; mode 3 [row 14] + movu m6, [r5 + 6 * 16] + movu m0, [r4 + 13] + movd m1, [r4 + 14] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 21] + movd m3, [r4 + 22] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 92 * 16], m1 + + ; mode 6 [row 29 - first half] + movu [r0 + 314 * 16], m1 + + movu m1, [r4 + 29] + movd m3, [r4 + 30] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 37] + movd m5, [r4 + 38] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 93 * 16], m3 + + ; mode 6 [row 29 - second half] + movu [r0 + 315 * 16], m3 + + ; mode 4 [row 18] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m0, m6 + 
pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 164 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 165 * 16], m3 + + ; mode 5 [row 22] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 236 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 237 * 16], m3 + + ; mode 5 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 238 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 239 * 16], m3 + + ; mode 6 [row 30] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 316 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 317 * 16], m3 + + ; mode 3 [row 16] + movu m6, [r5 + 26 * 16] + movu m0, [r4 + 14] + movd m1, [r4 + 15] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 22] + movd m3, [r4 + 23] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 96 * 16], m1 + + ; mode 5 [row 25 - first half] + movu [r0 + 242 * 16], m1 + + movu m1, [r4 + 30] + movd m3, [r4 + 31] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 38] + movd m5, [r4 + 39] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 97 * 16], m3 + + ; mode 5 [row 25 - second half] + movu [r0 + 243 * 16], m3 + + ; mode 4 [row 19] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 166 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 167 * 16], m3 + + ; mode 4 [row 20] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 168 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 169 * 16], m3 + + ; mode 5 [row 24] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 240 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 241 * 16], m3 + + ; mode 3 [row 17] + movu m6, [r5 + 20 * 16] + movu m0, [r4 + 15] + movd m1, [r4 + 16] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 23] + movd m3, [r4 + 24] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 98 * 16], m1 + + movu m1, [r4 + 31] + movd m3, [r4 + 32] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 39] + movd m5, [r4 + 40] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 99 * 16], m3 + + ; mode 4 [row 21] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 170 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 171 * 16], m3 + + ; mode 5 [row 26] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 244 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + 
movu [r0 + 245 * 16], m3 + + ; mode 5 [row 27] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 246 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 247 * 16], m3 + + ; mode 3 [row 18] + movu m6, [r5 + 14 * 16] + movu m0, [r4 + 16] + movd m1, [r4 + 17] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 24] + movd m3, [r4 + 25] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 100 * 16], m1 + + movu m1, [r4 + 32] + movd m3, [r4 + 33] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 40] + movd m5, [r4 + 41] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 101 * 16], m3 + + ; mode 4 [row 22] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 172 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 173 * 16], m3 + + ; mode 4 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 174 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 175 * 16], m3 + + ; mode 5 [row 28] + movu m6, [r5 + 13 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 248 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 249 * 16], m3 + + ; mode 5 [row 29] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + 
packuswb m3, m5 + movu [r0 + 250 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 251 * 16], m3 + + ; mode 3 [row 19] + movu m6, [r5 + 8 * 16] + movu m0, [r4 + 17] + movd m1, [r4 + 18] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 25] + movd m3, [r4 + 26] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 102 * 16], m1 + + movu m1, [r4 + 33] + movd m3, [r4 + 34] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 41] + movd m5, [r4 + 42] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 103 * 16], m3 + + ; mode 4 [row 24] + movu m6, [r5 + 13 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 176 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 177 * 16], m3 + + ; mode 5 [row 30] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 252 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 253 * 16], m3 + + ; mode 3 [row 20] + movu m6, [r5 + 2 * 16] + movu m0, [r4 + 18] + movd m1, [r4 + 19] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 26] + movd m3, [r4 + 27] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 104 * 16], m1 + + movu m1, [r4 + 34] + movd m3, [r4 + 35] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 42] + movd m5, [r4 + 43] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, 
m5 + movu [r0 + 105 * 16], m3 + + ; mode 4 [row 25] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 178 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 179 * 16], m3 + + ; mode 4 [row 26] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 180 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 181 * 16], m3 + + ; mode 3 [row 21] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 106 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 107 * 16], m3 + + ; mode 3 [row 22] + movu m6, [r5 + 22 * 16] + movu m0, [r4 + 19] + movd m1, [r4 + 20] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 27] + movd m3, [r4 + 28] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 108 * 16], m1 + + movu m1, [r4 + 35] + movd m3, [r4 + 36] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 43] + movd m5, [r4 + 44] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 109 * 16], m3 + + ; mode 4 [row 27] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 182 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 183 * 16], m3 + + ; mode 3 [row 23] + movu m6, [r5 + 16 * 16] + movu m0, [r4 + 20] + movd m1, [r4 + 21] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + 
pmulhrsw m1, m7 + movu m2, [r4 + 28] + movd m3, [r4 + 29] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 110 * 16], m1 + + movu m1, [r4 + 36] + movd m3, [r4 + 37] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 44] + movd m5, [r4 + 45] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 111 * 16], m3 + + ; mode 4 [row 28] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 184 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 185 * 16], m3 + + ; mode 4 [row 29] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 186 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 187 * 16], m3 + + ; mode 3 [row 24] + movu m6, [r5 + 10 * 16] + movu m0, [r4 + 21] + movd m1, [r4 + 22] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 29] + movd m3, [r4 + 30] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 112 * 16], m1 + + movu m1, [r4 + 37] + movd m3, [r4 + 38] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 45] + movd m5, [r4 + 46] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 113 * 16], m3 + + ; mode 4 [row 30] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 188 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 189 * 
16], m3 + + ; mode 3 [row 25] + movu m6, [r5 + 4 * 16] + movu m0, [r4 + 22] + movd m1, [r4 + 23] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 30] + movd m3, [r4 + 31] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 114 * 16], m1 + + movu m1, [r4 + 38] + movd m3, [r4 + 39] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 46] + movd m5, [r4 + 47] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 115 * 16], m3 + + ; mode 3 [row 26] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 116 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 117 * 16], m3 + + ; mode 3 [row 27] + movu m6, [r5 + 24 * 16] + movu m0, [r4 + 23] + movd m1, [r4 + 24] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 31] + movd m3, [r4 + 32] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 118 * 16], m1 + + movu m1, [r4 + 39] + movd m3, [r4 + 40] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 47] + movd m5, [r4 + 48] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 119 * 16], m3 + + ; mode 3 [row 28] + movu m6, [r5 + 18 * 16] + movu m0, [r4 + 24] + movd m1, [r4 + 25] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 32] + movd m3, [r4 + 33] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 120 * 16], m1 + + movu m1, [r4 + 40] + movd m3, [r4 + 41] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw 
m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 48] + movd m5, [r4 + 49] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 121 * 16], m3 + + ; mode 3 [row 29] + movu m6, [r5 + 12 * 16] + movu m0, [r4 + 25] + movd m1, [r4 + 26] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 33] + movd m3, [r4 + 34] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 122 * 16], m1 + + movu m1, [r4 + 41] + movd m3, [r4 + 42] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 49] + movd m5, [r4 + 50] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 123 * 16], m3 + + ; mode 3 [row 30] + movu m6, [r5 + 6 * 16] + movu m0, [r4 + 26] + movd m1, [r4 + 27] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r4 + 34] + movd m3, [r4 + 35] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 124 * 16], m1 + + movu m1, [r4 + 42] + movd m3, [r4 + 43] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r4 + 50] + movd m5, [r4 + 51] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 125 * 16], m3 + + ; mode 10 + movu m1, [r2 + 1] + movu m2, [r2 + 17] + movu [r0 + 512 * 16], m1 + movu [r0 + 513 * 16], m2 + movu [r0 + 514 * 16], m1 + movu [r0 + 515 * 16], m2 + movu [r0 + 516 * 16], m1 + movu [r0 + 517 * 16], m2 + movu [r0 + 518 * 16], m1 + movu [r0 + 519 * 16], m2 + movu [r0 + 520 * 16], m1 + movu [r0 + 521 * 16], m2 + movu [r0 + 522 * 16], m1 + movu [r0 + 523 * 16], m2 + movu [r0 + 524 * 16], m1 + movu [r0 + 525 * 16], m2 + movu [r0 + 526 * 16], m1 + movu [r0 + 527 * 16], m2 + + movu [r0 + 528 * 16], m1 + movu [r0 + 529 * 16], m2 + 
movu [r0 + 530 * 16], m1 + movu [r0 + 531 * 16], m2 + movu [r0 + 532 * 16], m1 + movu [r0 + 533 * 16], m2 + movu [r0 + 534 * 16], m1 + movu [r0 + 535 * 16], m2 + movu [r0 + 536 * 16], m1 + movu [r0 + 537 * 16], m2 + movu [r0 + 538 * 16], m1 + movu [r0 + 539 * 16], m2 + movu [r0 + 540 * 16], m1 + movu [r0 + 541 * 16], m2 + movu [r0 + 542 * 16], m1 + movu [r0 + 543 * 16], m2 + + movu [r0 + 544 * 16], m1 + movu [r0 + 545 * 16], m2 + movu [r0 + 546 * 16], m1 + movu [r0 + 547 * 16], m2 + movu [r0 + 548 * 16], m1 + movu [r0 + 549 * 16], m2 + movu [r0 + 550 * 16], m1 + movu [r0 + 551 * 16], m2 + movu [r0 + 552 * 16], m1 + movu [r0 + 553 * 16], m2 + movu [r0 + 554 * 16], m1 + movu [r0 + 555 * 16], m2 + movu [r0 + 556 * 16], m1 + movu [r0 + 557 * 16], m2 + movu [r0 + 558 * 16], m1 + movu [r0 + 559 * 16], m2 + + movu [r0 + 560 * 16], m1 + movu [r0 + 561 * 16], m2 + movu [r0 + 562 * 16], m1 + movu [r0 + 563 * 16], m2 + movu [r0 + 564 * 16], m1 + movu [r0 + 565 * 16], m2 + movu [r0 + 566 * 16], m1 + movu [r0 + 567 * 16], m2 + movu [r0 + 568 * 16], m1 + movu [r0 + 569 * 16], m2 + movu [r0 + 570 * 16], m1 + movu [r0 + 571 * 16], m2 + movu [r0 + 572 * 16], m1 + movu [r0 + 573 * 16], m2 + movu [r0 + 574 * 16], m1 + movu [r0 + 575 * 16], m2 + + ; mode 11 [row 0] + movu m0, [r4] + + ; mode 11 [row 15 - first half] + movu [r0 + 606 * 16], m0 + + movu [r0 + 606 * 16], m0 + + ; mode 12 [row 31] + pslldq m6, m0, 4 + pinsrb m6, [r3 + 26], 0 + pinsrb m6, [r3 + 19], 1 + pinsrb m6, [r3 + 13], 2 + pinsrb m6, [r3 + 6], 3 + movu [r0 + 702 * 16], m6 + movu m6, [r4 + 12] + movu [r0 + 703 * 16], m6 + + ; mode 11 [row 31] + pslldq m6, m0, 1 + pinsrb m6, [r3 + 16], 0 + movu [r0 + 638 * 16], m6 + movu m6, [r4 + 15] + movu [r0 + 639 * 16], m6 + + movd m1, [r4 + 1] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m1, m0, [r5 + 30 * 16] + pmulhrsw m1, m7 + movu m2, [r4 + 8] + movd m3, [r4 + 9] + palignr m3, m2, 1 + punpcklbw m2, m3 + pmaddubsw m3, m2, [r5 + 30 * 16] + pmulhrsw m3, m7 + packuswb m1, 
m3 + movu [r0 + 576 * 16], m1 + + movu m1, [r4 + 16] + + ; mode 11 [row 15 - second half] + movu [r0 + 607 * 16], m1 + + movd m3, [r4 + 17] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, [r5 + 30 * 16] + pmulhrsw m3, m7 + movu m4, [r4 + 24] + movd m5, [r4 + 25] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, [r5 + 30 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 577 * 16], m3 + + ; mode 11 [row 1] + pmaddubsw m3, m0, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 578 * 16], m3 + pmaddubsw m3, m1, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 579 * 16], m3 + + ; mode 11 [row 2] + pmaddubsw m3, m0, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 580 * 16], m3 + pmaddubsw m3, m1, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 581 * 16], m3 + + ; mode 11 [row 3] + pmaddubsw m3, m0, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 582 * 16], m3 + pmaddubsw m3, m1, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 583 * 16], m3 + + ; mode 11 [row 4] + pmaddubsw m3, m0, [r5 + 22 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 22 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 584 * 16], m3 + + ; mode 12 [row 1 - first half] + movu [r0 + 642 * 16], m3 + + pmaddubsw m3, m1, [r5 + 22 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 22 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 585 * 16], m3 + + ; mode 12 [row 1 - second half] + movu [r0 + 643 * 16], m3 + + ; mode 11 [row 5] + pmaddubsw m3, m0, [r5 + 20 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 20 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 586 * 
16], m3 + pmaddubsw m3, m1, [r5 + 20 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 20 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 587 * 16], m3 + + ; mode 11 [row 6] + pmaddubsw m3, m0, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 588 * 16], m3 + pmaddubsw m3, m1, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 589 * 16], m3 + + ; mode 11 [row 7] + pmaddubsw m3, m0, [r5 + 16 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 16 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 590 * 16], m3 + pmaddubsw m3, m1, [r5 + 16 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 16 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 591 * 16], m3 + + ; mode 11 [row 8] + pmaddubsw m3, m0, [r5 + 14 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 14 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 592 * 16], m3 + + ; mode 13 [row 1 - first half] + movu [r0 + 706 * 16], m3 + + pmaddubsw m3, m1, [r5 + 14 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 14 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 593 * 16], m3 + + ; mode 13 [row 1 - second half] + movu [r0 + 707 * 16], m3 + + ; mode 11 [row 9] + pmaddubsw m3, m0, [r5 + 12 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 12 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 594 * 16], m3 + + ; mode 12 [row 3 - first half] + movu [r0 + 646 * 16], m3 + + pmaddubsw m3, m1, [r5 + 12 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 12 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 595 * 16], m3 + + ; mode 12 [row 3 - second half] + movu [r0 + 647 * 16], m3 + + ; mode 11 [row 10] + pmaddubsw m3, m0, [r5 + 10 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 10 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 596 * 16], m3 + pmaddubsw m3, m1, [r5 + 10 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 10 * 16] + pmulhrsw m5, m7 + packuswb 
m3, m5 + movu [r0 + 597 * 16], m3 + + ; mode 11 [row 11] + pmaddubsw m3, m0, [r5 + 8 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 8 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 598 * 16], m3 + pmaddubsw m3, m1, [r5 + 8 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 8 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 599 * 16], m3 + + ; mode 11 [row 12] + pmaddubsw m3, m0, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 600 * 16], m3 + + ; mode 14 [row 1 - first half] + movu [r0 + 770 * 16], m3 + + pmaddubsw m3, m1, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 601 * 16], m3 + + ; mode 14 [row 1 - second half] + movu [r0 + 771 * 16], m3 + + ; mode 11 [row 13] + pmaddubsw m3, m0, [r5 + 4 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 4 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 602 * 16], m3 + pmaddubsw m3, m1, [r5 + 4 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 4 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 603 * 16], m3 + + ; mode 11 [row 14] + pmaddubsw m3, m0, [r5 + 2 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 2 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 604 * 16], m3 + + ; mode 12 [row 5 - first half] + movu [r0 + 650 * 16], m3 + + pmaddubsw m3, m1, [r5 + 2 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 2 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 605 * 16], m3 + + ; mode 12 [row 5 - second half] + movu [r0 + 651 * 16], m3 + + ; mode 12 [row 0] + pmaddubsw m3, m0, [r5 + 27 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 27 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 640 * 16], m3 + pmaddubsw m3, m1, [r5 + 27 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 27 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 641 * 16], m3 + + ; mode 12 [row 2] + pmaddubsw m3, m0, [r5 + 17 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 17
* 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 644 * 16], m3 + pmaddubsw m3, m1, [r5 + 17 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 17 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 645 * 16], m3 + + ; mode 12 [row 4] + pmaddubsw m3, m0, [r5 + 7 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 7 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 648 * 16], m3 + pmaddubsw m3, m1, [r5 + 7 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 7 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 649 * 16], m3 + + ; mode 13 [row 0] + pmaddubsw m3, m0, [r5 + 23 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 23 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 704 * 16], m3 + pmaddubsw m3, m1, [r5 + 23 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 23 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 705 * 16], m3 + + ; mode 13 [row 2] + pmaddubsw m3, m0, [r5 + 5 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 5 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 708 * 16], m3 + pmaddubsw m3, m1, [r5 + 5 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 5 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 709 * 16], m3 + + ; mode 14 [row 0] + pmaddubsw m3, m0, [r5 + 19 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 19 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 768 * 16], m3 + pmaddubsw m3, m1, [r5 + 19 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 19 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 769 * 16], m3 + + ; mode 15 [row 0] + pmaddubsw m3, m0, [r5 + 15 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 15 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 832 * 16], m3 + pmaddubsw m3, m1, [r5 + 15 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 15 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 833 * 16], m3 + + ; mode 11 [row 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r3 + 16], 0 + pmaddubsw m3, m0, [r5 + 30 * 16] + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb 
m2, [r4 + 8], 1 + pinsrb m2, [r4 + 7], 0 + pmaddubsw m5, m2, [r5 + 30 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 608 * 16], m3 + pslldq m1, 2 + pinsrb m1, [r4 + 16], 1 + pinsrb m1, [r4 + 15], 0 + pmaddubsw m3, m1, [r5 + 30 * 16] + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrb m4, [r4 + 24], 1 + pinsrb m4, [r4 + 23], 0 + pmaddubsw m5, m4, [r5 + 30 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 609 * 16], m3 + + ; mode 11 [row 17] + pmaddubsw m3, m0, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 610 * 16], m3 + pmaddubsw m3, m1, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 611 * 16], m3 + + ; mode 11 [row 18] + pmaddubsw m3, m0, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 612 * 16], m3 + pmaddubsw m3, m1, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 613 * 16], m3 + + ; mode 11 [row 19] + pmaddubsw m3, m0, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 614 * 16], m3 + pmaddubsw m3, m1, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 615 * 16], m3 + + ; mode 11 [row 20] + pmaddubsw m3, m0, [r5 + 22 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 22 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 616 * 16], m3 + pmaddubsw m3, m1, [r5 + 22 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 22 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 617 * 16], m3 + + ; mode 11 [row 21] + pmaddubsw m3, m0, [r5 + 20 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 20 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 618 * 16], m3 + pmaddubsw m3, m1, [r5 + 20 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 20 * 16] + pmulhrsw 
m5, m7 + packuswb m3, m5 + movu [r0 + 619 * 16], m3 + + ; mode 11 [row 22] + pmaddubsw m3, m0, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 620 * 16], m3 + pmaddubsw m3, m1, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 621 * 16], m3 + + ; mode 11 [row 23] + pmaddubsw m3, m0, [r5 + 16 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 16 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 622 * 16], m3 + pmaddubsw m3, m1, [r5 + 16 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 16 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 623 * 16], m3 + + ; mode 11 [row 24] + pmaddubsw m3, m0, [r5 + 14 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 14 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 624 * 16], m3 + pmaddubsw m3, m1, [r5 + 14 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 14 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 625 * 16], m3 + + ; mode 11 [row 25] + pmaddubsw m3, m0, [r5 + 12 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 12 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 626 * 16], m3 + pmaddubsw m3, m1, [r5 + 12 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 12 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 627 * 16], m3 + + ; mode 11 [row 26] + pmaddubsw m3, m0, [r5 + 10 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 10 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 628 * 16], m3 + pmaddubsw m3, m1, [r5 + 10 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 10 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 629 * 16], m3 + + ; mode 11 [row 27] + pmaddubsw m3, m0, [r5 + 8 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 8 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 630 * 16], m3 + pmaddubsw m3, m1, [r5 + 8 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 8 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 631 * 16], m3 + 
+ ; mode 11 [row 28] + pmaddubsw m3, m0, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 632 * 16], m3 + pmaddubsw m3, m1, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 633 * 16], m3 + + ; mode 11 [row 29] + pmaddubsw m3, m0, [r5 + 4 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 4 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 634 * 16], m3 + pmaddubsw m3, m1, [r5 + 4 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 4 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 635 * 16], m3 + + ; mode 11 [row 30] + pmaddubsw m3, m0, [r5 + 2 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 2 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 636 * 16], m3 + pmaddubsw m3, m1, [r5 + 2 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 2 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 637 * 16], m3 + + ; mode 12 [row 6] + pinsrb m0, [r3 + 6], 0 + pmaddubsw m3, m0, [r5 + 29 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 29 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 652 * 16], m3 + pmaddubsw m3, m1, [r5 + 29 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 29 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 653 * 16], m3 + + ; mode 12 [row 7] + pmaddubsw m3, m0, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 654 * 16], m3 + pmaddubsw m3, m1, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 655 * 16], m3 + + ; mode 12 [row 8] + pmaddubsw m3, m0, [r5 + 19 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 19 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 656 * 16], m3 + pmaddubsw m3, m1, [r5 + 19 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 19 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 657 * 16], m3 + + ; mode 12 [row 9] + pmaddubsw m3, m0, 
[r5 + 14 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 14 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 658 * 16], m3 + pmaddubsw m3, m1, [r5 + 14 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 14 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 659 * 16], m3 + + ; mode 12 [row 10] + pmaddubsw m3, m0, [r5 + 9 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 9 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 660 * 16], m3 + pmaddubsw m3, m1, [r5 + 9 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 9 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 661 * 16], m3 + + ; mode 12 [row 11] + pmaddubsw m3, m0, [r5 + 4 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 4 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 662 * 16], m3 + pmaddubsw m3, m1, [r5 + 4 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 4 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 663 * 16], m3 + + ; mode 13 [row 3] + movu m6, m0 + pinsrb m6, [r3 + 4], 0 + pmaddubsw m3, m6, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 710 * 16], m3 + pmaddubsw m3, m1, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 711 * 16], m3 + + ; mode 13 [row 4] + pmaddubsw m3, m6, [r5 + 19 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 19 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 712 * 16], m3 + pmaddubsw m3, m1, [r5 + 19 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 19 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 713 * 16], m3 + + ; mode 13 [row 5] + pmaddubsw m3, m6, [r5 + 10 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 10 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 714 * 16], m3 + pmaddubsw m3, m1, [r5 + 10 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 10 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 715 * 16], m3 + + ; mode 13 [row 6] + pmaddubsw m3, m6, [r5 + 1 * 16] + 
pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 1 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 716 * 16], m3 + pmaddubsw m3, m1, [r5 + 1 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 1 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 717 * 16], m3 + + ; mode 14 [row 2] + movu m6, m0 + pinsrb m6, [r4 + 0], 1 + pinsrb m6, [r3 + 2], 0 + pmaddubsw m3, m6, [r5 + 25 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 25 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 772 * 16], m3 + pmaddubsw m3, m1, [r5 + 25 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 25 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 773 * 16], m3 + + ; mode 14 [row 3] + pmaddubsw m3, m6, [r5 + 12 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 12 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 774 * 16], m3 + pmaddubsw m3, m1, [r5 + 12 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 12 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 775 * 16], m3 + + ; mode 15 [row 1] + pmaddubsw m3, m6, [r5 + 30 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 30 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 834 * 16], m3 + pmaddubsw m3, m1, [r5 + 30 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 30 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 835 * 16], m3 + + ; mode 15 [row 2] + pmaddubsw m3, m6, [r5 + 13 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 13 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 836 * 16], m3 + pmaddubsw m3, m1, [r5 + 13 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 13 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 837 * 16], m3 + + ; mode 15 [row 3] + pslldq m6, 2 + pinsrb m6, [r3 + 2], 1 + pinsrb m6, [r3 + 4], 0 + pmaddubsw m3, m6, [r5 + 28 * 16] + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 7], 1 + pinsrb m2, [r4 + 6], 0 + pmaddubsw m5, m2, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 838 * 16], m3 + pslldq m1, 2 + pinsrb m1, [r4 + 15], 1 + pinsrb m1, [r4 + 14], 0 + 
pmaddubsw m3, m1, [r5 + 28 * 16] + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrb m4, [r4 + 23], 1 + pinsrb m4, [r4 + 22], 0 + pmaddubsw m5, m4, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 839 * 16], m3 + + ; mode 15 [row 4] + pmaddubsw m3, m6, [r5 + 11 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 11 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 840 * 16], m3 + pmaddubsw m3, m1, [r5 + 11 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 11 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 841 * 16], m3 + + ; mode 15 [row 5, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 4], 1 + pinsrb m6, [r3 + 6], 0 + pmaddubsw m3, m6, [r5 + 26 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 842 * 16], m3 + + ; mode 15 [row 6, 0-7] + pmaddubsw m3, m6, [r5 + 9 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 844 * 16], m3 + + ; mode 15 [row 7, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 6], 1 + pinsrb m6, [r3 + 8], 0 + pmaddubsw m3, m6, [r5 + 24 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 846 * 16], m3 + + ; mode 15 [row 8, 0-7] + pmaddubsw m3, m6, [r5 + 7 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 848 * 16], m3 + + ; mode 15 [row 9, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 8], 1 + pinsrb m6, [r3 + 9], 0 + pmaddubsw m3, m6, [r5 + 22 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 850 * 16], m3 + + ; mode 15 [row 10, 0-7] + pmaddubsw m3, m6, [r5 + 5 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 852 * 16], m3 + + ; mode 15 [row 11, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 9], 1 + pinsrb m6, [r3 + 11], 0 + pmaddubsw m3, m6, [r5 + 20 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 854 * 16], m3 + + ; mode 15 [row 12, 0-7] + pmaddubsw m3, m6, [r5 + 3 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 856 * 16], m3 + + ; mode 15 [row 13, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 11], 1 + pinsrb m6, [r3 + 13], 0 + pmaddubsw m3, m6, [r5 + 18 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 858 * 16], m3 + + ; 
mode 15 [row 14, 0-7] + pmaddubsw m3, m6, [r5 + 1 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 860 * 16], m3 + + ; mode 15 [row 15, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 13], 1 + pinsrb m6, [r3 + 15], 0 + pmaddubsw m3, m6, [r5 + 16 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 862 * 16], m3 + + ; mode 15 [row 16, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 15], 1 + pinsrb m6, [r3 + 17], 0 + pmaddubsw m3, m6, [r5 + 31 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 864 * 16], m3 + + ; mode 15 [row 17, 0-7] + pmaddubsw m3, m6, [r5 + 14 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 866 * 16], m3 + + ; mode 15 [row 18, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 17], 1 + pinsrb m6, [r3 + 19], 0 + pmaddubsw m3, m6, [r5 + 29 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 868 * 16], m3 + + ; mode 15 [row 19, 0-7] + pmaddubsw m3, m6, [r5 + 12 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 870 * 16], m3 + + ; mode 15 [row 20, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 19], 1 + pinsrb m6, [r3 + 21], 0 + pmaddubsw m3, m6, [r5 + 27 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 872 * 16], m3 + + ; mode 15 [row 21, 0-7] + pmaddubsw m3, m6, [r5 + 10 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 874 * 16], m3 + + ; mode 15 [row 22, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 21], 1 + pinsrb m6, [r3 + 23], 0 + pmaddubsw m3, m6, [r5 + 25 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 876 * 16], m3 + + ; mode 15 [row 23, 0-7] + pmaddubsw m3, m6, [r5 + 8 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 878 * 16], m3 + + ; mode 15 [row 24, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 23], 1 + pinsrb m6, [r3 + 24], 0 + pmaddubsw m3, m6, [r5 + 23 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 880 * 16], m3 + + ; mode 15 [row 25, 0-7] + pmaddubsw m3, m6, [r5 + 6 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 882 * 16], m3 + + ; mode 15 [row 26, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 24], 1 + pinsrb m6, [r3 + 26], 0 
+ pmaddubsw m3, m6, [r5 + 21 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 884 * 16], m3 + + ; mode 15 [row 27, 0-7] + pmaddubsw m3, m6, [r5 + 4 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 886 * 16], m3 + + ; mode 15 [row 28, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 26], 1 + pinsrb m6, [r3 + 28], 0 + pmaddubsw m3, m6, [r5 + 19 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 888 * 16], m3 + + ; mode 15 [row 29, 0-7] + pmaddubsw m3, m6, [r5 + 2 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 890 * 16], m3 + + ; mode 15 [row 30, 0-7] + pslldq m6, 2 + pinsrb m6, [r3 + 28], 1 + pinsrb m6, [r3 + 30], 0 + pmaddubsw m3, m6, [r5 + 17 * 16] + pmulhrsw m3, m7 + packuswb m3, m3 + movh [r0 + 892 * 16], m3 + + ; mode 15 [row 31, 0-7] + pshufb m3, m6, [tab_S2] + movh [r0 + 894 * 16], m3 + + ; mode 12 [row 12] + pslldq m0, 2 + pinsrb m0, [r3 + 6], 1 + pinsrb m0, [r3 + 13], 0 + pmaddubsw m3, m0, [r5 + 31 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 31 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 664 * 16], m3 + pmaddubsw m3, m1, [r5 + 31 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 31 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 665 * 16], m3 + + ; mode 12 [row 13] + pmaddubsw m3, m0, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 666 * 16], m3 + pmaddubsw m3, m1, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 667 * 16], m3 + + ; mode 12 [row 14] + pmaddubsw m3, m0, [r5 + 21 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 21 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 668 * 16], m3 + pmaddubsw m3, m1, [r5 + 21 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 21 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 669 * 16], m3 + + ; mode 12 [row 15] + pmaddubsw m3, m0, [r5 + 16 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 16 * 16] + pmulhrsw m5, m7 + packuswb m3, 
m5 + movu [r0 + 670 * 16], m3 + pmaddubsw m3, m1, [r5 + 16 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 16 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 671 * 16], m3 + + ; mode 12 [row 16] + pmaddubsw m3, m0, [r5 + 11 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 11 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 672 * 16], m3 + pmaddubsw m3, m1, [r5 + 11 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 11 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 673 * 16], m3 + + ; mode 12 [row 17] + pmaddubsw m3, m0, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 674 * 16], m3 + pmaddubsw m3, m1, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 675 * 16], m3 + + ; mode 12 [row 18] + pmaddubsw m3, m0, [r5 + 1 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 1 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 676 * 16], m3 + pmaddubsw m3, m1, [r5 + 1 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 1 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 677 * 16], m3 + + ; mode 13 [row 7] + movu m6, m0 + pinsrb m6, [r3 + 4], 2 + pinsrb m6, [r3 + 4], 1 + pinsrb m6, [r3 + 7], 0 + pmaddubsw m3, m6, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 718 * 16], m3 + pmaddubsw m3, m1, [r5 + 24 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 719 * 16], m3 + + ; mode 13 [row 8] + pmaddubsw m3, m6, [r5 + 15 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 15 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 720 * 16], m3 + pmaddubsw m3, m1, [r5 + 15 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 15 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 721 * 16], m3 + + ; mode 13 [row 9] + pmaddubsw m3, m6, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 6 * 16] + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 722 * 16], m3 + pmaddubsw m3, m1, [r5 + 6 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 6 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 723 * 16], m3 + + ; mode 14 [row 4] + pinsrb m6, [r3 + 2], 2 + pinsrb m6, [r3 + 2], 1 + pinsrb m6, [r3 + 5], 0 + pmaddubsw m3, m6, [r5 + 31 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 31 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 776 * 16], m3 + pmaddubsw m3, m1, [r5 + 31 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 31 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 777 * 16], m3 + + ; mode 14 [row 5] + pmaddubsw m3, m6, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 778 * 16], m3 + pmaddubsw m3, m1, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 779 * 16], m3 + + ; mode 14 [row 6] + pmaddubsw m3, m6, [r5 + 5 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 5 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 780 * 16], m3 + pmaddubsw m3, m1, [r5 + 5 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 5 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 781 * 16], m3 + + ; mode 14 [row 7] + pslldq m6, 2 + pinsrb m6, [r3 + 5], 1 + pinsrb m6, [r3 + 7], 0 + pmaddubsw m3, m6, [r5 + 24 * 16] + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 5], 0 + pmaddubsw m5, m2, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 782 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 13], 0 + pmaddubsw m3, m1, [r5 + 24 * 16] + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 21], 0 + pmaddubsw m5, m4, [r5 + 24 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 783 * 16], m3 + + ; mode 14 [row 8] + pmaddubsw m3, m6, [r5 + 11 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 11 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 784 * 16], m3 + pmaddubsw m3, m1, [r5 + 11 * 16] + pmulhrsw m3, m7 + 
pmaddubsw m5, m4, [r5 + 11 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 785 * 16], m3 + + ; mode 15 [row 5, 8-31] + pmaddubsw m5, m2, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m5, m5 + movh [r0 + 842 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 26 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 26 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 843 * 16], m3 + + ; mode 15 [row 6, 8-31] + pmaddubsw m5, m2, [r5 + 9 * 16] + pmulhrsw m5, m7 + packuswb m5, m5 + movh [r0 + 844 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 9 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 9 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 845 * 16], m3 + + ; mode 12 [row 19] + pslldq m0, 2 + pinsrb m0, [r3 + 13], 1 + pinsrb m0, [r3 + 19], 0 + pmaddubsw m3, m0, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 678 * 16], m3 + pmaddubsw m3, m1, [r5 + 28 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 28 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 679 * 16], m3 + + ; mode 12 [row 20] + pmaddubsw m3, m0, [r5 + 23 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 23 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 680 * 16], m3 + pmaddubsw m3, m1, [r5 + 23 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 23 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 681 * 16], m3 + + ; mode 12 [row 21] + pmaddubsw m3, m0, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 682 * 16], m3 + pmaddubsw m3, m1, [r5 + 18 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 18 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 683 * 16], m3 + + ; mode 12 [row 22] + pmaddubsw m3, m0, [r5 + 13 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 13 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 684 * 16], m3 + pmaddubsw m3, m1, [r5 + 13 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 13 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + 
movu [r0 + 685 * 16], m3 + + ; mode 12 [row 23] + pmaddubsw m3, m0, [r5 + 8 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 8 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 686 * 16], m3 + pmaddubsw m3, m1, [r5 + 8 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 8 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 687 * 16], m3 + + ; mode 12 [row 24] + pmaddubsw m3, m0, [r5 + 3 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m2, [r5 + 3 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 688 * 16], m3 + pmaddubsw m3, m1, [r5 + 3 * 16] + pmulhrsw m3, m7 + pmaddubsw m5, m4, [r5 + 3 * 16] + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 689 * 16], m3 + + ; mode 13 [row 10] + movu m7, m6 + movu m6, m0 + pinsrb m6, [r3 + 4], 4 + pinsrb m6, [r3 + 4], 3 + pinsrb m6, [r3 + 7], 2 + pinsrb m6, [r3 + 7], 1 + pinsrb m6, [r3 + 11], 0 + pmaddubsw m3, m6, [r5 + 29 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 29 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 724 * 16], m3 + pmaddubsw m3, m1, [r5 + 29 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 29 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 725 * 16], m3 + + ; mode 13 [row 11] + pmaddubsw m3, m6, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 726 * 16], m3 + pmaddubsw m3, m1, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 727 * 16], m3 + + ; mode 13 [row 12] + pmaddubsw m3, m6, [r5 + 11 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 11 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 728 * 16], m3 + pmaddubsw m3, m1, [r5 + 11 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 11 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 729 * 16], m3 + + ; mode 13 [row 13] + pmaddubsw m3, m6, [r5 + 2 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 2 * 16] + 
pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 730 * 16], m3 + pmaddubsw m3, m1, [r5 + 2 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 2 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 731 * 16], m3 + + ; mode 14 [row 9] + pslldq m7, 2 + pinsrb m7, [r3 + 7], 1 + pinsrb m7, [r3 + 10], 0 + pmaddubsw m3, m7, [r5 + 30 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrw m2, [r4 + 4], 0 + pmaddubsw m5, m2, [r5 + 30 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 786 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 12], 0 + pmaddubsw m3, m1, [r5 + 30 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrb m4, [r4 + 21], 1 + pinsrb m4, [r4 + 20], 0 + pmaddubsw m5, m4, [r5 + 30 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 787 * 16], m3 + + ; mode 14 [row 10] + pmaddubsw m3, m7, [r5 + 17 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 17 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 788 * 16], m3 + pmaddubsw m3, m1, [r5 + 17 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 17 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 789 * 16], m3 + + ; mode 14 [row 11] + pmaddubsw m3, m7, [r5 + 4 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 4 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 790 * 16], m3 + pmaddubsw m3, m1, [r5 + 4 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 4 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 791 * 16], m3 + + movu m6, [pw_1024] + + ; mode 15 [row 7, 8-31] + pmaddubsw m5, m2, [r5 + 24 * 16] + pmulhrsw m5, m6 + packuswb m5, m5 + movh [r0 + 846 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 24 * 16] + pmulhrsw m3, m6 + pmaddubsw m5, m4, [r5 + 24 * 16] + pmulhrsw m5, m6 + packuswb m3, m5 + movu [r0 + 847 * 16], m3 + + ; mode 15 [row 8, 8-31] + pmaddubsw m5, m2, [r5 + 7 * 16] + pmulhrsw m5, m6 + packuswb m5, m5 + movh [r0 + 848 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 7 * 16] + pmulhrsw m3, m6 + pmaddubsw m5, 
m4, [r5 + 7 * 16] + pmulhrsw m5, m6 + packuswb m3, m5 + movu [r0 + 849 * 16], m3 + + ; mode 12 [row 25] + pslldq m0, 2 + pinsrb m0, [r3 + 19], 1 + pinsrb m0, [r3 + 26], 0 + pmaddubsw m3, m0, [r5 + 30 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 30 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 690 * 16], m3 + pmaddubsw m3, m1, [r5 + 30 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 30 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 691 * 16], m3 + + ; mode 12 [row 26] + pmaddubsw m3, m0, [r5 + 25 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 25 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 692 * 16], m3 + pmaddubsw m3, m1, [r5 + 25 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 25 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 693 * 16], m3 + + ; mode 12 [row 27] + pmaddubsw m3, m0, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 694 * 16], m3 + pmaddubsw m3, m1, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 695 * 16], m3 + + ; mode 12 [row 28] + pmaddubsw m3, m0, [r5 + 15 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 15 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 696 * 16], m3 + pmaddubsw m3, m1, [r5 + 15 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 15 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 697 * 16], m3 + + ; mode 12 [row 29] + pmaddubsw m3, m0, [r5 + 10 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 10 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 698 * 16], m3 + pmaddubsw m3, m1, [r5 + 10 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 10 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 699 * 16], m3 + + ; mode 12 [row 30] + pmaddubsw m3, m0, [r5 + 5 * 16] + pmulhrsw m3, [pw_1024] + 
pmaddubsw m5, m2, [r5 + 5 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 700 * 16], m3 + pmaddubsw m3, m1, [r5 + 5 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 5 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 701 * 16], m3 + + ; mode 13 [row 14] + movu m6, m0 + pinsrb m6, [r3 + 4], 6 + pinsrb m6, [r3 + 4], 5 + pinsrb m6, [r3 + 7], 4 + pinsrb m6, [r3 + 7], 3 + pinsrb m6, [r3 + 11], 2 + pinsrb m6, [r3 + 11], 1 + pinsrb m6, [r3 + 14], 0 + pmaddubsw m3, m6, [r5 + 25 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 25 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 732 * 16], m3 + pmaddubsw m3, m1, [r5 + 25 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 25 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 733 * 16], m3 + + ; mode 13 [row 15] + pmaddubsw m3, m6, [r5 + 16 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 16 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 734 * 16], m3 + pmaddubsw m3, m1, [r5 + 16 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 16 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 735 * 16], m3 + + ; mode 13 [row 16] + pmaddubsw m3, m6, [r5 + 7 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 7 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 736 * 16], m3 + pmaddubsw m3, m1, [r5 + 7 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 7 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 737 * 16], m3 + + ; mode 13 [row 17] + pslldq m6, 2 + pinsrb m6, [r3 + 14], 1 + pinsrb m6, [r3 + 18], 0 + pmaddubsw m3, m6, [r5 + 30 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrw m2, [r4 + 3], 0 + pmaddubsw m5, m2, [r5 + 30 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 738 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 11], 0 + pmaddubsw m3, m1, [r5 + 30 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 19], 0 + pmaddubsw m5, m4, [r5 + 30 * 16] + pmulhrsw m5, 
[pw_1024] + packuswb m3, m5 + movu [r0 + 739 * 16], m3 + + ; mode 13 [row 18] + pmaddubsw m3, m6, [r5 + 21 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 21 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 740 * 16], m3 + pmaddubsw m3, m1, [r5 + 21 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 21 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 741 * 16], m3 + + ; mode 13 [row 19] + pmaddubsw m3, m6, [r5 + 12 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 12 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 742 * 16], m3 + pmaddubsw m3, m1, [r5 + 12 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 12 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 743 * 16], m3 + + ; mode 13 [row 20] + pmaddubsw m3, m6, [r5 + 3 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 3 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 744 * 16], m3 + pmaddubsw m3, m1, [r5 + 3 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 3 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 745 * 16], m3 + + ; mode 14 [row 12] + pslldq m7, 2 + pinsrb m7, [r3 + 10], 1 + pinsrb m7, [r3 + 12], 0 + pmaddubsw m3, m7, [r5 + 23 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 23 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 792 * 16], m3 + pmaddubsw m3, m1, [r5 + 23 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 23 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 793 * 16], m3 + + ; mode 14 [row 13] + pmaddubsw m3, m7, [r5 + 10 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 10 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 794 * 16], m3 + pmaddubsw m3, m1, [r5 + 10 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 10 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 795 * 16], m3 + + ; mode 15 [row 9] + pmaddubsw m5, m2, [r5 + 22 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movu [r0 + 850 
* 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 22 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 22 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 851 * 16], m3 + + ; mode 15 [row 10] + pmaddubsw m5, m2, [r5 + 5 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movu [r0 + 852 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 5 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 5 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 853 * 16], m3 + + ; mode 13 [row 21] + pslldq m6, 2 + pinsrb m6, [r3 + 18], 1 + pinsrb m6, [r3 + 21], 0 + pmaddubsw m3, m6, [r5 + 26 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrw m2, [r4 + 2], 0 + pmaddubsw m5, m2, [r5 + 26 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 746 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 10], 0 + pmaddubsw m3, m1, [r5 + 26 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 18], 0 + pmaddubsw m5, m4, [r5 + 26 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 747 * 16], m3 + + ; mode 13 [row 22] + pmaddubsw m3, m6, [r5 + 17 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 17 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 748 * 16], m3 + pmaddubsw m3, m1, [r5 + 17 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 17 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 749 * 16], m3 + + ; mode 13 [row 23] + pmaddubsw m3, m6, [r5 + 8 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 8 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 750 * 16], m3 + pmaddubsw m3, m1, [r5 + 8 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 8 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 751 * 16], m3 + + ; mode 14 [row 14] + pslldq m7, 2 + pinsrb m7, [r3 + 12], 1 + pinsrb m7, [r3 + 15], 0 + pmaddubsw m3, m7, [r5 + 29 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 29 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 796 * 16], m3 + pmaddubsw m3, m1, [r5 + 
29 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 29 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 797 * 16], m3 + + ; mode 14 [row 15] + pmaddubsw m3, m7, [r5 + 16 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 16 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 798 * 16], m3 + pmaddubsw m3, m1, [r5 + 16 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 16 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 799 * 16], m3 + + ; mode 14 [row 16] + pmaddubsw m3, m7, [r5 + 3 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 3 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 800 * 16], m3 + pmaddubsw m3, m1, [r5 + 3 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 3 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 801 * 16], m3 + + ; mode 15 [row 11] + pmaddubsw m5, m2, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 854 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 855 * 16], m3 + + ; mode 15 [row 12] + pmaddubsw m5, m2, [r5 + 3 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 856 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 3 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 3 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 857 * 16], m3 + + ; mode 13 [row 24] + pslldq m6, 2 + pinsrb m6, [r3 + 21], 1 + pinsrb m6, [r3 + 25], 0 + pmaddubsw m3, m6, [r5 + 31 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrw m2, [r4 + 1], 0 + pmaddubsw m5, m2, [r5 + 31 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 752 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 9], 0 + pmaddubsw m3, m1, [r5 + 31 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 17], 0 + pmaddubsw m5, m4, [r5 + 31 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 753 * 16], m3 + + ; mode 13 [row 25] + 
pmaddubsw m3, m6, [r5 + 22 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 22 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 754 * 16], m3 + pmaddubsw m3, m1, [r5 + 22 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 22 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 755 * 16], m3 + + ; mode 13 [row 26] + pmaddubsw m3, m6, [r5 + 13 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 13 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 756 * 16], m3 + pmaddubsw m3, m1, [r5 + 13 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 13 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 757 * 16], m3 + + ; mode 13 [row 27] + pmaddubsw m3, m6, [r5 + 4 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 4 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 758 * 16], m3 + pmaddubsw m3, m1, [r5 + 4 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 4 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 759 * 16], m3 + + ; mode 14 [row 17] + pslldq m7, 2 + pinsrb m7, [r3 + 15], 1 + pinsrb m7, [r3 + 17], 0 + pmaddubsw m3, m7, [r5 + 22 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 22 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 802 * 16], m3 + pmaddubsw m3, m1, [r5 + 22 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 22 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 803 * 16], m3 + + ; mode 14 [row 18] + pmaddubsw m3, m7, [r5 + 9 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 9 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 804 * 16], m3 + pmaddubsw m3, m1, [r5 + 9 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 9 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 805 * 16], m3 + + ; mode 15 [row 13] + pmaddubsw m5, m2, [r5 + 18 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 858 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 18 * 16] + pmulhrsw m3, [pw_1024] + 
pmaddubsw m5, m4, [r5 + 18 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 859 * 16], m3 + + ; mode 15 [row 14] + pmaddubsw m5, m2, [r5 + 1 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 860 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 1 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 1 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 861 * 16], m3 + + ; mode 13 [row 28] + pslldq m6, 2 + pinsrb m6, [r3 + 25], 1 + pinsrb m6, [r3 + 28], 0 + pmaddubsw m3, m6, [r5 + 27 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrw m2, [r4 + 0], 0 + pmaddubsw m5, m2, [r5 + 27 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 760 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 8], 0 + pmaddubsw m3, m1, [r5 + 27 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 16], 0 + pmaddubsw m5, m4, [r5 + 27 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 761 * 16], m3 + + ; mode 13 [row 29] + pmaddubsw m3, m6, [r5 + 18 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 18 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 762 * 16], m3 + pmaddubsw m3, m1, [r5 + 18 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 18 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 763 * 16], m3 + + ; mode 13 [row 30] + pmaddubsw m3, m6, [r5 + 9 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 9 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 764 * 16], m3 + pmaddubsw m3, m1, [r5 + 9 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 9 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 765 * 16], m3 + + ; mode 14 [row 19] + pslldq m7, 2 + pinsrb m7, [r3 + 17], 1 + pinsrb m7, [r3 + 20], 0 + pmaddubsw m3, m7, [r5 + 28 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 28 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 806 * 16], m3 + pmaddubsw m3, m1, [r5 + 28 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 28 * 16] + 
pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 807 * 16], m3 + + ; mode 14 [row 20] + pmaddubsw m3, m7, [r5 + 15 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 15 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 808 * 16], m3 + pmaddubsw m3, m1, [r5 + 15 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 15 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 809 * 16], m3 + + ; mode 14 [row 21] + pmaddubsw m3, m7, [r5 + 2 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 2 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 810 * 16], m3 + pmaddubsw m3, m1, [r5 + 2 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 2 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 811 * 16], m3 + + ; mode 15 [row 15] + pmaddubsw m5, m2, [r5 + 16 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 862 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 16 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 16 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 863 * 16], m3 + + ; mode 14 [row 22] + pslldq m7, 2 + pinsrb m7, [r3 + 20], 1 + pinsrb m7, [r3 + 22], 0 + pmaddubsw m3, m7, [r5 + 21 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrb m2, [r4 + 0], 1 + pinsrb m2, [r3 + 2], 0 + pmaddubsw m5, m2, [r5 + 21 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 812 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 7], 0 + pmaddubsw m3, m1, [r5 + 21 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 15], 0 + pmaddubsw m5, m4, [r5 + 21 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 813 * 16], m3 + + ; mode 14 [row 23] + pmaddubsw m3, m7, [r5 + 8 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 8 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 814 * 16], m3 + pmaddubsw m3, m1, [r5 + 8 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 8 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 815 * 16], m3 + + ; mode 15 
[row 16] + pmaddubsw m5, m2, [r5 + 31 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 864 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 31 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 31 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 865 * 16], m3 + + ; mode 15 [row 17] + pmaddubsw m5, m2, [r5 + 14 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 866 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 14 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 14 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 867 * 16], m3 + + ; mode 14 [row 24] + pslldq m7, 2 + pinsrb m7, [r3 + 22], 1 + pinsrb m7, [r3 + 25], 0 + pmaddubsw m3, m7, [r5 + 27 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrb m2, [r3 + 2], 1 + pinsrb m2, [r3 + 5], 0 + pmaddubsw m5, m2, [r5 + 27 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 816 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 6], 0 + pmaddubsw m3, m1, [r5 + 27 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 14], 0 + pmaddubsw m5, m4, [r5 + 27 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 817 * 16], m3 + + ; mode 14 [row 25] + pmaddubsw m3, m7, [r5 + 14 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 14 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 818 * 16], m3 + pmaddubsw m3, m1, [r5 + 14 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 14 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 819 * 16], m3 + + ; mode 14 [row 26] + pmaddubsw m3, m7, [r5 + 1 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 1 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 820 * 16], m3 + pmaddubsw m3, m1, [r5 + 1 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 1 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 821 * 16], m3 + + ; mode 15 [row 18] + pinsrb m2, [r3 + 4], 0 + pmaddubsw m5, m2, [r5 + 29 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 868 * 16 + 
8], m5 + pmaddubsw m3, m1, [r5 + 29 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 29 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 869 * 16], m3 + + ; mode 15 [row 19] + pmaddubsw m5, m2, [r5 + 12 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 870 * 16 + 8], m5 + pmaddubsw m3, m1, [r5 + 12 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 12 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 871 * 16], m3 + + ; mode 15 [row 20 - 8 to 15] + pslldq m3, m2, 2 + pinsrb m3, [r3 + 4], 1 + pinsrb m3, [r3 + 6], 0 + pmaddubsw m5, m3, [r5 + 27 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 872 * 16 + 8], m5 + + ; mode 15 [row 21 - 8 to 15] + pmaddubsw m5, m3, [r5 + 10 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 874 * 16 + 8], m5 + + ; mode 15 [row 22 - 8 to 15] + pslldq m3, 2 + pinsrb m3, [r3 + 6], 1 + pinsrb m3, [r3 + 8], 0 + pmaddubsw m5, m3, [r5 + 25 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 876 * 16 + 8], m5 + + ; mode 15 [row 23 - 8 to 15] + pmaddubsw m5, m3, [r5 + 8 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 878 * 16 + 8], m5 + + ; mode 15 [row 24 - 8 to 15] + pslldq m3, 2 + pinsrb m3, [r3 + 8], 1 + pinsrb m3, [r3 + 9], 0 + pmaddubsw m5, m3, [r5 + 23 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 880 * 16 + 8], m5 + + ; mode 15 [row 25 - 8 to 15] + pmaddubsw m5, m3, [r5 + 6 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 882 * 16 + 8], m5 + + ; mode 15 [row 26 - 8 to 15] + pslldq m3, 2 + pinsrb m3, [r3 + 9], 1 + pinsrb m3, [r3 + 11], 0 + pmaddubsw m5, m3, [r5 + 21 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 884 * 16 + 8], m5 + + ; mode 15 [row 27 - 8 to 15] + pmaddubsw m5, m3, [r5 + 4 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 886 * 16 + 8], m5 + + ; mode 15 [row 28 - 8 to 15] + pslldq m3, 2 + pinsrb m3, [r3 + 11], 1 + pinsrb m3, [r3 + 13], 0 + pmaddubsw m5, m3, [r5 + 19 * 
16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 888 * 16 + 8], m5 + + ; mode 15 [row 29 - 8 to 15] + pmaddubsw m5, m3, [r5 + 2 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 890 * 16 + 8], m5 + + ; mode 15 [row 30 - 8 to 15] + pslldq m3, 2 + pinsrb m3, [r3 + 13], 1 + pinsrb m3, [r3 + 15], 0 + pmaddubsw m5, m3, [r5 + 17 * 16] + pmulhrsw m5, [pw_1024] + packuswb m5, m5 + movh [r0 + 892 * 16 + 8], m5 + + ; mode 15 [row 31, 8 to 15] + pshufb m5, m3, [tab_S2] + movh [r0 + 894 * 16 + 8], m5 + + ; mode 14 [row 27] + pinsrb m2, [r3 + 5], 0 + pslldq m7, 2 + pinsrb m7, [r3 + 25], 1 + pinsrb m7, [r3 + 27], 0 + pmaddubsw m3, m7, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pslldq m2, 2 + pinsrb m2, [r3 + 5], 1 + pinsrb m2, [r3 + 7], 0 + pmaddubsw m5, m2, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 822 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 5], 0 + pmaddubsw m3, m1, [r5 + 20 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 13], 0 + pmaddubsw m5, m4, [r5 + 20 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 823 * 16], m3 + + ; mode 15 [row 20 - 16 to 31] + pmaddubsw m3, m1, [r5 + 27 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 27 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 873 * 16], m3 + + ; mode 15 [row 21 - 16 to 31] + pmaddubsw m3, m1, [r5 + 10 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 10 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 875 * 16], m3 + + ; mode 14 [row 28] + pmaddubsw m3, m7, [r5 + 7 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 7 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 824 * 16], m3 + pmaddubsw m3, m1, [r5 + 7 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 7 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 825 * 16], m3 + + ; mode 14 [row 29] + pslldq m7, 2 + pinsrb m7, [r3 + 27], 1 + pinsrb m7, [r3 + 30], 0 + pmaddubsw m3, m7, [r5 + 26 * 16] + pmulhrsw m3, 
[pw_1024] + pslldq m2, 2 + pinsrb m2, [r3 + 7], 1 + pinsrb m2, [r3 + 10], 0 + pmaddubsw m5, m2, [r5 + 26 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 826 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 4], 0 + pmaddubsw m3, m1, [r5 + 26 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 12], 0 + pmaddubsw m5, m4, [r5 + 26 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 827 * 16], m3 + + ; mode 14 [row 30] + pmaddubsw m3, m7, [r5 + 13 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m2, [r5 + 13 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 828 * 16], m3 + pmaddubsw m3, m1, [r5 + 13 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 13 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 829 * 16], m3 + + ; mode 15 [row 22] + pmaddubsw m3, m1, [r5 + 25 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 25 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 877 * 16], m3 + + ; mode 15 [row 23] + pmaddubsw m3, m1, [r5 + 8 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 8 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 879 * 16], m3 + + ; mode 14 [row 31] + pshufb m3, m7, [tab_S2] + movh [r0 + 830 * 16], m3 + pshufb m3, m2, [tab_S2] + movh [r0 + 830 * 16 + 8], m3 + pshufb m3, m1, [tab_S2] + movh [r0 + 831 * 16], m3 + pshufb m3, m4, [tab_S2] + movh [r0 + 831 * 16 + 8], m3 + + ; mode 13 [row 31] + pshufb m0, m6, [tab_S2] + movh [r0 + 766 * 16], m0 + movh m0, [r4] + movh [r0 + 766 * 16 + 8], m0 + movu m0, [r4 + 8] + movu [r0 + 767 * 16], m0 + + ; mode 15 [row 24] + pslldq m1, 2 + pinsrw m1, [r4 + 3], 0 + pmaddubsw m3, m1, [r5 + 23 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 11], 0 + pmaddubsw m5, m4, [r5 + 23 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 881 * 16], m3 + + ; mode 15 [row 25] + pmaddubsw m3, m1, [r5 + 6 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 6 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + 
movu [r0 + 883 * 16], m3 + + ; mode 15 [row 26] + pslldq m1, 2 + pinsrw m1, [r4 + 2], 0 + pmaddubsw m3, m1, [r5 + 21 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 10], 0 + pmaddubsw m5, m4, [r5 + 21 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 885 * 16], m3 + + ; mode 15 [row 27] + pmaddubsw m3, m1, [r5 + 4 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 4 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 887 * 16], m3 + + ; mode 15 [row 28] + pslldq m1, 2 + pinsrw m1, [r4 + 1], 0 + pmaddubsw m3, m1, [r5 + 19 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 9], 0 + pmaddubsw m5, m4, [r5 + 19 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 889 * 16], m3 + + ; mode 15 [row 29] + pmaddubsw m3, m1, [r5 + 2 * 16] + pmulhrsw m3, [pw_1024] + pmaddubsw m5, m4, [r5 + 2 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 891 * 16], m3 + + ; mode 15 [row 30] + pslldq m1, 2 + pinsrw m1, [r4 + 0], 0 + pmaddubsw m3, m1, [r5 + 17 * 16] + pmulhrsw m3, [pw_1024] + pslldq m4, 2 + pinsrw m4, [r4 + 8], 0 + pmaddubsw m5, m4, [r5 + 17 * 16] + pmulhrsw m5, [pw_1024] + packuswb m3, m5 + movu [r0 + 893 * 16], m3 + + ; mode 15 [row 31] + pshufb m5, m1, [tab_S2] + movh [r0 + 895 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 895 * 16 + 8], m5 + + ; mode 16 [row 0] + movu m6, [r5 + 11 * 16] + movu m7, [pw_1024] + movh m0, [r4 ] + movh m1, [r4 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movh m2, [r4 + 8] + movh m3, [r4 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 896 * 16], m1 + + movh m1, [r4 + 16] + movh m3, [r4 + 17] + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movh m4, [r4 + 24] + movh m5, [r4 + 25] + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 897 * 16], m3 + + ; mode16 [row 1] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r4], 1 + pinsrb m0, [r3 + 
2], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 898 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 15], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 23], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 899 * 16], m3 + + ; mode16 [row 2] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 900 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 901 * 16], m3 + + ; mode16 [row 3] + movu m6, [r5 + 12 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 2], 1 + pinsrb m0, [r3 + 3], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 902 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 14], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 22], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 903 * 16], m3 + + ; mode16 [row 4] + movu m6, [r5 + 23 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 3], 1 + pinsrb m0, [r3 + 5], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 904 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 13], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 21], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 905 * 16], m3 + + ; mode16 [row 5] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 906 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 907 * 16], m3 + + ; mode16 [row 6] + movu m6, 
[r5 + 13 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 5], 1 + pinsrb m0, [r3 + 6], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 5], 1 + pinsrb m2, [r4 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 908 * 16], m3 + pslldq m1, 2 + pinsrw m1, [r4 + 12], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 20], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 909 * 16], m3 + + ; mode16 [row 7] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 6], 1 + pinsrb m0, [r3 + 8], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 910 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 11], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 19], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 911 * 16], m3 + + ; mode16 [row 8] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 912 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 913 * 16], m3 + + ; mode16 [row 9] + movu m6, [r5 + 14 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 8], 1 + pinsrb m0, [r3 + 9], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 914 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 10], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 18], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 915 * 16], m3 + + ; mode16 [row 10] + movu m6, [r5 + 25 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 9], 1 + pinsrb m0, [r3 + 11], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 
+ packuswb m3, m5 + movu [r0 + 916 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 9], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrb m4, [r4 + 18], 1 + pinsrb m4, [r4 + 17], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 917 * 16], m3 + + ; mode16 [row 11] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 918 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 919 * 16], m3 + + ; mode16 [row 12] + movu m6, [r5 + 15 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 11], 1 + pinsrb m0, [r3 + 12], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 920 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 8], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 16], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 921 * 16], m3 + + ; mode16 [row 13] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 12], 1 + pinsrb m0, [r3 + 14], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 0], 1 + pinsrb m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 922 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 7], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 15], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 923 * 16], m3 + + ; mode16 [row 14] + movu m6, [r5 + 5 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 924 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 925 * 16], m3 + + ; mode16 [row 15] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 14], 1 + pinsrb m0, 
[r3 + 15], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 2], 1 + pinsrb m2, [r3 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 926 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 6], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 14], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 927 * 16], m3 + + ; mode16 [row 16] + movu m6, [r5 + 27 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 15], 1 + pinsrb m0, [r3 + 17], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 3], 1 + pinsrb m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 928 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 5], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 13], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 929 * 16], m3 + + ; mode16 [row 17] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 930 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 931 * 16], m3 + + ; mode16 [row 18] + movu m6, [r5 + 17 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 17], 1 + pinsrb m0, [r3 + 18], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 5], 1 + pinsrb m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 932 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 4], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 12], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 933 * 16], m3 + + ; mode16 [row 19] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 18], 1 + pinsrb m0, [r3 + 20], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 6], 1 + pinsrb m2, [r3 + 8], 0 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 934 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 3], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 11], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 935 * 16], m3 + + ; mode16 [row 20] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 936 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 937 * 16], m3 + + ; mode16 [row 21] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 20], 1 + pinsrb m0, [r3 + 21], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 8], 1 + pinsrb m2, [r3 + 9], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 938 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 2], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 10], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 939 * 16], m3 + + ; mode16 [row 22] + movu m6, [r5 + 29 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 21], 1 + pinsrb m0, [r3 + 23], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 9], 1 + pinsrb m2, [r3 + 11], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 940 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 1], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 9], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 941 * 16], m3 + + ; mode16 [row 23] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 942 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 943 * 16], m3 + + ; mode16 [row 24] + movu m6, [r5 + 19 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 23], 1 
+ pinsrb m0, [r3 + 24], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 11], 1 + pinsrb m2, [r3 + 12], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 944 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 0], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 8], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 945 * 16], m3 + + ; mode16 [row 25] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 24], 1 + pinsrb m0, [r3 + 26], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 12], 1 + pinsrb m2, [r3 + 14], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 946 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r4 + 0], 1 + pinsrb m1, [r3 + 2], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 7], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 947 * 16], m3 + + ; mode16 [row 26] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 948 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 949 * 16], m3 + + ; mode16 [row 27] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 26], 1 + pinsrb m0, [r3 + 27], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 14], 1 + pinsrb m2, [r3 + 15], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 950 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 2], 1 + pinsrb m1, [r3 + 3], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 6], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 951 * 16], m3 + + ; mode16 [row 28] + movu m6, [r5 + 31 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 27], 1 + pinsrb m0, [r3 + 29], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb 
m2, [r3 + 15], 1 + pinsrb m2, [r3 + 17], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 952 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 3], 1 + pinsrb m1, [r3 + 5], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 5], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 953 * 16], m3 + + ; mode16 [row 29] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 954 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 955 * 16], m3 + + ; mode16 [row 30] + movu m6, [r5 + 21 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 29], 1 + pinsrb m0, [r3 + 30], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 17], 1 + pinsrb m2, [r3 + 18], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 956 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 5], 1 + pinsrb m1, [r3 + 6], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 4], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 957 * 16], m3 + + ; mode16 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 958 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 958 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 959 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 959 * 16 + 8], m5 + + ; mode 17 [row 0] + movu m6, [r5 + 6 * 16] + movu m7, [pw_1024] + movh m0, [r4 ] + movh m1, [r4 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movh m2, [r4 + 8] + movh m3, [r4 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 960 * 16], m1 + + movh m1, [r4 + 16] + movh m3, [r4 + 17] + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movh m4, [r4 + 24] + movh m5, [r4 + 25] + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 
961 * 16], m3 + + ; mode17 [row 1] + movu m6, [r5 + 12 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 0], 1 + pinsrb m0, [r3 + 1], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 962 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 15], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 23], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 963 * 16], m3 + + ; mode17 [row 2] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 1], 1 + pinsrb m0, [r3 + 2], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 964 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 14], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 22], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 965 * 16], m3 + + ; mode17 [row 3] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 2], 1 + pinsrb m0, [r3 + 4], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 966 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 13], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 21], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 967 * 16], m3 + + ; mode17 [row 4] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 4], 1 + pinsrb m0, [r3 + 5], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 968 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 12], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 20], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 969 * 16], m3 + + ; mode17 [row 5] + 
movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 970 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 971 * 16], m3 + + ; mode17 [row 6] + movu m6, [r5 + 10 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 5], 1 + pinsrb m0, [r3 + 6], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 972 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 11], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 19], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 973 * 16], m3 + + ; mode17 [row 7] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 6], 1 + pinsrb m0, [r3 + 7], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 974 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 10], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 18], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 975 * 16], m3 + + ; mode17 [row 8] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 7], 1 + pinsrb m0, [r3 + 9], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 976 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 9], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 17], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 977 * 16], m3 + + ; mode17 [row 9] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 9], 1 + pinsrb m0, [r3 + 10], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrw m2, [r4 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, 
m5 + movu [r0 + 978 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 8], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 16], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 979 * 16], m3 + + ; mode17 [row 10] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 980 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 981 * 16], m3 + + ; mode17 [row 11] + movu m6, [r5 + 8 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 10], 1 + pinsrb m0, [r3 + 11], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 0], 1 + pinsrb m2, [r3 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 982 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 7], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 15], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 983 * 16], m3 + + ; mode17 [row 12] + movu m6, [r5 + 14 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 11], 1 + pinsrb m0, [r3 + 12], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 1], 1 + pinsrb m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 984 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 6], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 14], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 985 * 16], m3 + + ; mode17 [row 13] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 12], 1 + pinsrb m0, [r3 + 14], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 2], 1 + pinsrb m2, [r3 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 986 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 5], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 
13], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 987 * 16], m3 + + ; mode17 [row 14] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 14], 1 + pinsrb m0, [r3 + 15], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 4], 1 + pinsrb m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 988 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 4], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 12], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 989 * 16], m3 + + ; mode17 [row 15] + pshufb m5, m0, [tab_S2] + movh [r0 + 990 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 990 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 991 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 991 * 16 + 8], m5 + + ; mode17 [row 16] + movu m6, [r5 + 6 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 15], 1 + pinsrb m0, [r3 + 16], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 5], 1 + pinsrb m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 992 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 3], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 11], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 993 * 16], m3 + + ; mode17 [row 17] + movu m6, [r5 + 12 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 16], 1 + pinsrb m0, [r3 + 17], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 6], 1 + pinsrb m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 994 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 2], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 10], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 995 * 16], m3 + + ; mode17 [row 18] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 17], 1 + pinsrb m0, [r3 + 18], 0 
+ pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 7], 1 + pinsrb m2, [r3 + 9], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 996 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 1], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 9], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 997 * 16], m3 + + ; mode17 [row 19] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 18], 1 + pinsrb m0, [r3 + 20], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 9], 1 + pinsrb m2, [r3 + 10], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 998 * 16], m3 + + pslldq m1, 2 + pinsrw m1, [r4 + 0], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 8], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 999 * 16], m3 + + ; mode17 [row 20] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 20], 1 + pinsrb m0, [r3 + 21], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 10], 1 + pinsrb m2, [r3 + 11], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1000 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r4 + 0], 1 + pinsrb m1, [r3 + 1], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + ;pinsrb m4, [r4 + 8], 1 + ;pinsrb m4, [r4 + 7], 0 + pinsrw m4, [r4 + 7], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1001 * 16], m3 + + ; mode17 [row 21] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1002 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1003 * 16], m3 + + ; mode17 [row 22] + movu m6, [r5 + 10 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 21], 1 + pinsrb m0, [r3 + 22], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb 
m2, [r3 + 11], 1 + pinsrb m2, [r3 + 12], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1004 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 1], 1 + pinsrb m1, [r3 + 2], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 6], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1005 * 16], m3 + + ; mode17 [row 23] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 22], 1 + pinsrb m0, [r3 + 23], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 12], 1 + pinsrb m2, [r3 + 14], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1006 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 2], 1 + pinsrb m1, [r3 + 4], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 5], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1007 * 16], m3 + + ; mode17 [row 24] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 23], 1 + pinsrb m0, [r3 + 25], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 14], 1 + pinsrb m2, [r3 + 15], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1008 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 4], 1 + pinsrb m1, [r3 + 5], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 4], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1009 * 16], m3 + + ; mode17 [row 25] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 25], 1 + pinsrb m0, [r3 + 26], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 15], 1 + pinsrb m2, [r3 + 16], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1010 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 5], 1 + pinsrb m1, [r3 + 6], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 3], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1011 * 16], m3 
+ + ; mode17 [row 26] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1012 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1013 * 16], m3 + + ; mode17 [row 27] + movu m6, [r5 + 8 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 26], 1 + pinsrb m0, [r3 + 27], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 16], 1 + pinsrb m2, [r3 + 17], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1014 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 6], 1 + pinsrb m1, [r3 + 7], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 2], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1015 * 16], m3 + + ; mode17 [row 28] + movu m6, [r5 + 14 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 27], 1 + pinsrb m0, [r3 + 28], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 17], 1 + pinsrb m2, [r3 + 18], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1016 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 7], 1 + pinsrb m1, [r3 + 9], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 1], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1017 * 16], m3 + + ; mode17 [row 29] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r3 + 28], 1 + pinsrb m0, [r3 + 30], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 18], 1 + pinsrb m2, [r3 + 20], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1018 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 9], 1 + pinsrb m1, [r3 + 10], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrw m4, [r4 + 0], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1019 * 16], m3 + + ; mode17 [row 30] + movu m6, [r5 + 26 * 16] + pslldq 
m0, 2 + pinsrb m0, [r3 + 30], 1 + pinsrb m0, [r3 + 31], 0 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 20], 1 + pinsrb m2, [r3 + 21], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1020 * 16], m3 + + pslldq m1, 2 + pinsrb m1, [r3 + 10], 1 + pinsrb m1, [r3 + 11], 0 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pslldq m4, 2 + pinsrb m4, [r4 + 0], 1 + pinsrb m4, [r3 + 1], 0 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1021 * 16], m3 + + ; mode17 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1022 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1022 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1023 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1023 * 16 + 8], m5 + + ;mode 18[row 0] + movu m0, [r3] + movu [r0 + 1024 * 16], m0 + movu m1, [r3 + 16] + movu [r0 + 1025 * 16], m1 + + ;mode 18[row 1] + pslldq m0, 1 + pinsrb m0, [r4 + 1], 0 + movu [r0 + 1026 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 15], 0 + movu [r0 + 1027 * 16], m1 + + ;mode 18[row 2] + pslldq m0, 1 + pinsrb m0, [r4 + 2], 0 + movu [r0 + 1028 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 14], 0 + movu [r0 + 1029 * 16], m1 + + ;mode 18[row 3] + pslldq m0, 1 + pinsrb m0, [r4 + 3], 0 + movu [r0 + 1030 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 13], 0 + movu [r0 + 1031 * 16], m1 + + ;mode 18[row 4] + pslldq m0, 1 + pinsrb m0, [r4 + 4], 0 + movu [r0 + 1032 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 12], 0 + movu [r0 + 1033 * 16], m1 + + ;mode 18[row 5] + pslldq m0, 1 + pinsrb m0, [r4 + 5], 0 + movu [r0 + 1034 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 11], 0 + movu [r0 + 1035 * 16], m1 + + ;mode 18[row 6] + pslldq m0, 1 + pinsrb m0, [r4 + 6], 0 + movu [r0 + 1036 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 10], 0 + movu [r0 + 1037 * 16], m1 + + ;mode 18[row 7] + pslldq m0, 1 + pinsrb m0, [r4 + 7], 0 + movu [r0 + 1038 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 9], 0 + movu [r0 + 1039 * 16], m1 + + ;mode 18[row 8] + 
pslldq m0, 1 + pinsrb m0, [r4 + 8], 0 + movu [r0 + 1040 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 8], 0 + movu [r0 + 1041 * 16], m1 + + ;mode 18[row 9] + pslldq m0, 1 + pinsrb m0, [r4 + 9], 0 + movu [r0 + 1042 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 7], 0 + movu [r0 + 1043 * 16], m1 + + ;mode 18[row 10] + pslldq m0, 1 + pinsrb m0, [r4 + 10], 0 + movu [r0 + 1044 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 6], 0 + movu [r0 + 1045 * 16], m1 + + ;mode 18[row 11] + pslldq m0, 1 + pinsrb m0, [r4 + 11], 0 + movu [r0 + 1046 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 5], 0 + movu [r0 + 1047 * 16], m1 + + ;mode 18[row 12] + pslldq m0, 1 + pinsrb m0, [r4 + 12], 0 + movu [r0 + 1048 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 4], 0 + movu [r0 + 1049 * 16], m1 + + ;mode 18[row 13] + pslldq m0, 1 + pinsrb m0, [r4 + 13], 0 + movu [r0 + 1050 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 3], 0 + movu [r0 + 1051 * 16], m1 + + ;mode 18[row 14] + pslldq m0, 1 + pinsrb m0, [r4 + 14], 0 + movu [r0 + 1052 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 2], 0 + movu [r0 + 1053 * 16], m1 + + ;mode 18[row 15] + pslldq m0, 1 + pinsrb m0, [r4 + 15], 0 + movu [r0 + 1054 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 1], 0 + movu [r0 + 1055 * 16], m1 + + ;mode 18[row 16] + pslldq m0, 1 + pinsrb m0, [r4 + 16], 0 + movu [r0 + 1056 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r3 + 0], 0 + movu [r0 + 1057 * 16], m1 + + ;mode 18[row 17] + pslldq m0, 1 + pinsrb m0, [r4 + 17], 0 + movu [r0 + 1058 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 1], 0 + movu [r0 + 1059 * 16], m1 + + ;mode 18[row 18] + pslldq m0, 1 + pinsrb m0, [r4 + 18], 0 + movu [r0 + 1060 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 2], 0 + movu [r0 + 1061 * 16], m1 + + ;mode 18[row 19] + pslldq m0, 1 + pinsrb m0, [r4 + 19], 0 + movu [r0 + 1062 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 3], 0 + movu [r0 + 1063 * 16], m1 + + ;mode 18[row 20] + pslldq m0, 1 + pinsrb m0, [r4 + 20], 0 + movu [r0 + 1064 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 4], 0 
+ movu [r0 + 1065 * 16], m1 + + ;mode 18[row 21] + pslldq m0, 1 + pinsrb m0, [r4 + 21], 0 + movu [r0 + 1066 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 5], 0 + movu [r0 + 1067 * 16], m1 + + ;mode 18[row 22] + pslldq m0, 1 + pinsrb m0, [r4 + 22], 0 + movu [r0 + 1068 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 6], 0 + movu [r0 + 1069 * 16], m1 + + ;mode 18[row 23] + pslldq m0, 1 + pinsrb m0, [r4 + 23], 0 + movu [r0 + 1070 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 7], 0 + movu [r0 + 1071 * 16], m1 + + ;mode 18[row 24] + pslldq m0, 1 + pinsrb m0, [r4 + 24], 0 + movu [r0 + 1072 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 8], 0 + movu [r0 + 1073 * 16], m1 + + ;mode 18[row 25] + pslldq m0, 1 + pinsrb m0, [r4 + 25], 0 + movu [r0 + 1074 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 9], 0 + movu [r0 + 1075 * 16], m1 + + ;mode 18[row 26] + pslldq m0, 1 + pinsrb m0, [r4 + 26], 0 + movu [r0 + 1076 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 10], 0 + movu [r0 + 1077 * 16], m1 + + ;mode 18[row 27] + pslldq m0, 1 + pinsrb m0, [r4 + 27], 0 + movu [r0 + 1078 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 11], 0 + movu [r0 + 1079 * 16], m1 + + ;mode 18[row 28] + pslldq m0, 1 + pinsrb m0, [r4 + 28], 0 + movu [r0 + 1080 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 12], 0 + movu [r0 + 1081 * 16], m1 + + ;mode 18[row 29] + pslldq m0, 1 + pinsrb m0, [r4 + 29], 0 + movu [r0 + 1082 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 13], 0 + movu [r0 + 1083 * 16], m1 + + ;mode 18[row 30] + pslldq m0, 1 + pinsrb m0, [r4 + 30], 0 + movu [r0 + 1084 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 14], 0 + movu [r0 + 1085 * 16], m1 + + ;mode 18[row 31] + pslldq m0, 1 + pinsrb m0, [r4 + 31], 0 + movu [r0 + 1086 * 16], m0 + pslldq m1, 1 + pinsrb m1, [r4 + 15], 0 + movu [r0 + 1087 * 16], m1 + + ; mode 19 [row 0] + movu m6, [r5 + 6 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 
+ pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 1088 * 16], m1 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1089 * 16], m4 + + ; mode 19 [row 1] + movu m6, [r5 + 12 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 1], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1090 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1091 * 16], m4 + + ; mode 19 [row 2] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 1], 1 + pinsrb m0, [r4 + 2], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1092 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 14], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 22], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1093 * 16], m4 + + ; mode 19 [row 3] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 2], 1 + pinsrb m0, [r4 + 4], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1094 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 13], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 21], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1095 * 16], m4 + + ; mode 19 [row 4] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 4], 1 + pinsrb m0, [r4 + 5], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 4], 0 + pmaddubsw m5, 
m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1096 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 12], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 20], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1097 * 16], m4 + + ; mode 19 [row 5] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1098 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1099 * 16], m4 + + ; mode 19 [row 6] + movu m6, [r5 + 10 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 5], 1 + pinsrb m0, [r4 + 6], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1100 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 11], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 19], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1101 * 16], m4 + + ; mode 19 [row 7] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 6], 1 + pinsrb m0, [r4 + 7], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1102 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 10], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 18], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1103 * 16], m4 + + ; mode 19 [row 8] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 7], 1 + pinsrb m0, [r4 + 9], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1104 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 9], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 17], 0 + pmaddubsw m5, m3, m6 + 
pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1105 * 16], m4 + + ; mode 19 [row 9] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 9], 1 + pinsrb m0, [r4 + 10], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1106 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 8], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 16], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1107 * 16], m4 + + ; mode 19 [row 10] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1108 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1109 * 16], m4 + + ; mode 19 [row 11] + movu m6, [r5 + 8 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 10], 1 + pinsrb m0, [r4 + 11], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 0], 1 + pinsrb m2, [r4 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1110 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 7], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 15], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1111 * 16], m4 + + ; mode 19 [row 12] + movu m6, [r5 + 14 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 11], 1 + pinsrb m0, [r4 + 12], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 1], 1 + pinsrb m2, [r4 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1112 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 6], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 14], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1113 * 16], m4 + + ; mode 19 [row 13] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 12], 1 + pinsrb m0, [r4 
+ 14], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 2], 1 + pinsrb m2, [r4 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1114 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 5], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 13], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1115 * 16], m4 + + ; mode 19 [row 14] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 14], 1 + pinsrb m0, [r4 + 15], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 4], 1 + pinsrb m2, [r4 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1116 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 4], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 12], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1117 * 16], m4 + + ; mode19 [row 15] + pshufb m5, m0, [tab_S2] + movh [r0 + 1118 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1118 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1119 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1119 * 16 + 8], m5 + + ; mode 19 [row 16] + movu m6, [r5 + 6 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 15], 1 + pinsrb m0, [r4 + 16], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 5], 1 + pinsrb m2, [r4 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1120 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 3], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 11], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1121 * 16], m4 + + ; mode 19 [row 17] + movu m6, [r5 + 12 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 16], 1 + pinsrb m0, [r4 + 17], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 6], 1 + pinsrb m2, [r4 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1122 * 16], 
m4 + pslldq m1, 2 + pinsrw m1, [r3 + 2], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 10], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1123 * 16], m4 + + ; mode 19 [row 18] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 17], 1 + pinsrb m0, [r4 + 18], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 7], 1 + pinsrb m2, [r4 + 9], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1124 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 1], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 9], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1125 * 16], m4 + + ; mode 19 [row 19] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 18], 1 + pinsrb m0, [r4 + 20], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 9], 1 + pinsrb m2, [r4 + 10], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1126 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 0], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 8], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1127 * 16], m4 + + ; mode 19 [row 20] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 20], 1 + pinsrb m0, [r4 + 21], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 10], 1 + pinsrb m2, [r4 + 11], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1128 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 0], 1 + pinsrb m1, [r4 + 1], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrb m3, [r3 + 8], 1 + pinsrb m3, [r3 + 7], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1129 * 16], m4 + + ; mode 19 [row 21] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1130 * 16], m4 + 
pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1131 * 16], m4 + + ; mode 19 [row 22] + movu m6, [r5 + 10 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 21], 1 + pinsrb m0, [r4 + 22], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 11], 1 + pinsrb m2, [r4 + 12], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1132 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 1], 1 + pinsrb m1, [r4 + 2], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 6], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1133 * 16], m4 + + ; mode 19 [row 23] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 22], 1 + pinsrb m0, [r4 + 23], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 12], 1 + pinsrb m2, [r4 + 14], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1134 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 2], 1 + pinsrb m1, [r4 + 4], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 5], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1135 * 16], m4 + + ; mode 19 [row 24] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 23], 1 + pinsrb m0, [r4 + 25], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 14], 1 + pinsrb m2, [r4 + 15], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1136 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 4], 1 + pinsrb m1, [r4 + 5], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 4], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1137 * 16], m4 + + ; mode 19 [row 25] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 25], 1 + pinsrb m0, [r4 + 26], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 15], 1 + pinsrb m2, [r4 + 16], 0 + pmaddubsw m5, 
m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1138 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 5], 1 + pinsrb m1, [r4 + 6], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 3], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1139 * 16], m4 + + ; mode 19 [row 26] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1140 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1141 * 16], m4 + + ; mode 19 [row 27] + movu m6, [r5 + 8 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 26], 1 + pinsrb m0, [r4 + 27], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 16], 1 + pinsrb m2, [r4 + 17], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1142 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 6], 1 + pinsrb m1, [r4 + 7], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 2], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1143 * 16], m4 + + ; mode 19 [row 28] + movu m6, [r5 + 14 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 27], 1 + pinsrb m0, [r4 + 28], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 17], 1 + pinsrb m2, [r4 + 18], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1144 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 7], 1 + pinsrb m1, [r4 + 9], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 1], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1145 * 16], m4 + + ; mode 19 [row 29] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 28], 1 + pinsrb m0, [r4 + 30], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 18], 1 + pinsrb m2, [r4 + 20], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1146 * 
16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 9], 1 + pinsrb m1, [r4 + 10], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 0], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1147 * 16], m4 + + ; mode 19 [row 30] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 30], 1 + pinsrb m0, [r4 + 31], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 20], 1 + pinsrb m2, [r4 + 21], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1148 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 10], 1 + pinsrb m1, [r4 + 11], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrb m3, [r4 + 0], 1 + pinsrb m3, [r4 + 1], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1149 * 16], m4 + + ; mode19 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1150 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1150 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1151 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1151 * 16 + 8], m5 + + ; mode 20 [row 0] + movu m6, [r5 + 11 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 1152 * 16], m1 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1153 * 16], m4 + + ; mode 20 [row 1] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 2], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1154 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 
23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1155 * 16], m4 + + ; mode 20 [row 2] + movu m6, [r5 + 1 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1156 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1157 * 16], m4 + + ; mode 20 [row 3] + movu m6, [r5 + 12 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 2], 1 + pinsrb m0, [r4 + 3], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1158 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 14], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 22], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1159 * 16], m4 + + ; mode 20 [row 4] + movu m6, [r5 + 23 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 3], 1 + pinsrb m0, [r4 + 5], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1160 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 13], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 21], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1161 * 16], m4 + + ; mode 20 [row 5] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1162 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1163 * 16], m4 + + ; mode 20 [row 6] + movu m6, [r5 + 13 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 5], 1 + pinsrb m0, [r4 + 6], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1164 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 
12], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 20], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1165 * 16], m4 + + ; mode 20 [row 7] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 6], 1 + pinsrb m0, [r4 + 8], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1166 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 11], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 19], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1167 * 16], m4 + + ; mode 20 [row 8] + movu m6, [r5 + 3 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1168 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1169 * 16], m4 + + ; mode 20 [row 9] + movu m6, [r5 + 14 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 8], 1 + pinsrb m0, [r4 + 9], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 3], 1 + pinsrb m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1170 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 10], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 18], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1171 * 16], m4 + + ; mode 20 [row 10] + movu m6, [r5 + 25 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 9], 1 + pinsrb m0, [r4 + 11], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1172 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 9], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 17], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1173 * 16], m4 + + ; mode 20 [row 
11] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1174 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1175 * 16], m4 + + ; mode 20 [row 12] + movu m6, [r5 + 15 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 11], 1 + pinsrb m0, [r4 + 12], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r3 + 1], 1 + pinsrb m2, [r3 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1176 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 8], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 16], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1177 * 16], m4 + + ; mode 20 [row 13] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 12], 1 + pinsrb m0, [r4 + 14], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 0], 1 + pinsrb m2, [r4 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1178 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 7], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 15], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1179 * 16], m4 + + ; mode 20 [row 14] + movu m6, [r5 + 5 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1180 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1181 * 16], m4 + + ; mode 20 [row 15] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 14], 1 + pinsrb m0, [r4 + 15], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 2], 1 + pinsrb m2, [r4 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1182 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 6], 0 + pmaddubsw m4, m1, m6 + 
pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 14], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1183 * 16], m4 + + ; mode 20 [row 16] + movu m6, [r5 + 27 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 15], 1 + pinsrb m0, [r4 + 17], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 3], 1 + pinsrb m2, [r4 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1184 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 5], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 13], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1185 * 16], m4 + + ; mode 20 [row 17] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1186 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1187 * 16], m4 + + ; mode 20 [row 18] + movu m6, [r5 + 17 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 17], 1 + pinsrb m0, [r4 + 18], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 5], 1 + pinsrb m2, [r4 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1188 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 4], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 12], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1189 * 16], m4 + + ; mode 20 [row 19] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 18], 1 + pinsrb m0, [r4 + 20], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 6], 1 + pinsrb m2, [r4 + 8], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1190 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 3], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 11], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1191 * 16], 
m4 + + ; mode 20 [row 20] + movu m6, [r5 + 7 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1192 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1193 * 16], m4 + + ; mode 20 [row 21] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 20], 1 + pinsrb m0, [r4 + 21], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 8], 1 + pinsrb m2, [r4 + 9], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1194 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 2], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 10], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1195 * 16], m4 + + ; mode 20 [row 22] + movu m6, [r5 + 29 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 21], 1 + pinsrb m0, [r4 + 23], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 9], 1 + pinsrb m2, [r4 + 11], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1196 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 1], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 9], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1197 * 16], m4 + + ; mode 20 [row 23] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1198 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1199 * 16], m4 + + ; mode 20 [row 24] + movu m6, [r5 + 19 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 23], 1 + pinsrb m0, [r4 + 24], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 11], 1 + pinsrb m2, [r4 + 12], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1200 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 0], 0 + 
pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 8], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1201 * 16], m4 + + ; mode 20 [row 25] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 24], 1 + pinsrb m0, [r4 + 26], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 12], 1 + pinsrb m2, [r4 + 14], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1202 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 0], 1 + pinsrb m1, [r4 + 2], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 7], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1203 * 16], m4 + + ; mode 20 [row 26] + movu m6, [r5 + 9 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1204 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1205 * 16], m4 + + ; mode 20 [row 27] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 26], 1 + pinsrb m0, [r4 + 27], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 14], 1 + pinsrb m2, [r4 + 15], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1206 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 2], 1 + pinsrb m1, [r4 + 3], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 6], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1207 * 16], m4 + + ; mode 20 [row 28] + movu m6, [r5 + 31 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 27], 1 + pinsrb m0, [r4 + 29], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 15], 1 + pinsrb m2, [r4 + 17], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1208 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 3], 1 + pinsrb m1, [r4 + 5], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw 
m3, [r3 + 5], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1209 * 16], m4 + + ; mode 20 [row 29] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1210 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1211 * 16], m4 + + ; mode 20 [row 30] + movu m6, [r5 + 21 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 29], 1 + pinsrb m0, [r4 + 30], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 17], 1 + pinsrb m2, [r4 + 18], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1212 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r4 + 5], 1 + pinsrb m1, [r4 + 6], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 4], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1213 * 16], m4 + + ; mode20 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1214 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1214 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1215 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1215 * 16 + 8], m5 + + ; mode 21 [row 0] + movu m6, [r5 + 15 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 1216 * 16], m1 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1217 * 16], m4 + + ; mode 21 [row 1] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 2], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + 
movu [r0 + 1218 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1219 * 16], m4 + + ; mode 21 [row 2] + movu m6, [r5 + 13 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1220 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1221 * 16], m4 + + ; mode 21 [row 3] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 2], 1 + pinsrb m0, [r4 + 4], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1222 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 14], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 22], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1223 * 16], m4 + + ; mode 21 [row 4] + movu m6, [r5 + 11 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1224 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1225 * 16], m4 + + ; mode 21 [row 5] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 4], 1 + pinsrb m0, [r4 + 6], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1226 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 13], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 21], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1227 * 16], m4 + + ; mode 21 [row 6] + movu m6, [r5 + 9 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1228 * 16], 
m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1229 * 16], m4 + + ; mode 21 [row 7] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 6], 1 + pinsrb m0, [r4 + 8], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1230 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 12], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 20], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1231 * 16], m4 + + ; mode 21 [row 8] + movu m6, [r5 + 7 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1232 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1233 * 16], m4 + + ; mode 21 [row 9] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 8], 1 + pinsrb m0, [r4 + 9], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1234 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 11], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 19], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1235 * 16], m4 + + ; mode 21 [row 10] + movu m6, [r5 + 5 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1236 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1237 * 16], m4 + + ; mode 21 [row 11] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 9], 1 + pinsrb m0, [r4 + 11], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1238 * 16], 
m4 + pslldq m1, 2 + pinsrw m1, [r3 + 10], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 18], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1239 * 16], m4 + + ; mode 21 [row 12] + movu m6, [r5 + 3 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1240 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1241 * 16], m4 + + ; mode 21 [row 13] + movu m6, [r5 + 18 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 11], 1 + pinsrb m0, [r4 + 13], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1242 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 9], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 17], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1243 * 16], m4 + + ; mode 21 [row 14] + movu m6, [r5 + 1 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1244 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1245 * 16], m4 + + ; mode 21 [row 15] + movu m6, [r5 + 16 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 13], 1 + pinsrb m0, [r4 + 15], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1246 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 8], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 16], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1247 * 16], m4 + + ; mode 21 [row 16] + movu m6, [r5 + 31 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 15], 1 + pinsrb m0, [r4 + 17], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 
0], 1 + pinsrb m2, [r4 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1248 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 7], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 15], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1249 * 16], m4 + + ; mode 21 [row 17] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1250 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1251 * 16], m4 + + ; mode 21 [row 18] + movu m6, [r5 + 29 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 17], 1 + pinsrb m0, [r4 + 19], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 2], 1 + pinsrb m2, [r4 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1252 * 16], m4 + pslldq m1, 2 + pinsrb m1, [r3 + 7], 1 + pinsrb m1, [r3 + 6], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrb m3, [r3 + 15], 1 + pinsrb m3, [r3 + 14], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1253 * 16], m4 + + ; mode 21 [row 19] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1254 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1255 * 16], m4 + + ; mode 21 [row 20] + movu m6, [r5 + 27 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 19], 1 + pinsrb m0, [r4 + 21], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 4], 1 + pinsrb m2, [r4 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1256 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 5], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 13], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + 
movu [r0 + 1257 * 16], m4 + + ; mode 21 [row 21] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1258 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1259 * 16], m4 + + ; mode 21 [row 22] + movu m6, [r5 + 25 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 21], 1 + pinsrb m0, [r4 + 23], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 6], 1 + pinsrb m2, [r4 + 8], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1260 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 4], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 12], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1261 * 16], m4 + + ; mode 21 [row 23] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1262 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1263 * 16], m4 + + ; mode 21 [row 24] + movu m6, [r5 + 23 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 23], 1 + pinsrb m0, [r4 + 24], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 8], 1 + pinsrb m2, [r4 + 9], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1264 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 3], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 11], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1265 * 16], m4 + + ; mode 21 [row 25] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1266 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1267 * 16], m4 + + ; mode 21 [row 26] + 
movu m6, [r5 + 21 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 24], 1 + pinsrb m0, [r4 + 26], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 9], 1 + pinsrb m2, [r4 + 11], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1268 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 2], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 10], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1269 * 16], m4 + + ; mode 21 [row 27] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1270 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1271 * 16], m4 + + ; mode 21 [row 28] + movu m6, [r5 + 19 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 26], 1 + pinsrb m0, [r4 + 28], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 11], 1 + pinsrb m2, [r4 + 13], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1272 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 1], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 9], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1273 * 16], m4 + + ; mode 21 [row 29] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1274 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1275 * 16], m4 + + ; mode 21 [row 30] + movu m6, [r5 + 17 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 28], 1 + pinsrb m0, [r4 + 30], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 13], 1 + pinsrb m2, [r4 + 15], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1276 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 0], 0 + pmaddubsw m4, m1, m6 + 
pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 8], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1277 * 16], m4 + + ; mode21 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1278 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1278 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1279 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1279 * 16 + 8], m5 + + ; mode 22 [row 0] + movu m6, [r5 + 19 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 1280 * 16], m1 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1281 * 16], m4 + + ; mode 22 [row 1] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1282 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1283 * 16], m4 + + ; mode 22 [row 2] + movu m6, [r5 + 25 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 2], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1284 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1285 * 16], m4 + + ; mode 22 [row 3] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1286 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, 
m5 + movu [r0 + 1287 * 16], m4 + + ; mode 22 [row 4] + movu m6, [r5 + 31 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 2], 1 + pinsrb m0, [r4 + 5], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1288 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 14], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 22], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1289 * 16], m4 + + ; mode 22 [row 5] + movu m6, [r5 + 18 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1290 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1291 * 16], m4 + + ; mode 22 [row 6] + movu m6, [r5 + 5 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1292 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1293 * 16], m4 + + ; mode 22 [row 7] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 5], 1 + pinsrb m0, [r4 + 7], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1294 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 13], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 21], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1295 * 16], m4 + + ; mode 22 [row 8] + movu m6, [r5 + 11 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1296 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1297 * 16], m4 + + ; mode 22 [row 9] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, 
[r4 + 7], 1 + pinsrb m0, [r4 + 10], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1298 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 12], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 20], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1299 * 16], m4 + + ; mode 22 [row 10] + movu m6, [r5 + 17 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1300 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1301 * 16], m4 + + ; mode 22 [row 11] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1302 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1303 * 16], m4 + + ; mode 22 [row 12] + movu m6, [r5 + 23 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 10], 1 + pinsrb m0, [r4 + 12], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1304 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 11], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 19], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1305 * 16], m4 + + ; mode 22 [row 13] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1306 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1307 * 16], m4 + + ; mode 22 [row 14] + movu m6, [r5 + 29 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 12], 1 + pinsrb m0, [r4 + 15], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + 
pinsrw m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1308 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 10], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 18], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1309 * 16], m4 + + ; mode 22 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1310 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1311 * 16], m4 + + ; mode 22 [row 16] + movu m6, [r5 + 3 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1312 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1313 * 16], m4 + + ; mode 22 [row 17] + movu m6, [r5 + 22 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 15], 1 + pinsrb m0, [r4 + 17], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1314 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 9], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 17], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1315 * 16], m4 + + ; mode 22 [row 18] + movu m6, [r5 + 9 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1316 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1317 * 16], m4 + + ; mode 22 [row 19] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 17], 1 + pinsrb m0, [r4 + 20], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1318 
* 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 8], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 16], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1319 * 16], m4 + + ; mode 22 [row 20] + movu m6, [r5 + 15 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1320 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1321 * 16], m4 + + ; mode 22 [row 21] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1322 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1323 * 16], m4 + + ; mode 22 [row 22] + movu m6, [r5 + 21 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 20], 1 + pinsrb m0, [r4 + 22], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 0], 1 + pinsrb m2, [r4 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1324 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 7], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 15], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1325 * 16], m4 + + ; mode 22 [row 23] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1326 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1327 * 16], m4 + + ; mode 22 [row 24] + movu m6, [r5 + 27 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 22], 1 + pinsrb m0, [r4 + 25], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 2], 1 + pinsrb m2, [r4 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1328 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 6], 0 
+ pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 14], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1329 * 16], m4 + + ; mode 22 [row 25] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1330 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1331 * 16], m4 + + ; mode 22 [row 26] + movu m6, [r5 + 1 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1332 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1333 * 16], m4 + + ; mode 22 [row 27] + movu m6, [r5 + 20 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 25], 1 + pinsrb m0, [r4 + 27], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 5], 1 + pinsrb m2, [r4 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1334 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 5], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 13], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1335 * 16], m4 + + ; mode 22 [row 28] + movu m6, [r5 + 7 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1336 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1337 * 16], m4 + + ; mode 22 [row 29] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 27], 1 + pinsrb m0, [r4 + 30], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrb m2, [r4 + 7], 1 + pinsrb m2, [r4 + 10], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1338 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 4], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq 
m3, 2 + pinsrw m3, [r3 + 12], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1339 * 16], m4 + + ; mode 22 [row 30] + movu m6, [r5 + 13 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1340 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1341 * 16], m4 + + ; mode22 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1342 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1342 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1343 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1343 * 16 + 8], m5 + + ; mode 23 [row 0] + movu m6, [r5 + 23 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m1, m0, m6 + pmulhrsw m1, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m3, m2, m6 + pmulhrsw m3, m7 + packuswb m1, m3 + movu [r0 + 1344 * 16], m1 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1345 * 16], m4 + + ; mode 23 [row 1] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1346 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1347 * 16], m4 + + ; mode 23 [row 2] + movu m6, [r5 + 5 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1348 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1349 * 16], m4 + + ; mode 23 [row 3] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 4], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw 
m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1350 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1351 * 16], m4 + + ; mode 23 [row 4] + movu m6, [r5 + 19 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1352 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1353 * 16], m4 + + ; mode 23 [row 5] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1354 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1355 * 16], m4 + + ; mode 23 [row 6] + movu m6, [r5 + 1 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1356 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1357 * 16], m4 + + ; mode 23 [row 7] + movu m6, [r5 + 24 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 4], 1 + pinsrb m0, [r4 + 7], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1358 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 14], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 22], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1359 * 16], m4 + + ; mode 23 [row 8] + movu m6, [r5 + 15 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1360 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu 
[r0 + 1361 * 16], m4 + + ; mode 23 [row 9] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1362 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1363 * 16], m4 + + ; mode 23 [row 10] + movu m6, [r5 + 29 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 7], 1 + pinsrb m0, [r4 + 11], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1364 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 13], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 21], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1365 * 16], m4 + + ; mode 23 [row 11] + movu m6, [r5 + 20 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1366 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1367 * 16], m4 + + ; mode 23 [row 12] + movu m6, [r5 + 11 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1368 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1369 * 16], m4 + + ; mode 23 [row 13] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1370 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1371 * 16], m4 + + ; mode 23 [row 14] + movu m6, [r5 + 25 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 11], 1 + pinsrb m0, [r4 + 14], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 
1372 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 12], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 20], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1373 * 16], m4 + + ; mode 23 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1374 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1375 * 16], m4 + + ; mode 23 [row 16] + movu m6, [r5 + 7 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1376 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1377 * 16], m4 + + ; mode 23 [row 17] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 14], 1 + pinsrb m0, [r4 + 18], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 3], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1378 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 11], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 19], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1379 * 16], m4 + + ; mode 23 [row 18] + movu m6, [r5 + 21 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1380 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1381 * 16], m4 + + ; mode 23 [row 19] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1382 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1383 * 16], m4 + + ; mode 23 [row 20] + movu m6, [r5 + 3 * 16] + pmaddubsw 
m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1384 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1385 * 16], m4 + + ; mode 23 [row 21] + movu m6, [r5 + 26 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 18], 1 + pinsrb m0, [r4 + 21], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 2], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1386 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 10], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 18], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1387 * 16], m4 + + ; mode 23 [row 22] + movu m6, [r5 + 17 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1388 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1389 * 16], m4 + + ; mode 23 [row 23] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1390 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1391 * 16], m4 + + ; mode 23 [row 24] + movu m6, [r5 + 31 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 21], 1 + pinsrb m0, [r4 + 25], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 1], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1392 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 9], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 17], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1393 * 16], m4 + + ; mode 23 [row 25] + movu m6, [r5 + 22 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 
+ 1394 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1395 * 16], m4 + + ; mode 23 [row 26] + movu m6, [r5 + 13 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1396 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1397 * 16], m4 + + ; mode 23 [row 27] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1398 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1399 * 16], m4 + + ; mode 23 [row 28] + movu m6, [r5 + 27 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 25], 1 + pinsrb m0, [r4 + 28], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 0], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1400 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 8], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 16], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1401 * 16], m4 + + ; mode 23 [row 29] + movu m6, [r5 + 18 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1402 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1403 * 16], m4 + + ; mode 23 [row 30] + movu m6, [r5 + 9 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1404 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1405 * 16], m4 + + ; mode23 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1406 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1406 * 16 + 8], m5 + pshufb 
m5, m1, [tab_S2] + movh [r0 + 1407 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1407 * 16 + 8], m5 + + ; mode 24 [row 0] + movu m6, [r5 + 27 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1408 * 16], m4 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1409 * 16], m4 + + ; mode 24 [row 1] + movu m6, [r5 + 22 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1410 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1411 * 16], m4 + + ; mode 24 [row 2] + movu m6, [r5 + 17 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1412 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1413 * 16], m4 + + ; mode 24 [row 3] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1414 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1415 * 16], m4 + + ; mode 24 [row 4] + movu m6, [r5 + 7 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1416 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1417 * 16], m4 + + ; mode 24 [row 5] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + 
packuswb m4, m5 + movu [r0 + 1418 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1419 * 16], m4 + + ; mode 24 [row 6] + movu m6, [r5 + 29 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 6], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1420 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1421 * 16], m4 + + ; mode 24 [row 7] + movu m6, [r5 + 24 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1422 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1423 * 16], m4 + + ; mode 24 [row 8] + movu m6, [r5 + 19 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1424 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1425 * 16], m4 + + ; mode 24 [row 9] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1426 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1427 * 16], m4 + + ; mode 24 [row 10] + movu m6, [r5 + 9 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1428 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1429 * 16], m4 + + ; mode 24 [row 11] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1430 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1431 * 16], m4 + + ; mode 24 [row 12] + movu m6, [r5 + 31 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 6], 1 + pinsrb m0, [r4 + 13], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 6], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1432 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 14], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 22], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1433 * 16], m4 + + ; mode 24 [row 13] + movu m6, [r5 + 26 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1434 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1435 * 16], m4 + + ; mode 24 [row 14] + movu m6, [r5 + 21 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1436 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1437 * 16], m4 + + ; mode 24 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1438 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1439 * 16], m4 + + ; mode 24 [row 16] + movu m6, [r5 + 11 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1440 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1441 * 16], m4 + + ; mode 24 [row 17] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + 
pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1442 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1443 * 16], m4 + + ; mode 24 [row 18] + movu m6, [r5 + 1 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1444 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1445 * 16], m4 + + ; mode 24 [row 19] + movu m6, [r5 + 28 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 13], 1 + pinsrb m0, [r4 + 19], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 5], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1446 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 13], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 21], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1447 * 16], m4 + + ; mode 24 [row 20] + movu m6, [r5 + 23 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1448 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1449 * 16], m4 + + ; mode 24 [row 21] + movu m6, [r5 + 18 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1450 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1451 * 16], m4 + + ; mode 24 [row 22] + movu m6, [r5 + 13 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1452 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1453 * 16], m4 + + ; mode 24 [row 23] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + 
pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1454 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1455 * 16], m4 + + ; mode 24 [row 24] + movu m6, [r5 + 3 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1456 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1457 * 16], m4 + + ; mode 24 [row 25] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 19], 1 + pinsrb m0, [r4 + 26], 0 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 4], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1458 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 12], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 20], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1459 * 16], m4 + + ; mode 24 [row 26] + movu m6, [r5 + 25 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1460 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1461 * 16], m4 + + ; mode 24 [row 27] + movu m6, [r5 + 20 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1462 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1463 * 16], m4 + + ; mode 24 [row 28] + movu m6, [r5 + 15 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1464 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1465 * 16], m4 + + ; mode 24 [row 29] + movu m6, [r5 + 10 * 16] + 
pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1466 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1467 * 16], m4 + + ; mode 24 [row 30] + movu m6, [r5 + 5 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1468 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1469 * 16], m4 + + ; mode 24 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1470 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1470 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1471 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1471 * 16 + 8], m5 + + ; mode 25 [row 0] + movu m6, [r5 + 30 * 16] + movu m0, [r3 ] + movu m1, [r3 + 1 ] + punpcklbw m0, m1 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + movu m2, [r3 + 8] + movu m3, [r3 + 9] + punpcklbw m2, m3 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1472 * 16], m4 + + movu m1, [r3 + 16] + movu m3, [r3 + 17] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 24] + movu m5, [r3 + 25] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1473 * 16], m4 + + ; mode 25 [row 1] + movu m6, [r5 + 28 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1474 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1475 * 16], m4 + + ; mode 25 [row 2] + movu m6, [r5 + 26 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1476 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1477 * 16], m4 + + ; mode 25 [row 3] + movu m6, [r5 + 24 
* 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1478 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1479 * 16], m4 + + ; mode 25 [row 4] + movu m6, [r5 + 22 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1480 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1481 * 16], m4 + + ; mode 25 [row 5] + movu m6, [r5 + 20 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1482 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1483 * 16], m4 + + ; mode 25 [row 6] + movu m6, [r5 + 18 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1484 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1485 * 16], m4 + + ; mode 25 [row 7] + movu m6, [r5 + 16 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1486 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1487 * 16], m4 + + ; mode 25 [row 8] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1488 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1489 * 16], m4 + + ; mode 25 [row 9] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1490 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 
+ pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1491 * 16], m4 + + ; mode 25 [row 10] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1492 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1493 * 16], m4 + + ; mode 25 [row 11] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1494 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1495 * 16], m4 + + ; mode 25 [row 12] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1496 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1497 * 16], m4 + + ; mode 25 [row 13] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1498 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1499 * 16], m4 + + ; mode 25 [row 14] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1500 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1501 * 16], m4 + + ; mode 25 [row 15] + pshufb m5, m0, [tab_S2] + movh [r0 + 1502 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1502 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1503 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1503 * 16 + 8], m5 + + ; mode 25 [row 16] + movu m6, [r5 + 30 * 16] + pslldq m0, 2 + pinsrb m0, [r4 + 0], 1 + pinsrb m0, [r4 + 16], 0 + pmaddubsw m4, m0, m6 + 
pmulhrsw m4, m7 + pslldq m2, 2 + pinsrw m2, [r3 + 7], 0 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1504 * 16], m4 + pslldq m1, 2 + pinsrw m1, [r3 + 15], 0 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pslldq m3, 2 + pinsrw m3, [r3 + 23], 0 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1505 * 16], m4 + + ; mode 25 [row 17] + movu m6, [r5 + 28 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1506 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1507 * 16], m4 + + ; mode 25 [row 18] + movu m6, [r5 + 26 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1508 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1509 * 16], m4 + + ; mode 25 [row 19] + movu m6, [r5 + 24 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1510 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1511 * 16], m4 + + ; mode 25 [row 20] + movu m6, [r5 + 22 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1512 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1513 * 16], m4 + + ; mode 25 [row 21] + movu m6, [r5 + 20 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1514 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1515 * 16], m4 + + ; mode 25 [row 22] + movu m6, [r5 + 18 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1516 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1517 * 16], m4 + + ; mode 25 [row 23] + movu m6, [r5 + 16 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1518 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1519 * 16], m4 + + ; mode 25 [row 24] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1520 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1521 * 16], m4 + + ; mode 25 [row 25] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1522 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1523 * 16], m4 + + ; mode 25 [row 26] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1524 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1525 * 16], m4 + + ; mode 25 [row 27] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1526 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1527 * 16], m4 + + ; mode 25 [row 28] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1528 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu 
[r0 + 1529 * 16], m4 + + ; mode 25 [row 29] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1530 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1531 * 16], m4 + + ; mode 25 [row 30] + movu m6, [r5 + 2 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1532 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1533 * 16], m4 + + ; mode 25 [row 31] + pshufb m5, m0, [tab_S2] + movh [r0 + 1534 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1534 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1535 * 16], m5 + pshufb m5, m3, [tab_S2] + movh [r0 + 1535 * 16 + 8], m5 + + ; mode 26 + movu m1, [r1 + 1] + movu m2, [r1 + 17] + movu [r0 + 1536 * 16], m1 + movu [r0 + 1537 * 16], m2 + movu [r0 + 1538 * 16], m1 + movu [r0 + 1539 * 16], m2 + movu [r0 + 1540 * 16], m1 + movu [r0 + 1541 * 16], m2 + movu [r0 + 1542 * 16], m1 + movu [r0 + 1543 * 16], m2 + movu [r0 + 1544 * 16], m1 + movu [r0 + 1545 * 16], m2 + movu [r0 + 1546 * 16], m1 + movu [r0 + 1547 * 16], m2 + movu [r0 + 1548 * 16], m1 + movu [r0 + 1549 * 16], m2 + movu [r0 + 1550 * 16], m1 + movu [r0 + 1551 * 16], m2 + + movu [r0 + 1552 * 16], m1 + movu [r0 + 1553 * 16], m2 + movu [r0 + 1554 * 16], m1 + movu [r0 + 1555 * 16], m2 + movu [r0 + 1556 * 16], m1 + movu [r0 + 1557 * 16], m2 + movu [r0 + 1558 * 16], m1 + movu [r0 + 1559 * 16], m2 + movu [r0 + 1560 * 16], m1 + movu [r0 + 1561 * 16], m2 + movu [r0 + 1562 * 16], m1 + movu [r0 + 1563 * 16], m2 + movu [r0 + 1564 * 16], m1 + movu [r0 + 1565 * 16], m2 + movu [r0 + 1566 * 16], m1 + movu [r0 + 1567 * 16], m2 + + movu [r0 + 1568 * 16], m1 + movu [r0 + 1569 * 16], m2 + movu [r0 + 1570 * 16], m1 + movu [r0 + 1571 * 16], m2 + movu [r0 + 1572 * 16], m1 + movu [r0 + 1573 * 16], m2 
+ movu [r0 + 1574 * 16], m1 + movu [r0 + 1575 * 16], m2 + movu [r0 + 1576 * 16], m1 + movu [r0 + 1577 * 16], m2 + movu [r0 + 1578 * 16], m1 + movu [r0 + 1579 * 16], m2 + movu [r0 + 1580 * 16], m1 + movu [r0 + 1581 * 16], m2 + movu [r0 + 1582 * 16], m1 + movu [r0 + 1583 * 16], m2 + + movu [r0 + 1584 * 16], m1 + movu [r0 + 1585 * 16], m2 + movu [r0 + 1586 * 16], m1 + movu [r0 + 1587 * 16], m2 + movu [r0 + 1588 * 16], m1 + movu [r0 + 1589 * 16], m2 + movu [r0 + 1590 * 16], m1 + movu [r0 + 1591 * 16], m2 + movu [r0 + 1592 * 16], m1 + movu [r0 + 1593 * 16], m2 + movu [r0 + 1594 * 16], m1 + movu [r0 + 1595 * 16], m2 + movu [r0 + 1596 * 16], m1 + movu [r0 + 1597 * 16], m2 + movu [r0 + 1598 * 16], m1 + movu [r0 + 1599 * 16], m2 + + ; mode 27 [row 0] + movu m6, [r5 + 2 * 16] + movu m0, [r3 + 1 ] + movu m1, [r3 + 2 ] + punpcklbw m0, m1 + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + movu m2, [r3 + 9] + movu m3, [r3 + 10] + punpcklbw m2, m3 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1600 * 16], m4 + + movu m1, [r3 + 17] + movu m3, [r3 + 18] + punpcklbw m1, m3 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + movu m3, [r3 + 25] + movu m5, [r3 + 26] + punpcklbw m3, m5 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1601 * 16], m4 + + ; mode 27 [row 1] + movu m6, [r5 + 4 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1602 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1603 * 16], m4 + + ; mode 27 [row 2] + movu m6, [r5 + 6 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1604 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1605 * 16], m4 + + ; mode 27 [row 3] + movu m6, [r5 + 8 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1606 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1607 * 16], m4 + + ; mode 27 [row 4] + movu m6, [r5 + 10 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1608 * 16], m4 + + ; mode 28 [row 1 -first half] + movu [r0 + 1666 * 16], m4 + + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1609 * 16], m4 + + ; mode 28 [row 1 - second half] + movu [r0 + 1667 * 16], m4 + + ; mode 27 [row 5] + movu m6, [r5 + 12 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1610 * 16], m4 + + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1611 * 16], m4 + + ; mode 27 [row 6] + movu m6, [r5 + 14 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1612 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1613 * 16], m4 + + ; mode 27 [row 7] + movu m6, [r5 + 16 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1614 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1615 * 16], m4 + + ; mode 27 [row 8] + movu m6, [r5 + 18 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1616 * 16], m4 + + ; mode 29 [row 1 - first half] + movu [r0 + 1730 * 16], m4 + + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1617 * 16], m4 + + ; mode 29 [row 1 - second half] + movu [r0 + 1731 * 16], m4 + + ; mode 27 [row 9] + movu 
m6, [r5 + 20 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1618 * 16], m4 + + ; mode 28 [row 3 -first half] + movu [r0 + 1670 * 16], m4 + + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1619 * 16], m4 + + ; mode 28 [row 3 -second half] + movu [r0 + 1671 * 16], m4 + + ; mode 27 [row 10] + movu m6, [r5 + 22 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1620 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1621 * 16], m4 + + ; mode 27 [row 11] + movu m6, [r5 + 24 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1622 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1623 * 16], m4 + + ; mode 27 [row 12] + movu m6, [r5 + 26 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1624 * 16], m4 + + ; mode 30 [row 1 - first half] + movu [r0 + 1794 * 16], m4 + + ; mode 33 [row 0 - first half] + movu [r0 + 1984 * 16], m4 + + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1625 * 16], m4 + + ; mode 30 [row 1 - second half] + movu [r0 + 1795 * 16], m4 + + ; mode 33 [row 0 - second half] + movu [r0 + 1985 * 16], m4 + + ; mode 27 [row 13] + movu m6, [r5 + 28 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1626 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1627 * 16], m4 + + ; mode 27 [row 14] + movu m6, [r5 + 30 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1628 * 16], m4 + + ; mode 28 [row 5 first half] + movu [r0 + 1674 * 16], m4 + + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1629 * 16], m4 + + ; mode 28 [row 5 second half] + movu [r0 + 1675 * 16], m4 + + ; mode 28 [row 0] + movu m6, [r5 + 5 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1664 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1665 * 16], m4 + + ; mode 28 [row 2] + movu m6, [r5 + 15 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1668 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1669 * 16], m4 + + ; mode 28 [row 4] + movu m6, [r5 + 25 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1672 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1673 * 16], m4 + + ; mode 30 [row 0] + movu m6, [r5 + 13 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1792 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1793 * 16], m4 + + ; mode 29 [row 0] + movu m6, [r5 + 9 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1728 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1729 * 16], m4 + + ; mode 29 [row 2] + movu m6, [r5 + 27 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1732 
* 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1733 * 16], m4 + + ; mode 31 [row 0] + movu m6, [r5 + 17 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1856 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1857 * 16], m4 + + ; mode 32 [row 0] + movu m6, [r5 + 21 * 16] + pmaddubsw m4, m0, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1920 * 16], m4 + pmaddubsw m4, m1, m6 + pmulhrsw m4, m7 + pmaddubsw m5, m3, m6 + pmulhrsw m5, m7 + packuswb m4, m5 + movu [r0 + 1921 * 16], m4 + + ; mode 27 [row 15] + movu m0, [r3 + 2] + movd m1, [r3 + 3] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 10] + movd m3, [r3 + 11] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 18] + movd m3, [r3 + 19] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 26] + movd m5, [r3 + 27] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1630 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1630 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1631 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1631 * 16 + 8], m5 + + ; mode 27 [row 16] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1632 * 16], m3 + + ; mode 31 [row 1 - first half] + movu [r0 + 1858 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1633 * 16], m3 + + ; mode 31 [row 1 - second half] + movu [r0 + 1859 * 16], m3 + + ; mode 27 [row 17] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1634 * 16], m3 + + ; mode 29 [row 3 - first half] + movu [r0 + 1734 * 16], m3 + + 
pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1635 * 16], m3 + + ; mode 29 [row 3 - second half] + movu [r0 + 1735 * 16], m3 + + ; mode 27 [row 18] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1636 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1637 * 16], m3 + + ; mode 27 [row 19] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1638 * 16], m3 + + ; mode 28 [row 7 - first half] + movu [r0 + 1678 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1639 * 16], m3 + + ; mode 28 [row 7 - second half] + movu [r0 + 1679 * 16], m3 + + ; mode 27 [row 20] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1640 * 16], m3 + + ; mode 32 [row 1 - first half] + movu [r0 + 1922 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1641 * 16], m3 + + ; mode 32 [row 1 - second half] + movu [r0 + 1923 * 16], m3 + + ; mode 27 [row 21] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1642 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1643 * 16], m3 + + ; mode 27 [row 22] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1644 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1645 * 16], m3 + + ; mode 27 [row 23] + 
movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1646 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1647 * 16], m3 + + ; mode 27 [row 24] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1648 * 16], m3 + + ; mode 28 [row 9 - first half] + movu [r0 + 1682 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1649 * 16], m3 + + ; mode 28 [row 9 - second half] + movu [r0 + 1683 * 16], m3 + + ; mode 27 [row 25] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1650 * 16], m3 + + ; mode 30 [row 3 - first half] + movu [r0 + 1798 * 16], m3 + + ; mode 33 [row 1 - first half] + movu [r0 + 1986 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1651 * 16], m3 + + ; mode 30 [row 3 - second half] + movu [r0 + 1799 * 16], m3 + + ; mode 33 [row 1 - second half] + movu [r0 + 1987 * 16], m3 + + ; mode 27 [row 26] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1652 * 16], m3 + + ; mode 29 [row 5 - first half] + movu [r0 + 1738 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1653 * 16], m3 + + ; mode 29 [row 5 - second half] + movu [r0 + 1739 * 16], m3 + + ; mode 27 [row 27] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1654 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 
1655 * 16], m3 + + ; mode 27 [row 28] + movu m6, [r5 + 26 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1656 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1657 * 16], m3 + + ; mode 27 [row 29] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1658 * 16], m3 + + ; mode 28 [row 11 - first half] + movu [r0 + 1686 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1659 * 16], m3 + + ; mode 28 [row 11 - second half] + movu [r0 + 1687 * 16], m3 + + ; mode 27 [row 30] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1660 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1661 * 16], m3 + + ; mode 28 [row 6] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1676 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1677 * 16], m3 + + ; mode 28 [row 8] + movu m6, [r5 + 13 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1680 * 16], m3 + + ; mode 29 [row 4 - first half] + movu [r0 + 1736 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1681 * 16], m3 + + ; mode 29 [row 4 - second half] + movu [r0 + 1737 * 16], m3 + + ; mode 28 [row 10] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1684 * 16], m3 + pmaddubsw m3, m1, m6 + 
pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1685 * 16], m3 + + ; mode 29 [row 6] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1740 * 16], m3 + + ; mode 32 [row 2 - first half] + movu [r0 + 1924 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1741 * 16], m3 + + ; mode 32 [row 2 - second half] + movu [r0 + 1925 * 16], m3 + + ; mode 30 [row 2] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1796 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1797 * 16], m3 + + ; mode 31 [row 2] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1860 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1861 * 16], m3 + + ; mode 27 [row 15] + movu m0, [r3 + 3] + movd m1, [r3 + 4] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 11] + movd m3, [r3 + 12] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 19] + movd m3, [r3 + 20] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 27] + movd m5, [r3 + 28] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1662 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1662 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1663 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1663 * 16 + 8], m5 + + ; mode 28 [row 12] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1688 * 16], m3 + + ; mode 30 [row 4 - first half] + movu [r0 + 1800 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + 
pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1689 * 16], m3 + + ; mode 30 [row 4 - second half] + movu [r0 + 1801 * 16], m3 + + ; mode 28 [row 13] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1690 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1691 * 16], m3 + + ; mode 28 [row 14] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1692 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1693 * 16], m3 + + ; mode 28 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1694 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1695 * 16], m3 + + ; mode 28 [row 16] + movu m6, [r5 + 21 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1696 * 16], m3 + + ; mode 31 [row 4 - first half] + movu [r0 + 1864 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1697 * 16], m3 + + ; mode 31 [row 4 - second half] + movu [r0 + 1865 * 16], m3 + + ; mode 28 [row 17] + movu m6, [r5 + 26 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1698 * 16], m3 + + ; mode 29 [row 9 - first half] + movu [r0 + 1746 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1699 * 16], m3 + + ; mode 29 [row 9 - second half] + movu [r0 + 1747 * 16], m3 + + ; mode 28 [row 18] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, 
m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1700 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1701 * 16], m3 + + ; mode 29 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1742 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1743 * 16], m3 + + ; mode 29 [row 8] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1744 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1745 * 16], m3 + + ; mode 30 [row 5] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1802 * 16], m3 + + ; mode 33 [row 2 - first half] + movu [r0 + 1988 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1803 * 16], m3 + + ; mode 33 [row 2 - second half] + movu [r0 + 1989 * 16], m3 + + ; mode 30 [row 6] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1804 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1805 * 16], m3 + + ; mode 31 [row 3] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1862 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1863 * 16], m3 + + ; mode 32 [row 3] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1926 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1927 * 16], m3 + + ; mode 28 [row 19] + movu m6, [r5 + 4 * 16] + movu m0, [r3 + 4] + movd m1, [r3 + 5] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 12] + movd m4, [r3 + 13] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1702 * 16], m3 + + movu m1, [r3 + 20] + movd m3, [r3 + 21] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 28] + movd m5, [r3 + 29] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1703 * 16], m3 + + ; mode 28 [row 20] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1704 * 16], m3 + + ; mode 32 [row 4 - first half] + movu [r0 + 1928 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1705 * 16], m3 + + ; mode 32 [row 4 - second half] + movu [r0 + 1929 * 16], m3 + + ; mode 28 [row 21] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1706 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1707 * 16], m3 + + ; mode 28 [row 22] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1708 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1709 * 16], m3 + + ; mode 28 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + 
packuswb m3, m5 + movu [r0 + 1710 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1711 * 16], m3 + + ; mode 28 [row 24] + movu m6, [r5 + 29 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1712 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1713 * 16], m3 + + ; mode 29 [row 10] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1748 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1749 * 16], m3 + + ; mode 29 [row 11] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1750 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1751 * 16], m3 + + ; mode 29 [row 12] + movu m6, [r5 + 21 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1752 * 16], m3 + + ; mode 30 [row 8 -first half] + movu [r0 + 1808 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1753 * 16], m3 + + ; mode 30 [row 8 -second half] + movu [r0 + 1809 * 16], m3 + + ; mode 29 [row 13] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1754 * 16], m3 + + ; mode 32 [row 5 - first half] + movu [r0 + 1930 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1755 * 16], m3 + + ; mode 32 [row 5 - second half] + movu [r0 + 1931 * 16], m3 + + ; mode 30 [row 7] + movu m6, [r5 + 8 * 16] 
+ pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1806 * 16], m3 + + ; mode 33 [row 3 - first half] + movu [r0 + 1990 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1807 * 16], m3 + + ; mode 33 [row 3 - second half] + movu [r0 + 1991 * 16], m3 + + ; mode 31 [row 5] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1866 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1867 * 16], m3 + + ; mode 31 [row 6] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1868 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1869 * 16], m3 + + ; mode 28 [row 25] + movu m6, [r5 + 2 * 16] + movu m0, [r3 + 5] + movd m1, [r3 + 6] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 13] + movd m4, [r3 + 14] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1714 * 16], m3 + + movu m1, [r3 + 21] + movd m3, [r3 + 22] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 29] + movd m5, [r3 + 30] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1715 * 16], m3 + + ; mode 28 [row 26] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1716 * 16], m3 + + ; mode 29 [row 14 - first half] + movu [r0 + 1756 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1717 * 16], m3 + + ; 
mode 29 [row 14 - second half] + movu [r0 + 1757 * 16], m3 + + ; mode 28 [row 27] + movu m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1718 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1719 * 16], m3 + + ; mode 28 [row 28] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1720 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1721 * 16], m3 + + ; mode 28 [row 29] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1722 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1723 * 16], m3 + + ; mode 28 [row 30] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1724 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1725 * 16], m3 + + ; mode 29 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1758 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1759 * 16], m3 + + ; mode 29 [row 16] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1760 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1761 * 16], m3 + + ; mode 30 [row 9] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 
+ pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1810 * 16], m3 + + ; mode 33 [row 4 - first half] + movu [r0 + 1992 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1811 * 16], m3 + + ; mode 33 [row 4 - second half] + movu [r0 + 1993 * 16], m3 + + ; mode 30 [row 10] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1812 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1813 * 16], m3 + + ; mode 31 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1870 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1871 * 16], m3 + + ; mode 31 [row 8] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1872 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1873 * 16], m3 + + ; mode 32 [row 6] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1932 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1933 * 16], m3 + + ; mode 30 [row 11] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1814 * 16], m3 + + ; mode 33 [row 5 - first half] + movu [r0 + 1994 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1815 * 16], m3 + + ; mode 33 [row 5 - second half] + movu [r0 + 1995 * 16], m3 + + ; mode 28 [row 31] + 
movu m0, [r3 + 6] + movd m1, [r3 + 7] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 14] + movd m3, [r3 + 15] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 22] + movd m3, [r3 + 23] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 30] + movd m5, [r3 + 31] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1726 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1726 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1727 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1727 * 16 + 8], m5 + + ; mode 29 [row 17] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1762 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1763 * 16], m3 + + ; mode 29 [row 18] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1764 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1765 * 16], m3 + + ; mode 29 [row 19] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1766 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1767 * 16], m3 + + ; mode 29 [row 20] + movu m6, [r5 + 29 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1768 * 16], m3 + + ; mode 32 [row 8 - first halif] + movu [r0 + 1936 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1769 * 16], m3 + + ; mode 32 [row 8 - second halif] + movu [r0 + 1937 * 16], m3 + + ; mode 30 [row 12] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw 
m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1816 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1817 * 16], m3 + + ; mode 30 [row 13] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1818 * 16], m3 + + ; mode 33 [row 6 - first half] + movu [r0 + 1996 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1819 * 16], m3 + + ; mode 33 [row 6 - second half] + movu [r0 + 1997 * 16], m3 + + ; mode 31 [row 9] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1874 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1875 * 16], m3 + + ; mode 31 [row 10] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1876 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1877 * 16], m3 + + ; mode 32 [row 7] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1934 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1935 * 16], m3 + + ; mode 29 [row 21] + movu m6, [r5 + 6 * 16] + movu m0, [r3 + 7] + movd m1, [r3 + 8] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 15] + movd m4, [r3 + 16] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1770 * 16], m3 + + movu m1, [r3 + 23] + movd m3, [r3 + 24] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + 
pmulhrsw m3, m7 + movu m4, [r3 + 31] + movd m5, [r3 + 32] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1771 * 16], m3 + + ; mode 29 [row 22] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1772 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1773 * 16], m3 + + ; mode 29 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1774 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1775 * 16], m3 + + ; mode 30 [row 14] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1820 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1821 * 16], m3 + + ; mode 30 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1822 * 16], m3 + + ; mode 33 [row 7 - first half] + movu [r0 + 1998 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1823 * 16], m3 + + ; mode 33 [row 7 - second half] + movu [r0 + 1999 * 16], m3 + + ; mode 30 [row 16] + movu m6, [r5 + 29 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1824 * 16], m3 + + ; mode 31 [row 12 - first half] + movu [r0 + 1880 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1825 * 16], m3 + + ; mode 31 [row 12 - second half] + movu [r0 + 1881 * 16], m3 + + ; mode 31 [row 11] + movu 
m6, [r5 + 12 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1878 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1879 * 16], m3 + + ; mode 32 [row 9] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1938 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1939 * 16], m3 + + ; mode 29 [row 24] + movu m6, [r5 + 1 * 16] + movu m0, [r3 + 8] + movd m1, [r3 + 9] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 16] + movd m4, [r3 + 17] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1776 * 16], m3 + + movu m1, [r3 + 24] + movd m3, [r3 + 25] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 32] + movd m5, [r3 + 33] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1777 * 16], m3 + + ; mode 29 [row 25] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1778 * 16], m3 + + ; mode 30 [row 17 - first half] + movu [r0 + 1826 * 16], m3 + + ; mode 33 [row 8 - first half] + movu [r0 + 2000 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1779 * 16], m3 + + ; mode 30 [row 17 - second half] + movu [r0 + 1827 * 16], m3 + + ; mode 33 [row 8 - second half] + movu [r0 + 2001 * 16], m3 + + ; mode 29 [row 26] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1780 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw 
m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1781 * 16], m3 + + ; mode 29 [row 27] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1782 * 16], m3 + + ; mode 32 [row 11 - first half] + movu [r0 + 1942 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1783 * 16], m3 + + ; mode 32 [row 11 - second half] + movu [r0 + 1943 * 16], m3 + + ; mode 30 [row 18] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1828 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1829 * 16], m3 + + ; mode 31 [row 13] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1882 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1883 * 16], m3 + + ; mode 31 [row 14] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1884 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1885 * 16], m3 + + ; mode 32 [row 10] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1940 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1941 * 16], m3 + + ; mode 29 [row 28] + movu m6, [r5 + 5 * 16] + movu m0, [r3 + 9] + movd m1, [r3 + 10] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 17] + movd m4, [r3 + 18] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, 
m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1784 * 16], m3 + + movu m1, [r3 + 25] + movd m3, [r3 + 26] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 33] + movd m5, [r3 + 34] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1785 * 16], m3 + + ; mode 29 [row 29] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1786 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1787 * 16], m3 + + ; mode 29 [row 30] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1788 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1789 * 16], m3 + + ; mode 30 [row 19] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1830 * 16], m3 + + ; mode 33 [row 9 - first half] + movu [r0 + 2002 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1831 * 16], m3 + + ; mode 33 [row 9 - second half] + movu [r0 + 2003 * 16], m3 + + ; mode 30 [row 20] + movu m6, [r5 + 17 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1832 * 16], m3 + + ; mode 32 [row 12 - first half] + movu [r0 + 1944 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1833 * 16], m3 + + ; mode 32 [row 12 - second half] + movu [r0 + 1945 * 16], m3 + + ; mode 30 [row 21] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 
1834 * 16], m3 + + ; mode 33 [row 10 - first half] + movu [r0 + 2004 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1835 * 16], m3 + + ; mode 33 [row 10 - second half] + movu [r0 + 2005 * 16], m3 + + ; mode 31 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1886 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1887 * 16], m3 + + ; mode 29 [row 31] + movu m0, [r3 + 10] + movd m1, [r3 + 11] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 18] + movd m3, [r3 + 19] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 26] + movd m3, [r3 + 27] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 34] + movd m5, [r3 + 35] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1790 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1790 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1791 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1791 * 16 + 8], m5 + + ; mode 30 [row 22] + movu m6, [r5 + 11 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1836 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1837 * 16], m3 + + ; mode 30 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1838 * 16], m3 + + ; mode 33 [row 11 - first half] + movu [r0 + 2006 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1839 * 16], m3 + + ; mode 33 [row 11 - second half] + movu [r0 + 2007 * 16], m3 + + ; mode 31 [row 16] + movu m6, [r5 + 1 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, 
m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1888 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1889 * 16], m3 + + ; mode 31 [row 17] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1890 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1891 * 16], m3 + + ; mode 32 [row 13] + movu m6, [r5 + 6 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1946 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1947 * 16], m3 + + ; mode 32 [row 14] + movu m6, [r5 + 27 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1948 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1949 * 16], m3 + + ; mode 30 [row 24] + movu m6, [r5 + 5 * 16] + movu m0, [r3 + 11] + movd m1, [r3 + 12] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 19] + movd m4, [r3 + 20] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1840 * 16], m3 + + movu m1, [r3 + 27] + movd m3, [r3 + 28] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 35] + movd m5, [r3 + 36] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1841 * 16], m3 + + ; mode 30 [row 25] + movu m6, [r5 + 18 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1842 * 16], m3 + + ; mode 33 [row 12 - first half] + movu [r0 + 2008 * 16], m3 + + pmaddubsw 
m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1843 * 16], m3 + + ; mode 33 [row 12 - second half] + movu [r0 + 2009 * 16], m3 + + ; mode 30 [row 26] + movu m6, [r5 + 31 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1844 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1845 * 16], m3 + + ; mode 31 [row 18] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1892 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1893 * 16], m3 + + ; mode 31 [row 19] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1894 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1895 * 16], m3 + + ; mode 32 [row 15] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1950 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1951 * 16], m3 + + ; mode 30 [row 27] + movu m6, [r5 + 12 * 16] + movu m0, [r3 + 12] + movd m1, [r3 + 13] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 20] + movd m4, [r3 + 21] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1846 * 16], m3 + + ; mode 33 [row 13 - first half] + movu [r0 + 2010 * 16], m3 + + movu m1, [r3 + 28] + movd m3, [r3 + 29] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 36] + movd m5, [r3 + 37] + palignr m5, m4, 1 + 
punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1847 * 16], m3 + + ; mode 33 [row 13 - second half] + movu [r0 + 2011 * 16], m3 + + ; mode 30 [row 28] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1848 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1849 * 16], m3 + + ; mode 31 [row 20] + movu m6, [r5 + 5 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1896 * 16], m3 + + ; mode 32 [row 16 - first half] + movu [r0 + 1952 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1897 * 16], m3 + + ; mode 32 [row 16 - second half] + movu [r0 + 1953 * 16], m3 + + ; mode 31 [row 21] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1898 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1899 * 16], m3 + + ; mode 32 [row 17] + movu m6, [r5 + 26 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1954 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1955 * 16], m3 + + ; mode 30 [row 29] + movu m6, [r5 + 6 * 16] + movu m0, [r3 + 13] + movd m1, [r3 + 14] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 21] + movd m4, [r3 + 22] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1850 * 16], m3 + + ; mode 33 [row 14 - first half] + movu [r0 + 2012 * 16], m3 + + movu m1, [r3 + 29] + movd m3, [r3 + 30] + palignr m3, m1, 1 + punpcklbw 
m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 37] + movd m5, [r3 + 38] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1851 * 16], m3 + + ; mode 33 [row 14 - second half] + movu [r0 + 2013 * 16], m3 + + ; mode 30 [row 30] + movu m6, [r5 + 19 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1852 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1853 * 16], m3 + + ; mode 31 [row 22] + movu m6, [r5 + 7 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1900 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1901 * 16], m3 + + ; mode 31 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1902 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1903 * 16], m3 + + ; mode 32 [row 18] + movu m6, [r5 + 15 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1956 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1957 * 16], m3 + + ; mode 30 [row 31] + movu m0, [r3 + 14] + movd m1, [r3 + 15] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 22] + movd m3, [r3 + 23] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 30] + movd m3, [r3 + 31] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 38] + movd m5, [r3 + 39] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1854 * 16], m5 + + ; mode 33 [row 15 - first eight] + movh [r0 + 2014 * 16], m5 + + pshufb m5, m2, [tab_S2] + 
movh [r0 + 1854 * 16 + 8], m5 + + ; mode 33 [row 15 - second eight] + movh [r0 + 2014 * 16 + 8], m5 + + pshufb m5, m1, [tab_S2] + movh [r0 + 1855 * 16], m5 + + ; mode 33 [row 15 - third eight] + movh [r0 + 2015 * 16], m5 + + pshufb m5, m4, [tab_S2] + movh [r0 + 1855 * 16 + 8], m5 + + ; mode 33 [row 15 - fourth eight] + movh [r0 + 2015 * 16 + 8], m5 + + ; mode 31 [row 24] + movu m6, [r5 + 9 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1904 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1905 * 16], m3 + + ; mode 31 [row 25] + movu m6, [r5 + 26 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1906 * 16], m3 + + ; mode 33 [row 16 - first half] + movu [r0 + 2016 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1907 * 16], m3 + + ; mode 33 [row 16 - second half] + movu [r0 + 2017 * 16], m3 + + ; mode 32 [row 19] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1958 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1959 * 16], m3 + + ; mode 32 [row 20] + movu m6, [r5 + 25 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1960 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1961 * 16], m3 + + ; mode 31 [row 26] + movu m6, [r5 + 11 * 16] + movu m0, [r3 + 15] + movd m1, [r3 + 16] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 23] + movd m4, [r3 + 24] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + 
packuswb m3, m5 + movu [r0 + 1908 * 16], m3 + + movu m1, [r3 + 31] + movd m3, [r3 + 32] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 39] + movd m5, [r3 + 40] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1909 * 16], m3 + + ; mode 31 [row 27] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1910 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1911 * 16], m3 + + ; mode 32 [row 21] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1962 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1963 * 16], m3 + + ; mode 33 [row 17] + movu m6, [r5 + 20 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2018 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2019 * 16], m3 + + ; mode 31 [row 28] + movu m6, [r5 + 13 * 16] + movu m0, [r3 + 16] + movd m1, [r3 + 17] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 24] + movd m4, [r3 + 25] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1912 * 16], m3 + + movu m1, [r3 + 32] + movd m3, [r3 + 33] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 40] + movd m5, [r3 + 41] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1913 * 16], m3 + + ; mode 31 [row 29] + movu m6, [r5 + 30 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + 
pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1914 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1915 * 16], m3 + + ; mode 32 [row 22] + movu m6, [r5 + 3 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1964 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1965 * 16], m3 + + ; mode 32 [row 23] + movu m6, [r5 + 24 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1966 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1967 * 16], m3 + + ; mode 33 [row 18] + movu m6, [r5 + 14 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2020 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2021 * 16], m3 + + ; mode 31 [row 30] + movu m6, [r5 + 15 * 16] + movu m0, [r3 + 17] + movd m1, [r3 + 18] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 25] + movd m4, [r3 + 26] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1916 * 16], m3 + + movu m1, [r3 + 33] + movd m3, [r3 + 34] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 41] + movd m5, [r3 + 42] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1917 * 16], m3 + + ; mode 32 [row 24] + movu m6, [r5 + 13 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1968 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + 
packuswb m3, m5 + movu [r0 + 1969 * 16], m3 + + ; mode 33 [row 19] + movu m6, [r5 + 8 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2022 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2023 * 16], m3 + + ; mode 31 [row 31] + movu m0, [r3 + 18] + movd m1, [r3 + 19] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 26] + movd m3, [r3 + 27] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 34] + movd m3, [r3 + 35] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 42] + movd m5, [r3 + 43] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1918 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1918 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1919 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1919 * 16 + 8], m5 + + ; mode 32 [row 25] + movu m6, [r5 + 2 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1970 * 16], m3 + + ; mode 33 [row 20 - first half] + movu [r0 + 2024 * 16], m3 + + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1971 * 16], m3 + + ; mode 33 [row 20 - second half] + movu [r0 + 2025 * 16], m3 + + ; mode 32 [row 26] + movu m6, [r5 + 23 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1972 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1973 * 16], m3 + + ; mode 33 [row 21] + movu m6, [r5 + 28 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2026 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2027 * 16], m3 + + ; mode 32 [row 27] + movu m6, 
[r5 + 12 * 16] + movu m0, [r3 + 19] + movd m1, [r3 + 20] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 27] + movd m4, [r3 + 28] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1974 * 16], m3 + + movu m1, [r3 + 35] + movd m3, [r3 + 36] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 43] + movd m5, [r3 + 44] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1975 * 16], m3 + + ; mode 33 [row 22] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2028 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2029 * 16], m3 + + ; mode 32 [row 28] + movu m6, [r5 + 1 * 16] + movu m0, [r3 + 20] + movd m1, [r3 + 21] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 28] + movd m4, [r3 + 29] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1976 * 16], m3 + + movu m1, [r3 + 36] + movd m3, [r3 + 37] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 44] + movd m5, [r3 + 45] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1977 * 16], m3 + + ; mode 32 [row 29] + movu m6, [r5 + 22 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1978 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1979 * 16], m3 + + ; mode 33 [row 23] + movu m6, [r5 + 16 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 
2030 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2031 * 16], m3 + + ; mode 32 [row 30] + movu m6, [r5 + 11 * 16] + movu m0, [r3 + 21] + movd m1, [r3 + 22] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 29] + movd m4, [r3 + 30] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1980 * 16], m3 + + movu m1, [r3 + 37] + movd m3, [r3 + 38] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 45] + movd m5, [r3 + 46] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 1981 * 16], m3 + + ; mode 33 [row 24] + movu m6, [r5 + 10 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2032 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2033 * 16], m3 + + ; mode 32 [row 31] + movu m0, [r3 + 22] + movd m1, [r3 + 23] + palignr m1, m0, 1 + punpcklbw m0, m1 + movu m2, [r3 + 30] + movd m3, [r3 + 31] + palignr m3, m2, 1 + punpcklbw m2, m3 + movu m1, [r3 + 38] + movd m3, [r3 + 39] + palignr m3, m1, 1 + punpcklbw m1, m3 + movu m4, [r3 + 46] + movd m5, [r3 + 47] + palignr m5, m4, 1 + punpcklbw m4, m5 + + pshufb m5, m0, [tab_S2] + movh [r0 + 1982 * 16], m5 + pshufb m5, m2, [tab_S2] + movh [r0 + 1982 * 16 + 8], m5 + pshufb m5, m1, [tab_S2] + movh [r0 + 1983 * 16], m5 + pshufb m5, m4, [tab_S2] + movh [r0 + 1983 * 16 + 8], m5 + + ; mode 33 [row 25] + movu m6, [r5 + 4 * 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2034 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2035 * 16], m3 + + ; mode 33 [row 26] + movu m6, [r5 + 30 
* 16] + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2036 * 16], m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2037 * 16], m3 + + ; mode 33 [row 27] + movu m6, [r5 + 24 * 16] + movu m0, [r3 + 23] + movd m1, [r3 + 24] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 31] + movd m4, [r3 + 32] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2038 * 16], m3 + + movu m1, [r3 + 39] + movd m3, [r3 + 40] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 47] + movd m5, [r3 + 48] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2039 * 16], m3 + + ; mode 33 [row 28] + movu m6, [r5 + 18 * 16] + movu m0, [r3 + 24] + movd m1, [r3 + 25] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 32] + movd m4, [r3 + 33] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2040 * 16], m3 + + movu m1, [r3 + 40] + movd m3, [r3 + 41] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 48] + movd m5, [r3 + 49] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2041 * 16], m3 + + ; mode 33 [row 29] + movu m6, [r5 + 12 * 16] + movu m0, [r3 + 25] + movd m1, [r3 + 26] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 33] + movd m4, [r3 + 34] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2042 * 16], m3 + + movu m1, [r3 + 41] + movd m3, [r3 + 42] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, 
[r3 + 49] + movd m5, [r3 + 50] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2043 * 16], m3 + + ; mode 33 [row 30] + movu m6, [r5 + 6 * 16] + movu m0, [r3 + 26] + movd m1, [r3 + 27] + palignr m1, m0, 1 + punpcklbw m0, m1 + pmaddubsw m3, m0, m6 + pmulhrsw m3, m7 + movu m2, [r3 + 34] + movd m4, [r3 + 35] + palignr m4, m2, 1 + punpcklbw m2, m4 + pmaddubsw m5, m2, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2044 * 16], m3 + + movu m1, [r3 + 42] + movd m3, [r3 + 43] + palignr m3, m1, 1 + punpcklbw m1, m3 + pmaddubsw m3, m1, m6 + pmulhrsw m3, m7 + movu m4, [r3 + 50] + movd m5, [r3 + 51] + palignr m5, m4, 1 + punpcklbw m4, m5 + pmaddubsw m5, m4, m6 + pmulhrsw m5, m7 + packuswb m3, m5 + movu [r0 + 2045 * 16], m3 + + ; mode 33 [row 31] + movu m5, [r3 + 27] + movu [r0 + 2046 * 16], m5 + movu m5, [r3 + 43] + movu [r0 + 2047 * 16], m5 + + ;mode 34 [row 0] + movu m0, [r3 + 2] + movu [r0 + 2048 * 16], m0 + movu m1, [r3 + 18] + movu [r0 + 2049 * 16], m1 + + ;mode 34 [row 1] + movu m2, [r3 + 34] + palignr m3, m1, m0, 1 + movu [r0 + 2050 * 16], m3 + palignr m4, m2, m1, 1 + movu [r0 + 2051 * 16], m4 + + ;mode 34 [row 2] + palignr m3, m1, m0, 2 + movu [r0 + 2052 * 16], m3 + palignr m4, m2, m1, 2 + movu [r0 + 2053 * 16], m4 + + ;mode 34 [row 3] + palignr m3, m1, m0, 3 + movu [r0 + 2054 * 16], m3 + palignr m4, m2, m1, 3 + movu [r0 + 2055 * 16], m4 + + ;mode 34 [row 4] + palignr m3, m1, m0, 4 + movu [r0 + 2056 * 16], m3 + palignr m4, m2, m1, 4 + movu [r0 + 2057 * 16], m4 + + ;mode 34 [row 5] + palignr m3, m1, m0, 5 + movu [r0 + 2058 * 16], m3 + palignr m4, m2, m1, 5 + movu [r0 + 2059 * 16], m4 + + ;mode 34 [row 6] + palignr m3, m1, m0, 6 + movu [r0 + 2060 * 16], m3 + palignr m4, m2, m1, 6 + movu [r0 + 2061 * 16], m4 + + ;mode 34 [row 7] + palignr m3, m1, m0, 7 + movu [r0 + 2062 * 16], m3 + palignr m4, m2, m1, 7 + movu [r0 + 2063 * 16], m4 + + ;mode 34 [row 8] + palignr m3, m1, m0, 8 + movu [r0 + 2064 * 16], m3 + 
palignr m4, m2, m1, 8 + movu [r0 + 2065 * 16], m4 + + ;mode 34 [row 9] + palignr m3, m1, m0, 9 + movu [r0 + 2066 * 16], m3 + palignr m4, m2, m1, 9 + movu [r0 + 2067 * 16], m4 + + ;mode 34 [row 10] + palignr m3, m1, m0, 10 + movu [r0 + 2068 * 16], m3 + palignr m4, m2, m1, 10 + movu [r0 + 2069 * 16], m4 + + ;mode 34 [row 11] + palignr m3, m1, m0, 11 + movu [r0 + 2070 * 16], m3 + palignr m4, m2, m1, 11 + movu [r0 + 2071 * 16], m4 + + ;mode 34 [row 12] + palignr m3, m1, m0, 12 + movu [r0 + 2072 * 16], m3 + palignr m4, m2, m1, 12 + movu [r0 + 2073 * 16], m4 + + ;mode 34 [row 13] + palignr m3, m1, m0, 13 + movu [r0 + 2074 * 16], m3 + palignr m4, m2, m1, 13 + movu [r0 + 2075 * 16], m4 + + ;mode 34 [row 14] + palignr m3, m1, m0, 14 + movu [r0 + 2076 * 16], m3 + palignr m4, m2, m1, 14 + movu [r0 + 2077 * 16], m4 + + ;mode 34 [row 15] + palignr m3, m1, m0, 15 + movu [r0 + 2078 * 16], m3 + palignr m4, m2, m1, 15 + movu [r0 + 2079 * 16], m4 + + ;mode 34 [row 16] + palignr m3, m1, m0, 16 + movu [r0 + 2080 * 16], m3 + palignr m4, m2, m1, 16 + movu [r0 + 2081 * 16], m4 + + ;mode 34 [row 17] + movu m0, [r3 + 19] + movu [r0 + 2082 * 16], m0 + movu m1, [r3 + 35] + movu [r0 + 2083 * 16], m1 + + mov r2d, r6d + mov [r4], r2b + mov r2d, [rsp] + mov [r1 + 64], r2b + + ;mode 34 [row 18] + movu m2, [r3 + 51] + palignr m3, m1, m0, 1 + movu [r0 + 2084 * 16], m3 + palignr m4, m2, m1, 1 + movu [r0 + 2085 * 16], m4 + + ;mode 34 [row 19] + palignr m3, m1, m0, 2 + movu [r0 + 2086 * 16], m3 + palignr m4, m2, m1, 2 + movu [r0 + 2087 * 16], m4 + + ;mode 34 [row 20] + palignr m3, m1, m0, 3 + movu [r0 + 2088 * 16], m3 + palignr m4, m2, m1, 3 + movu [r0 + 2089 * 16], m4 + + ;mode 34 [row 21] + palignr m3, m1, m0, 4 + movu [r0 + 2090 * 16], m3 + palignr m4, m2, m1, 4 + movu [r0 + 2091 * 16], m4 + + ;mode 34 [row 22] + palignr m3, m1, m0, 5 + movu [r0 + 2092 * 16], m3 + palignr m4, m2, m1, 5 + movu [r0 + 2093 * 16], m4 + + ;mode 34 [row 23] + palignr m3, m1, m0, 6 + movu [r0 + 2094 * 16], m3 + palignr 
m4, m2, m1, 6 + movu [r0 + 2095 * 16], m4 + + ;mode 34 [row 24] + palignr m3, m1, m0, 7 + movu [r0 + 2096 * 16], m3 + palignr m4, m2, m1, 7 + movu [r0 + 2097 * 16], m4 + + ;mode 34 [row 25] + palignr m3, m1, m0, 8 + movu [r0 + 2098 * 16], m3 + palignr m4, m2, m1, 8 + movu [r0 + 2099 * 16], m4 + + ;mode 34 [row 26] + palignr m3, m1, m0, 9 + movu [r0 + 2100 * 16], m3 + palignr m4, m2, m1, 9 + movu [r0 + 2101 * 16], m4 + + ;mode 34 [row 27] + palignr m3, m1, m0, 10 + movu [r0 + 2102 * 16], m3 + palignr m4, m2, m1, 10 + movu [r0 + 2103 * 16], m4 + + ;mode 34 [row 28] + palignr m3, m1, m0, 11 + movu [r0 + 2104 * 16], m3 + palignr m4, m2, m1, 11 + movu [r0 + 2105 * 16], m4 + + ;mode 34 [row 29] + palignr m3, m1, m0, 12 + movu [r0 + 2106 * 16], m3 + palignr m4, m2, m1, 12 + movu [r0 + 2107 * 16], m4 + + ;mode 34 [row 30] + palignr m3, m1, m0, 13 + movu [r0 + 2108 * 16], m3 + palignr m4, m2, m1, 13 + movu [r0 + 2109 * 16], m4 + + ;mode 34 [row 31] + palignr m3, m1, m0, 14 + movu [r0 + 2110 * 16], m3 + palignr m4, m2, m1, 14 + movu [r0 + 2111 * 16], m4 + RET diff -Nru x265-1.5/source/common/x86/intrapred8.asm x265-1.6/source/common/x86/intrapred8.asm --- x265-1.5/source/common/x86/intrapred8.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/intrapred8.asm 2015-04-02 16:46:36.000000000 +0000 @@ -2,6 +2,7 @@ ;* Copyright (C) 2013 x265 project ;* ;* Authors: Min Chen +;* Praveen Kumar Tiwari ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -26,11 +27,15 @@ SECTION_RODATA 32 +intra_pred_shuff_0_8: times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + pb_0_8 times 8 db 0, 8 pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8 pb_swap8: times 2 db 7, 6, 5, 4, 3, 2, 1, 0 c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 -tab_Si: db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 +const tab_S1, db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 
0, 0 +const tab_S2, db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0 +const tab_Si, db 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 pb_fact0: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 c_mode32_12_0: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 7, 0 c_mode32_13_0: db 3, 6, 10, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 @@ -43,7 +48,6 @@ c_mode32_18_0: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 c_shuf8_0: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 c_deinterval8: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 -tab_S1: db 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0 pb_unpackbq: db 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 c_mode16_12: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13, 6 c_mode16_13: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4 @@ -52,8 +56,327 @@ c_mode16_16: db 8, 6, 5, 3, 2, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2 c_mode16_17: db 4, 2, 1, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1 c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 -tab_S2: db 0, 1, 3, 5, 7, 9, 11, 13, 0, 0, 0, 0, 0, 0, 0, 0 +ALIGN 32 +trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 +c_ang8_src1_9_2_10: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 +c_ang8_26_20: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 +c_ang8_src3_11_4_12: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 +c_ang8_14_8: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 +c_ang8_src5_13_5_13: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +c_ang8_2_28: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 +c_ang8_src6_14_7_15: db 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 +c_ang8_22_16: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +c_ang8_21_10 : db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 +c_ang8_src2_10_3_11: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +c_ang8_31_20: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 +c_ang8_src4_12_4_12: times 2 db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 +c_ang8_9_30: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 +c_ang8_src5_13_6_14: db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13 +c_ang8_19_8: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + +c_ang8_17_2: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 +c_ang8_19_4: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 +c_ang8_21_6: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 +c_ang8_23_8: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, +c_ang8_src4_12_5_13: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 + +c_ang8_13_26: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 +c_ang8_7_20: db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 
12, 20, 12, 20, 12, 20, 12, 20 +c_ang8_1_14: db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 +c_ang8_27_8: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 +c_ang8_src2_10_2_10: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 +c_ang8_src3_11_3_11: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 + +c_ang8_31_8: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 +c_ang8_13_22: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 +c_ang8_27_4: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 +c_ang8_9_18: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + +c_ang8_5_10: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 +c_ang8_15_20: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 +c_ang8_25_30: db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 +c_ang8_3_8: db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + +c_ang8_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + 
db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +c_ang8_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +c_ang8_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 + db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + +ALIGN 32 +c_ang16_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + + +ALIGN 32 +c_ang16_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 + db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + + +ALIGN 32 +c_ang16_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 
26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + +ALIGN 32 +intra_pred_shuff_0_15: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 15 + + +ALIGN 32 +c_ang16_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + + +ALIGN 32 +c_ang16_mode_30: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 5, 27, 5, 
27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 + db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + + +ALIGN 32 +c_ang16_mode_31: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +ALIGN 32 +c_ang16_mode_32: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 
21, 11, 21, 11, 21 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +ALIGN 32 +c_ang16_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 
10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + +ALIGN 32 +c_ang16_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 + db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +ALIGN 32 +c_ang16_mode_23: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5 + db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 + db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 + db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + +ALIGN 32 +c_ang16_mode_22: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 + db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 + db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 
4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + + +ALIGN 32 +c_ang32_mode_27: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 
20, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + + +ALIGN 32 +c_ang32_mode_28: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8 + db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6 + db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 3, 29, 3, 29, 3, 29, 
3, 29, 3, 29, 3, 29, 3, 29, 3, 29 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + +ALIGN 32 +c_ang32_mode_29: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 + db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 + db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 
29, 3, 29, 3, 29 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 + db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10 + db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + + +ALIGN 32 +c_ang32_mode_30: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 + db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 3, 
29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + + +ALIGN 32 +c_ang32_mode_31: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 20, 12, 20, 12, 20, 12, 20, 
12, 20, 12, 20, 12, 20, 12, 20, 12, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + + +ALIGN 32 +c_ang32_mode_32: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21 + db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31 + db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20 + db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30 + db 13, 19, 13, 
19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19 + db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29 + db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18 + db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28 + db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17 + db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27 + db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 + db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26 + db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15 + db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25 + db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14 + db 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24 + db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13 + db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23 + db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12 + db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22 + db 21, 11, 21, 11, 21, 
11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11 + db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0 + +ALIGN 32 ;; (blkSize - 1 - x) pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0 pw_planar4_1: dw 3, 3, 3, 3, 3, 3, 3, 3 @@ -65,6 +388,7 @@ pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24 pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16 + const ang_table %assign x 0 %rep 32 @@ -72,13 +396,24 @@ %assign x x+1 %endrep +const pw_ang_table +%assign x 0 +%rep 32 + times 4 dw (32-x), x +%assign x x+1 +%endrep + SECTION .text +cextern pw_2 cextern pw_4 cextern pw_8 cextern pw_16 cextern pw_32 +cextern pw_257 cextern pw_1024 +cextern pw_4096 +cextern pw_00ff cextern pb_unpackbd1 cextern multiL cextern multiH @@ -89,22 +424,21 @@ ;--------------------------------------------------------------------------------------------- ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) ;--------------------------------------------------------------------------------------------- -INIT_XMM sse4 +INIT_XMM sse2 cglobal intra_pred_dc4, 5,5,3 inc r2 pxor m0, m0 - movd m1, [r2] - movd m2, [r2 + 8] - punpckldq m1, m2 + movu m1, [r2] + pshufd m1, m1, 0xF8 psadbw m1, m0 ; m1 = sum test r4d, r4d - mov r4d, 4096 - movd m2, r4d - pmulhrsw m1, m2 ; m1 = (sum + 4) / 8 + paddw m1, [pw_4] + psraw m1, 3 movd r4d, m1 ; r4d = dc_val - pshufb m1, m0 ; m1 = byte [dc_val ...] 
+ pmullw m1, [pw_257] + pshuflw m1, m1, 0x00 ; store DC 4x4 lea r3, [r1 * 3] @@ -121,7 +455,8 @@ pshuflw m1, m1, 0 ; m1 = pixDCx3 ; filter top - pmovzxbw m2, [r2] + movd m2, [r2] + punpcklbw m2, m0 paddw m2, m1 psraw m2, 2 packuswb m2, m2 @@ -137,92 +472,111 @@ ; filter left add r0, r1 - pmovzxbw m2, [r2 + 9] + movq m2, [r2 + 9] + punpcklbw m2, m0 paddw m2, m1 psraw m2, 2 packuswb m2, m2 - pextrb [r0], m2, 0 - pextrb [r0 + r1], m2, 1 - pextrb [r0 + r1 * 2], m2, 2 - +%if ARCH_X86_64 + movq r4, m2 + mov [r0], r4b + shr r4, 8 + mov [r0 + r1], r4b + shr r4, 8 + mov [r0 + r1 * 2], r4b +%else + movd r2d, m2 + mov [r0], r2b + shr r2, 8 + mov [r0 + r1], r2b + shr r2, 8 + mov [r0 + r1 * 2], r2b +%endif .end: RET ;--------------------------------------------------------------------------------------------- ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) ;--------------------------------------------------------------------------------------------- -INIT_XMM sse4 +INIT_XMM sse2 cglobal intra_pred_dc8, 5, 7, 3 - lea r3, [r2 + 17] - inc r2 pxor m0, m0 - movh m1, [r2] - movh m2, [r3] + movh m1, [r2 + 1] + movh m2, [r2 + 17] punpcklqdq m1, m2 psadbw m1, m0 pshufd m2, m1, 2 paddw m1, m2 - movd r5d, m1 - add r5d, 8 - shr r5d, 4 ; sum = sum / 16 - movd m1, r5d - pshufb m1, m0 ; m1 = byte [dc_val ...] + paddw m1, [pw_8] + psraw m1, 4 + pmullw m1, [pw_257] + pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] 
test r4d, r4d ; store DC 8x8 - mov r6, r0 - movh [r0], m1 - movh [r0 + r1], m1 - lea r0, [r0 + r1 * 2] - movh [r0], m1 - movh [r0 + r1], m1 - lea r0, [r0 + r1 * 2] - movh [r0], m1 - movh [r0 + r1], m1 - lea r0, [r0 + r1 * 2] + lea r6, [r1 + r1 * 2] + lea r5, [r6 + r1 * 2] movh [r0], m1 movh [r0 + r1], m1 + movh [r0 + r1 * 2], m1 + movh [r0 + r6], m1 + movh [r0 + r1 * 4], m1 + movh [r0 + r5], m1 + movh [r0 + r6 * 2], m1 + lea r5, [r5 + r1 * 2] + movh [r0 + r5], m1 ; Do DC Filter jz .end - lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 - add r5d, r4d ; r5d = DC * 3 + 2 - movd m1, r5d - pshuflw m1, m1, 0 ; m1 = pixDCx3 + psrlw m1, 8 + movq m2, [pw_2] + pmullw m2, m1 + paddw m2, [pw_2] + movd r4d, m2 ; r4d = DC * 2 + 2 + paddw m1, m2 ; m1 = DC * 3 + 2 pshufd m1, m1, 0 ; filter top - pmovzxbw m2, [r2] + movq m2, [r2 + 1] + punpcklbw m2, m0 paddw m2, m1 - psraw m2, 2 + psraw m2, 2 ; sum = sum / 16 packuswb m2, m2 - movh [r6], m2 + movh [r0], m2 ; filter top-left - movzx r5d, byte [r3] - add r4d, r5d - movzx r3d, byte [r2] + movzx r3d, byte [r2 + 17] + add r4d, r3d + movzx r3d, byte [r2 + 1] add r3d, r4d shr r3d, 2 - mov [r6], r3b + mov [r0], r3b ; filter left - add r6, r1 - pmovzxbw m2, [r2 + 17] + movq m2, [r2 + 18] + punpcklbw m2, m0 paddw m2, m1 psraw m2, 2 packuswb m2, m2 - pextrb [r6], m2, 0 - pextrb [r6 + r1], m2, 1 - pextrb [r6 + 2 * r1], m2, 2 - lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m2, 3 - pextrb [r6 + r1 * 2], m2, 4 - pextrb [r6 + r1 * 4], m2, 6 - lea r1, [r1 * 3] - pextrb [r6 + r1], m2, 5 + movd r2d, m2 + lea r0, [r0 + r1] + lea r5, [r6 + r1 * 2] + mov [r0], r2b + shr r2, 8 + mov [r0 + r1], r2b + shr r2, 8 + mov [r0 + r1 * 2], r2b + shr r2, 8 + mov [r0 + r6], r2b + pshufd m2, m2, 0x01 + movd r2d, m2 + mov [r0 + r1 * 4], r2b + shr r2, 8 + mov [r0 + r5], r2b + shr r2, 8 + mov [r0 + r6 * 2], r2b .end: RET @@ -230,28 +584,53 @@ ;-------------------------------------------------------------------------------------------- ; void intra_pred_dc(pixel* dst, intptr_t 
dstStride, pixel *srcPix, int dirMode, int bFilter) ;-------------------------------------------------------------------------------------------- -INIT_XMM sse4 +INIT_XMM sse2 +%if ARCH_X86_64 +cglobal intra_pred_dc16, 5, 10, 4 +%else cglobal intra_pred_dc16, 5, 7, 4 - lea r3, [r2 + 33] - inc r2 +%endif pxor m0, m0 - movu m1, [r2] - movu m2, [r3] + movu m1, [r2 + 1] + movu m2, [r2 + 33] psadbw m1, m0 psadbw m2, m0 paddw m1, m2 pshufd m2, m1, 2 paddw m1, m2 - movd r5d, m1 - add r5d, 16 - shr r5d, 5 ; sum = sum / 32 - movd m1, r5d - pshufb m1, m0 ; m1 = byte [dc_val ...] + paddw m1, [pw_16] + psraw m1, 5 + pmullw m1, [pw_257] + pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] + pshufd m1, m1, 0x00 + test r4d, r4d ; store DC 16x16 +%if ARCH_X86_64 + lea r6, [r1 + r1 * 2] ;index 3 + lea r7, [r1 + r1 * 4] ;index 5 + lea r8, [r6 + r1 * 4] ;index 7 + lea r9, [r0 + r8] ;base + 7 + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + r1 * 2], m1 + movu [r0 + r6], m1 + movu [r0 + r1 * 4], m1 + movu [r0 + r7], m1 + movu [r0 + r6 * 2], m1 + movu [r0 + r8], m1 + movu [r0 + r1 * 8], m1 + movu [r9 + r1 * 2], m1 + movu [r0 + r7 * 2], m1 + movu [r9 + r1 * 4], m1 + movu [r0 + r6 * 4], m1 + movu [r9 + r6 * 2], m1 + movu [r0 + r8 * 2], m1 + movu [r9 + r1 * 8], m1 +%else ;32 bit mov r6, r0 movu [r0], m1 movu [r0 + r1], m1 @@ -276,83 +655,142 @@ lea r0, [r0 + r1 * 2] movu [r0], m1 movu [r0 + r1], m1 - +%endif ; Do DC Filter jz .end - lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 - add r5d, r4d ; r5d = DC * 3 + 2 - movd m1, r5d - pshuflw m1, m1, 0 ; m1 = pixDCx3 - pshufd m1, m1, 0 + psrlw m1, 8 + mova m2, [pw_2] + pmullw m2, m1 + paddw m2, [pw_2] + movd r4d, m2 + paddw m1, m2 ; filter top - pmovzxbw m2, [r2] + movh m2, [r2 + 1] + punpcklbw m2, m0 paddw m2, m1 psraw m2, 2 packuswb m2, m2 - movh [r6], m2 - pmovzxbw m3, [r2 + 8] + movh m3, [r2 + 9] + punpcklbw m3, m0 paddw m3, m1 psraw m3, 2 packuswb m3, m3 - movh [r6 + 8], m3 ; filter top-left - movzx r5d, byte [r3] + movzx r5d, byte [r2 + 33] add 
r4d, r5d - movzx r3d, byte [r2] + movzx r3d, byte [r2 + 1] add r3d, r4d shr r3d, 2 + +%if ARCH_X86_64 + movh [r0], m2 + movh [r0 + 8], m3 + mov [r0], r3b +%else ;32 bit + movh [r6], m2 + movh [r6 + 8], m3 mov [r6], r3b + add r6, r1 +%endif ; filter left - add r6, r1 - pmovzxbw m2, [r2 + 33] + movh m2, [r2 + 34] + punpcklbw m2, m0 paddw m2, m1 psraw m2, 2 packuswb m2, m2 - pextrb [r6], m2, 0 - pextrb [r6 + r1], m2, 1 - pextrb [r6 + r1 * 2], m2, 2 - lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m2, 3 - pextrb [r6 + r1 * 2], m2, 4 - lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m2, 5 - pextrb [r6 + r1 * 2], m2, 6 - lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m2, 7 - pmovzxbw m3, [r2 + 41] + movh m3, [r2 + 42] + punpcklbw m3, m0 paddw m3, m1 psraw m3, 2 packuswb m3, m3 - pextrb [r6 + r1 * 2], m3, 0 +%if ARCH_X86_64 + movh r3, m2 + mov [r0 + r1], r3b + shr r3, 8 + mov [r0 + r1 * 2], r3b + shr r3, 8 + mov [r0 + r6], r3b + shr r3, 8 + mov [r0 + r1 * 4], r3b + shr r3, 8 + mov [r0 + r7], r3b + shr r3, 8 + mov [r0 + r6 * 2], r3b + shr r3, 8 + mov [r0 + r8], r3b + shr r3, 8 + mov [r0 + r1 * 8], r3b + movh r3, m3 + mov [r9 + r1 * 2], r3b + shr r3, 8 + mov [r0 + r7 * 2], r3b + shr r3, 8 + mov [r9 + r1 * 4], r3b + shr r3, 8 + mov [r0 + r6 * 4], r3b + shr r3, 8 + mov [r9 + r6 * 2], r3b + shr r3, 8 + mov [r0 + r8 * 2], r3b + shr r3, 8 + mov [r9 + r1 * 8], r3b +%else ;32 bit + movd r2d, m2 + pshufd m2, m2, 0x01 + mov [r6], r2b + shr r2, 8 + mov [r6 + r1], r2b + shr r2, 8 + mov [r6 + r1 * 2], r2b lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m3, 1 - pextrb [r6 + r1 * 2], m3, 2 + shr r2, 8 + mov [r6 + r1], r2b + movd r2d, m2 + mov [r6 + r1 * 2], r2b lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m3, 3 - pextrb [r6 + r1 * 2], m3, 4 + shr r2, 8 + mov [r6 + r1], r2b + shr r2, 8 + mov [r6 + r1 * 2], r2b lea r6, [r6 + r1 * 2] - pextrb [r6 + r1], m3, 5 - pextrb [r6 + r1 * 2], m3, 6 - + shr r2, 8 + mov [r6 + r1], r2b + movd r2d, m3 + pshufd m3, m3, 0x01 + mov [r6 + r1 * 2], r2b + lea r6, [r6 + r1 * 2] + shr 
r2, 8 + mov [r6 + r1], r2b + shr r2, 8 + mov [r6 + r1 * 2], r2b + lea r6, [r6 + r1 * 2] + shr r2, 8 + mov [r6 + r1], r2b + movd r2d, m3 + mov [r6 + r1 * 2], r2b + lea r6, [r6 + r1 * 2] + shr r2, 8 + mov [r6 + r1], r2b + shr r2, 8 + mov [r6 + r1 * 2], r2b +%endif .end: RET ;--------------------------------------------------------------------------------------------- ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) ;--------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal intra_pred_dc32, 3, 5, 5 - lea r3, [r2 + 65] - inc r2 +INIT_XMM sse2 +cglobal intra_pred_dc32, 3, 3, 5 pxor m0, m0 - movu m1, [r2] - movu m2, [r2 + 16] - movu m3, [r3] - movu m4, [r3 + 16] + movu m1, [r2 + 1] + movu m2, [r2 + 17] + movu m3, [r2 + 65] + movu m4, [r2 + 81] psadbw m1, m0 psadbw m2, m0 psadbw m3, m0 @@ -363,65 +801,36 @@ pshufd m2, m1, 2 paddw m1, m2 - movd r4d, m1 - add r4d, 32 - shr r4d, 6 ; sum = sum / 64 - movd m1, r4d - pshufb m1, m0 ; m1 = byte [dc_val ...] + paddw m1, [pw_32] + psraw m1, 6 + pmullw m1, [pw_257] + pshuflw m1, m1, 0x00 ; m1 = byte [dc_val ...] 
+ pshufd m1, m1, 0x00 -%rep 2 +%assign x 0 +%rep 16 ; store DC 16x16 - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 - lea r0, [r0 + 2 * r1] - movu [r0], m1 - movu [r0 + r1], m1 - movu [r0 + 16], m1 - movu [r0 + r1 + 16],m1 + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16], m1 +%if x < 16 lea r0, [r0 + 2 * r1] +%endif +%assign x x+1 %endrep - RET ;--------------------------------------------------------------------------------------- ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal intra_pred_planar4, 3,3,7 - pmovzxbw m1, [r2 + 1] - pmovzxbw m2, [r2 + 9] +INIT_XMM sse2 +cglobal intra_pred_planar4, 3,3,5 + pxor m0, m0 + movh m1, [r2 + 1] + punpcklbw m1, m0 + movh m2, [r2 + 9] + punpcklbw m2, m0 pshufhw m3, m1, 0 ; topRight pshufd m3, m3, 0xAA pshufhw m4, m2, 0 ; bottomLeft @@ -429,72 +838,73 @@ pmullw m3, [multi_2Row] ; (x + 1) * topRight pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x] - mova m6, [pw_planar4_0] paddw m3, [pw_4] paddw m3, m4 paddw m3, m0 psubw m4, m1 - pshuflw m5, m2, 0 - pmullw m5, m6 - paddw m5, m3 + pshuflw m1, m2, 0 + pmullw m1, [pw_planar4_0] + paddw m1, m3 paddw m3, m4 - psraw m5, 3 - 
packuswb m5, m5 - movd [r0], m5 - - pshuflw m5, m2, 01010101b - pmullw m5, m6 - paddw m5, m3 + psraw m1, 3 + packuswb m1, m1 + movd [r0], m1 + + pshuflw m1, m2, 01010101b + pmullw m1, [pw_planar4_0] + paddw m1, m3 paddw m3, m4 - psraw m5, 3 - packuswb m5, m5 - movd [r0 + r1], m5 + psraw m1, 3 + packuswb m1, m1 + movd [r0 + r1], m1 lea r0, [r0 + 2 * r1] - pshuflw m5, m2, 10101010b - pmullw m5, m6 - paddw m5, m3 - paddw m3, m4 - psraw m5, 3 - packuswb m5, m5 - movd [r0], m5 - - pshuflw m5, m2, 11111111b - pmullw m5, m6 - paddw m5, m3 + pshuflw m1, m2, 10101010b + pmullw m1, [pw_planar4_0] + paddw m1, m3 paddw m3, m4 - psraw m5, 3 - packuswb m5, m5 - movd [r0 + r1], m5 + psraw m1, 3 + packuswb m1, m1 + movd [r0], m1 + + pshuflw m1, m2, 11111111b + pmullw m1, [pw_planar4_0] + paddw m1, m3 + psraw m1, 3 + packuswb m1, m1 + movd [r0 + r1], m1 RET ;--------------------------------------------------------------------------------------- ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal intra_pred_planar8, 3,3,7 - pmovzxbw m1, [r2 + 1] - pmovzxbw m2, [r2 + 17] +INIT_XMM sse2 +cglobal intra_pred_planar8, 3,3,6 + pxor m0, m0 + movh m1, [r2 + 1] + punpcklbw m1, m0 + movh m2, [r2 + 17] + punpcklbw m2, m0 movd m3, [r2 + 9] ; topRight = above[8]; movd m4, [r2 + 25] ; bottomLeft = left[8]; - pxor m0, m0 - pshufb m3, m0 - pshufb m4, m0 - punpcklbw m3, m0 ; v_topRight - punpcklbw m4, m0 ; v_bottomLeft + pand m3, [pw_00ff] + pand m4, [pw_00ff] + pshuflw m3, m3, 0x00 + pshuflw m4, m4, 0x00 + pshufd m3, m3, 0x44 + pshufd m4, m4, 0x44 pmullw m3, [multiL] ; (x + 1) * topRight pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x] - mova m6, [pw_planar8_0] paddw m3, [pw_8] paddw m3, m4 paddw m3, m0 psubw m4, m1 -%macro INTRA_PRED_PLANAR8 1 +%macro INTRA_PRED_PLANAR_8 1 %if (%1 < 4) pshuflw m5, m2, 0x55 * %1 pshufd m5, m5, 0 @@ -502,41 
+912,46 @@ pshufhw m5, m2, 0x55 * (%1 - 4) pshufd m5, m5, 0xAA %endif - pmullw m5, m6 + pmullw m5, [pw_planar8_0] paddw m5, m3 - paddw m3, m4 psraw m5, 4 packuswb m5, m5 movh [r0], m5 +%if (%1 < 7) + paddw m3, m4 lea r0, [r0 + r1] +%endif %endmacro - INTRA_PRED_PLANAR8 0 - INTRA_PRED_PLANAR8 1 - INTRA_PRED_PLANAR8 2 - INTRA_PRED_PLANAR8 3 - INTRA_PRED_PLANAR8 4 - INTRA_PRED_PLANAR8 5 - INTRA_PRED_PLANAR8 6 - INTRA_PRED_PLANAR8 7 + INTRA_PRED_PLANAR_8 0 + INTRA_PRED_PLANAR_8 1 + INTRA_PRED_PLANAR_8 2 + INTRA_PRED_PLANAR_8 3 + INTRA_PRED_PLANAR_8 4 + INTRA_PRED_PLANAR_8 5 + INTRA_PRED_PLANAR_8 6 + INTRA_PRED_PLANAR_8 7 RET ;--------------------------------------------------------------------------------------- ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal intra_pred_planar16, 3,3,8 - pmovzxbw m2, [r2 + 1] - pmovzxbw m7, [r2 + 9] +INIT_XMM sse2 +cglobal intra_pred_planar16, 3,5,8 + pxor m0, m0 + movh m2, [r2 + 1] + punpcklbw m2, m0 + movh m7, [r2 + 9] + punpcklbw m7, m0 movd m3, [r2 + 17] ; topRight = above[16] movd m6, [r2 + 49] ; bottomLeft = left[16] - - pxor m0, m0 - pshufb m3, m0 - pshufb m6, m0 - punpcklbw m3, m0 ; v_topRight - punpcklbw m6, m0 ; v_bottomLeft + pand m3, [pw_00ff] + pand m6, [pw_00ff] + pshuflw m3, m3, 0x00 + pshuflw m6, m6, 0x00 + pshufd m3, m3, 0x44 ; v_topRight + pshufd m6, m6, 0x44 ; v_bottomLeft pmullw m4, m3, [multiH] ; (x + 1) * topRight pmullw m3, [multiL] ; (x + 1) * topRight @@ -551,10 +966,12 @@ psubw m1, m6, m7 psubw m6, m2 - pmovzxbw m2, [r2 + 33] - pmovzxbw m7, [r2 + 41] + movh m2, [r2 + 33] + punpcklbw m2, m0 + movh m7, [r2 + 41] + punpcklbw m7, m0 -%macro INTRA_PRED_PLANAR16 1 +%macro INTRA_PRED_PLANAR_16 1 %if (%1 < 4) pshuflw m5, m2, 0x55 * %1 pshufd m5, m5, 0 @@ -572,149 +989,267 @@ %endif %endif %endif +%if (%1 > 0) + paddw m3, m6 + paddw m4, m1 + lea r0, [r0 + r1] +%endif 
pmullw m0, m5, [pw_planar8_0] pmullw m5, [pw_planar16_0] paddw m0, m4 paddw m5, m3 - paddw m3, m6 - paddw m4, m1 psraw m5, 5 psraw m0, 5 packuswb m5, m0 movu [r0], m5 - lea r0, [r0 + r1] %endmacro - INTRA_PRED_PLANAR16 0 - INTRA_PRED_PLANAR16 1 - INTRA_PRED_PLANAR16 2 - INTRA_PRED_PLANAR16 3 - INTRA_PRED_PLANAR16 4 - INTRA_PRED_PLANAR16 5 - INTRA_PRED_PLANAR16 6 - INTRA_PRED_PLANAR16 7 - INTRA_PRED_PLANAR16 8 - INTRA_PRED_PLANAR16 9 - INTRA_PRED_PLANAR16 10 - INTRA_PRED_PLANAR16 11 - INTRA_PRED_PLANAR16 12 - INTRA_PRED_PLANAR16 13 - INTRA_PRED_PLANAR16 14 - INTRA_PRED_PLANAR16 15 + INTRA_PRED_PLANAR_16 0 + INTRA_PRED_PLANAR_16 1 + INTRA_PRED_PLANAR_16 2 + INTRA_PRED_PLANAR_16 3 + INTRA_PRED_PLANAR_16 4 + INTRA_PRED_PLANAR_16 5 + INTRA_PRED_PLANAR_16 6 + INTRA_PRED_PLANAR_16 7 + INTRA_PRED_PLANAR_16 8 + INTRA_PRED_PLANAR_16 9 + INTRA_PRED_PLANAR_16 10 + INTRA_PRED_PLANAR_16 11 + INTRA_PRED_PLANAR_16 12 + INTRA_PRED_PLANAR_16 13 + INTRA_PRED_PLANAR_16 14 + INTRA_PRED_PLANAR_16 15 RET ;--------------------------------------------------------------------------------------- ; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) ;--------------------------------------------------------------------------------------- -INIT_XMM sse4 +INIT_XMM sse2 %if ARCH_X86_64 == 1 -cglobal intra_pred_planar32, 3,4,12 -%else -cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize) - %define m8 [rsp + 0 * mmsize] - %define m9 [rsp + 1 * mmsize] - %define m10 [rsp + 2 * mmsize] - %define m11 [rsp + 3 * mmsize] -%endif +cglobal intra_pred_planar32, 3,3,16 movd m3, [r2 + 33] ; topRight = above[32] pxor m7, m7 - pshufb m3, m7 - punpcklbw m3, m7 ; v_topRight + pand m3, [pw_00ff] + pshuflw m3, m3, 0x00 + pshufd m3, m3, 0x44 pmullw m0, m3, [multiL] ; (x + 1) * topRight pmullw m1, m3, [multiH] ; (x + 1) * topRight pmullw m2, m3, [multiH2] ; (x + 1) * topRight pmullw m3, [multiH3] ; (x + 1) * topRight - movd m6, [r2 + 97] ; bottomLeft = left[32] - pshufb m6, m7 - 
punpcklbw m6, m7 ; v_bottomLeft - - paddw m0, m6 - paddw m1, m6 - paddw m2, m6 - paddw m3, m6 - paddw m0, [pw_32] - paddw m1, [pw_32] - paddw m2, [pw_32] - paddw m3, [pw_32] + movd m11, [r2 + 97] ; bottomLeft = left[32] + pand m11, [pw_00ff] + pshuflw m11, m11, 0x00 + pshufd m11, m11, 0x44 + mova m5, m11 + paddw m5, [pw_32] - pmovzxbw m4, [r2 + 1] - pmullw m5, m4, [pw_planar32_1] paddw m0, m5 - psubw m5, m6, m4 - mova m8, m5 + paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + mova m8, m11 + mova m9, m11 + mova m10, m11 - pmovzxbw m4, [r2 + 9] - pmullw m5, m4, [pw_planar32_1] + mova m12, [pw_planar32_1] + movh m4, [r2 + 1] + punpcklbw m4, m7 + psubw m8, m4 + pmullw m4, m12 + paddw m0, m4 + + movh m4, [r2 + 9] + punpcklbw m4, m7 + psubw m9, m4 + pmullw m4, m12 + paddw m1, m4 + + movh m4, [r2 + 17] + punpcklbw m4, m7 + psubw m10, m4 + pmullw m4, m12 + paddw m2, m4 + + movh m4, [r2 + 25] + punpcklbw m4, m7 + psubw m11, m4 + pmullw m4, m12 + paddw m3, m4 + + mova m12, [pw_planar32_L] + mova m13, [pw_planar32_H] + mova m14, [pw_planar16_0] + mova m15, [pw_planar8_0] +%macro PROCESS 1 + pmullw m5, %1, m12 + pmullw m6, %1, m13 + paddw m5, m0 + paddw m6, m1 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r0], m5 + + pmullw m5, %1, m14 + pmullw %1, m15 + paddw m5, m2 + paddw %1, m3 + psraw m5, 6 + psraw %1, 6 + packuswb m5, %1 + movu [r0 + 16], m5 +%endmacro + +%macro INCREMENT 0 + paddw m2, m10 + paddw m3, m11 + paddw m0, m8 + paddw m1, m9 + add r0, r1 +%endmacro + +%assign x 0 +%rep 4 + pxor m7, m7 + movq m4, [r2 + 65 + x * 8] + punpcklbw m4, m7 +%assign y 0 +%rep 8 + %if y < 4 + pshuflw m7, m4, 0x55 * y + pshufd m7, m7, 0x44 + %else + pshufhw m7, m4, 0x55 * (y - 4) + pshufd m7, m7, 0xEE + %endif + PROCESS m7 + %if x + y < 10 + INCREMENT + %endif +%assign y y+1 +%endrep +%assign x x+1 +%endrep + RET + +%else ;end ARCH_X86_64, start ARCH_X86_32 +cglobal intra_pred_planar32, 3,3,8,0-(4*mmsize) + movd m3, [r2 + 33] ; topRight = above[32] + + pxor m7, m7 + pand m3, [pw_00ff] 
+ pshuflw m3, m3, 0x00 + pshufd m3, m3, 0x44 + + pmullw m0, m3, [multiL] ; (x + 1) * topRight + pmullw m1, m3, [multiH] ; (x + 1) * topRight + pmullw m2, m3, [multiH2] ; (x + 1) * topRight + pmullw m3, [multiH3] ; (x + 1) * topRight + + movd m6, [r2 + 97] ; bottomLeft = left[32] + pand m6, [pw_00ff] + pshuflw m6, m6, 0x00 + pshufd m6, m6, 0x44 + mova m5, m6 + paddw m5, [pw_32] + + paddw m0, m5 paddw m1, m5 + paddw m2, m5 + paddw m3, m5 + + movh m4, [r2 + 1] + punpcklbw m4, m7 psubw m5, m6, m4 - mova m9, m5 + mova [rsp + 0 * mmsize], m5 + pmullw m4, [pw_planar32_1] + paddw m0, m4 - pmovzxbw m4, [r2 + 17] - pmullw m5, m4, [pw_planar32_1] - paddw m2, m5 + movh m4, [r2 + 9] + punpcklbw m4, m7 psubw m5, m6, m4 - mova m10, m5 + mova [rsp + 1 * mmsize], m5 + pmullw m4, [pw_planar32_1] + paddw m1, m4 - pmovzxbw m4, [r2 + 25] - pmullw m5, m4, [pw_planar32_1] - paddw m3, m5 + movh m4, [r2 + 17] + punpcklbw m4, m7 psubw m5, m6, m4 - mova m11, m5 - add r2, 65 ; (2 * blkSize + 1) + mova [rsp + 2 * mmsize], m5 + pmullw m4, [pw_planar32_1] + paddw m2, m4 -%macro INTRA_PRED_PLANAR32 0 - movd m4, [r2] - pshufb m4, m7 + movh m4, [r2 + 25] punpcklbw m4, m7 + psubw m5, m6, m4 + mova [rsp + 3 * mmsize], m5 + pmullw m4, [pw_planar32_1] + paddw m3, m4 - pmullw m5, m4, [pw_planar32_L] - pmullw m6, m4, [pw_planar32_H] +%macro PROCESS 1 + pmullw m5, %1, [pw_planar32_L] + pmullw m6, %1, [pw_planar32_H] paddw m5, m0 paddw m6, m1 - paddw m0, m8 - paddw m1, m9 psraw m5, 6 psraw m6, 6 packuswb m5, m6 movu [r0], m5 - pmullw m5, m4, [pw_planar16_0] - pmullw m4, [pw_planar8_0] + pmullw m5, %1, [pw_planar16_0] + pmullw %1, [pw_planar8_0] paddw m5, m2 - paddw m4, m3 - paddw m2, m10 - paddw m3, m11 + paddw %1, m3 psraw m5, 6 - psraw m4, 6 - packuswb m5, m4 + psraw %1, 6 + packuswb m5, %1 movu [r0 + 16], m5 +%endmacro - lea r0, [r0 + r1] - inc r2 +%macro INCREMENT 0 + paddw m0, [rsp + 0 * mmsize] + paddw m1, [rsp + 1 * mmsize] + paddw m2, [rsp + 2 * mmsize] + paddw m3, [rsp + 3 * mmsize] + add r0, r1 
%endmacro - mov r3, 4 -.loop: - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - INTRA_PRED_PLANAR32 - dec r3 - jnz .loop +%assign y 0 +%rep 4 + pxor m7, m7 + movq m4, [r2 + 65 + y * 8] + punpcklbw m4, m7 +%assign x 0 +%rep 8 + %if x < 4 + pshuflw m7, m4, 0x55 * x + pshufd m7, m7, 0x44 + %else + pshufhw m7, m4, 0x55 * (x - 4) + pshufd m7, m7, 0xEE + %endif + + PROCESS m7 + %if x + y < 10 + INCREMENT + %endif +%assign x x+1 +%endrep +%assign y y+1 +%endrep RET +%endif ; end ARCH_X86_32 + ;----------------------------------------------------------------------------------------- ; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) ;----------------------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal intra_pred_ang4_2, 3,5,4 +INIT_XMM sse2 +cglobal intra_pred_ang4_2, 3,5,3 lea r4, [r2 + 2] add r2, 10 cmp r3m, byte 34 @@ -722,65 +1257,94 @@ movh m0, [r2] movd [r0], m0 - palignr m1, m0, 1 + mova m1, m0 + psrldq m1, 1 movd [r0 + r1], m1 - palignr m2, m0, 2 + mova m2, m0 + psrldq m2, 2 movd [r0 + r1 * 2], m2 lea r1, [r1 * 3] psrldq m0, 3 movd [r0 + r1], m0 RET -INIT_XMM sse4 -cglobal intra_pred_ang4_3, 3,5,5 +INIT_XMM sse2 +cglobal intra_pred_ang4_3, 3,5,8 mov r4, 1 cmp r3m, byte 33 mov r3, 9 cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] - palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] - palignr m3, m0, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4] + mova m1, m0 + psrldq m1, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + mova m2, m0 + psrldq m2, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + mova m3, m0 + psrldq m3, 6 ; [x x x x x x x x 8 7 7 6 6 5 5 4] punpcklqdq m0, m1 punpcklqdq m2, m3 - lea r3, 
[ang_table + 20 * 16] - movh m3, [r3 + 6 * 16] ; [26] - movhps m3, [r3] ; [20] - movh m4, [r3 - 6 * 16] ; [14] - movhps m4, [r3 - 12 * 16] ; [ 8] + lea r3, [pw_ang_table + 20 * 16] + mova m4, [r3 + 6 * 16] ; [26] + mova m5, [r3] ; [20] + mova m6, [r3 - 6 * 16] ; [14] + mova m7, [r3 - 12 * 16] ; [ 8] jmp .do_filter4x4 ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose ALIGN 16 .do_filter4x4: - mova m1, [pw_1024] - - pmaddubsw m0, m3 - pmulhrsw m0, m1 - pmaddubsw m2, m4 - pmulhrsw m2, m1 - packuswb m0, m2 + pxor m1, m1 + pxor m3, m3 + punpckhbw m3, m0 + psrlw m3, 8 + pmaddwd m3, m5 + punpcklbw m0, m1 + pmaddwd m0, m4 + packssdw m0, m3 + paddw m0, [pw_16] + psraw m0, 5 + pxor m3, m3 + punpckhbw m3, m2 + psrlw m3, 8 + pmaddwd m3, m7 + punpcklbw m2, m1 + pmaddwd m2, m6 + packssdw m2, m3 + paddw m2, [pw_16] + psraw m2, 5 - ; NOTE: mode 33 doesn't reorde, UNSAFE but I don't use any instruction that affect eflag register before + ; NOTE: mode 33 doesn't reorder, UNSAFE but I don't use any instruction that affect eflag register before jz .store - ; transpose 4x4 - pshufb m0, [c_trans_4x4] + ; transpose 4x4 c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 + pshufd m0, m0, 0xD8 + pshufd m1, m2, 0xD8 + pshuflw m0, m0, 0xD8 + pshuflw m1, m1, 0xD8 + pshufhw m0, m0, 0xD8 + pshufhw m1, m1, 0xD8 + mova m2, m0 + punpckldq m0, m1 + punpckhdq m2, m1 .store: - ; TODO: use pextrd here after intrinsic ssse3 removed + packuswb m0, m2 movd [r0], m0 - pextrd [r0 + r1], m0, 1 - pextrd [r0 + r1 * 2], m0, 2 + pshufd m0, m0, 0x39 + movd [r0 + r1], m0 + pshufd m0, m0, 0x39 + movd [r0 + r1 * 2], m0 lea r1, [r1 * 3] - pextrd [r0 + r1], m0, 3 + pshufd m0, m0, 0x39 + movd [r0 + r1], m0 RET -cglobal intra_pred_ang4_4, 3,5,5 +cglobal intra_pred_ang4_4, 3,5,8 xor r4, r4 inc r4 cmp r3m, byte 32 @@ -788,21 +1352,24 @@ cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 ; [x 8 7 6 5 4 3 2] 
punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] - palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + mova m1, m0 + psrldq m1, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + mova m3, m0 + psrldq m3, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] punpcklqdq m0, m1 punpcklqdq m2, m1, m3 - lea r3, [ang_table + 18 * 16] - movh m3, [r3 + 3 * 16] ; [21] - movhps m3, [r3 - 8 * 16] ; [10] - movh m4, [r3 + 13 * 16] ; [31] - movhps m4, [r3 + 2 * 16] ; [20] + lea r3, [pw_ang_table + 18 * 16] + mova m4, [r3 + 3 * 16] ; [21] + mova m5, [r3 - 8 * 16] ; [10] + mova m6, [r3 + 13 * 16] ; [31] + mova m7, [r3 + 2 * 16] ; [20] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang4_5, 3,5,5 +cglobal intra_pred_ang4_5, 3,5,8 xor r4, r4 inc r4 cmp r3m, byte 31 @@ -810,21 +1377,24 @@ cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] - palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + mova m1, m0 + psrldq m1, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + mova m3, m0 + psrldq m3, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] punpcklqdq m0, m1 punpcklqdq m2, m1, m3 - lea r3, [ang_table + 10 * 16] - movh m3, [r3 + 7 * 16] ; [17] - movhps m3, [r3 - 8 * 16] ; [ 2] - movh m4, [r3 + 9 * 16] ; [19] - movhps m4, [r3 - 6 * 16] ; [ 4] + lea r3, [pw_ang_table + 10 * 16] + mova m4, [r3 + 7 * 16] ; [17] + mova m5, [r3 - 8 * 16] ; [ 2] + mova m6, [r3 + 9 * 16] ; [19] + mova m7, [r3 - 6 * 16] ; [ 4] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang4_6, 3,5,5 +cglobal intra_pred_ang4_6, 3,5,8 xor r4, r4 inc r4 cmp r3m, byte 30 @@ -832,20 +1402,22 @@ cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 
; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + mova m2, m0 + psrldq m2, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] punpcklqdq m0, m0 punpcklqdq m2, m2 - lea r3, [ang_table + 19 * 16] - movh m3, [r3 - 6 * 16] ; [13] - movhps m3, [r3 + 7 * 16] ; [26] - movh m4, [r3 - 12 * 16] ; [ 7] - movhps m4, [r3 + 1 * 16] ; [20] + lea r3, [pw_ang_table + 19 * 16] + mova m4, [r3 - 6 * 16] ; [13] + mova m5, [r3 + 7 * 16] ; [26] + mova m6, [r3 - 12 * 16] ; [ 7] + mova m7, [r3 + 1 * 16] ; [20] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang4_7, 3,5,5 +cglobal intra_pred_ang4_7, 3,5,8 xor r4, r4 inc r4 cmp r3m, byte 29 @@ -853,20 +1425,22 @@ cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + mova m3, m0 + psrldq m3, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] punpcklqdq m2, m0, m3 punpcklqdq m0, m0 - lea r3, [ang_table + 20 * 16] - movh m3, [r3 - 11 * 16] ; [ 9] - movhps m3, [r3 - 2 * 16] ; [18] - movh m4, [r3 + 7 * 16] ; [27] - movhps m4, [r3 - 16 * 16] ; [ 4] + lea r3, [pw_ang_table + 20 * 16] + mova m4, [r3 - 11 * 16] ; [ 9] + mova m5, [r3 - 2 * 16] ; [18] + mova m6, [r3 + 7 * 16] ; [27] + mova m7, [r3 - 16 * 16] ; [ 4] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang4_8, 3,5,5 +cglobal intra_pred_ang4_8, 3,5,8 xor r4, r4 inc r4 cmp r3m, byte 28 @@ -874,19 +1448,20 @@ cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] punpcklqdq m0, m0 mova m2, m0 - lea r3, [ang_table + 13 * 16] - movh m3, [r3 - 8 * 16] ; [ 5] - movhps m3, [r3 - 3 * 16] ; [10] - movh m4, [r3 + 2 * 
16] ; [15] - movhps m4, [r3 + 7 * 16] ; [20] + lea r3, [pw_ang_table + 13 * 16] + mova m4, [r3 - 8 * 16] ; [ 5] + mova m5, [r3 - 3 * 16] ; [10] + mova m6, [r3 + 2 * 16] ; [15] + mova m7, [r3 + 7 * 16] ; [20] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang4_9, 3,5,5 +cglobal intra_pred_ang4_9, 3,5,8 xor r4, r4 inc r4 cmp r3m, byte 27 @@ -894,1676 +1469,2064 @@ cmove r3, r4 movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + mova m1, m0 + psrldq m1, 1 ; [x 8 7 6 5 4 3 2] punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] punpcklqdq m0, m0 mova m2, m0 - lea r3, [ang_table + 4 * 16] - movh m3, [r3 - 2 * 16] ; [ 2] - movhps m3, [r3 - 0 * 16] ; [ 4] - movh m4, [r3 + 2 * 16] ; [ 6] - movhps m4, [r3 + 4 * 16] ; [ 8] + lea r3, [pw_ang_table + 4 * 16] + mova m4, [r3 - 2 * 16] ; [ 2] + mova m5, [r3 - 0 * 16] ; [ 4] + mova m6, [r3 + 2 * 16] ; [ 6] + mova m7, [r3 + 4 * 16] ; [ 8] jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang4_10, 3,3,4 - movd m0, [r2 + 9] ; [8 7 6 5 4 3 2 1] - pshufb m0, [pb_unpackbd1] - pshufd m1, m0, 1 - movhlps m2, m0 - pshufd m3, m0, 3 - movd [r0 + r1], m1 - movd [r0 + r1 * 2], m2 - lea r1, [r1 * 3] - movd [r0 + r1], m3 - cmp r4m, byte 0 - jz .quit +;--------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) +;--------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc4, 5,5,3 + inc r2 + pxor m0, m0 + movd m1, [r2] + movd m2, [r2 + 8] + punpckldq m1, m2 + psadbw m1, m0 ; m1 = sum - ; filter - pmovzxbw m0, m0 ; [-1 -1 -1 -1] - movh m1, [r2] ; [4 3 2 1 0] - pshufb m2, m1, [pb_0_8] ; [0 0 0 0] - pshufb m1, [pb_unpackbw1] ; [4 3 2 1] - psubw m1, m2 - psraw m1, 1 - paddw m0, m1 - packuswb m0, m0 -.quit: - movd [r0], m0 - 
RET + test r4d, r4d -INIT_XMM sse4 -cglobal intra_pred_ang4_26, 3,4,3 - movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] + pmulhrsw m1, [pw_4096] ; m1 = (sum + 4) / 8 + movd r4d, m1 ; r4d = dc_val + pshufb m1, m0 ; m1 = byte [dc_val ...] - ; store - movd [r0], m0 - movd [r0 + r1], m0 - movd [r0 + r1 * 2], m0 + ; store DC 4x4 lea r3, [r1 * 3] - movd [r0 + r3], m0 + movd [r0], m1 + movd [r0 + r1], m1 + movd [r0 + r1 * 2], m1 + movd [r0 + r3], m1 - ; filter - cmp r4m, byte 0 - jz .quit + ; do DC filter + jz .end + lea r3d, [r4d * 2 + 2] ; r3d = DC * 2 + 2 + add r4d, r3d ; r4d = DC * 3 + 2 + movd m1, r4d + pshuflw m1, m1, 0 ; m1 = pixDCx3 + pshufd m1, m1, 0 - pshufb m0, [pb_0_8] ; [ 1 1 1 1] - movh m1, [r2 + 8] ; [-4 -3 -2 -1 0] - pinsrb m1, [r2], 0 - pshufb m2, m1, [pb_0_8] ; [0 0 0 0] - pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1] - psubw m1, m2 - psraw m1, 1 - paddw m0, m1 - packuswb m0, m0 + ; filter top + movd m2, [r2] + movd m0, [r2 + 9] + punpckldq m2, m0 + pmovzxbw m2, m2 + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + movd [r0], m2 ; overwrite top-left pixel, we will update it later - pextrb [r0], m0, 0 - pextrb [r0 + r1], m0, 1 - pextrb [r0 + r1 * 2], m0, 2 - pextrb [r0 + r3], m0, 3 -.quit: + ; filter top-left + movzx r4d, byte [r2 + 8] + add r3d, r4d + movzx r4d, byte [r2] + add r3d, r4d + shr r3d, 2 + mov [r0], r3b + + ; filter left + add r0, r1 + pextrb [r0], m2, 4 + pextrb [r0 + r1], m2, 5 + pextrb [r0 + r1 * 2], m2, 6 + +.end: RET -cglobal intra_pred_ang4_11, 3,5,5 - xor r4, r4 - cmp r3m, byte 25 - mov r3, 8 - cmove r3, r4 +;--------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) +;--------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc8, 5, 7, 3 + lea r3, [r2 + 17] + inc r2 + pxor m0, m0 + movh m1, [r2] + movh m2, [r3] + punpcklqdq m1, m2 + psadbw m1, m0 + 
pshufd m2, m1, 2 + paddw m1, m2 - movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] - pinsrb m0, [r2], 0 - palignr m1, m0, 1 ; [x x x x 4 3 2 1] - punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] - punpcklqdq m0, m0 - mova m2, m0 + movd r5d, m1 + add r5d, 8 + shr r5d, 4 ; sum = sum / 16 + movd m1, r5d + pshufb m1, m0 ; m1 = byte [dc_val ...] - lea r3, [ang_table + 24 * 16] + test r4d, r4d - movh m3, [r3 + 6 * 16] ; [24] - movhps m3, [r3 + 4 * 16] ; [26] - movh m4, [r3 + 2 * 16] ; [28] - movhps m4, [r3 + 0 * 16] ; [30] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + ; store DC 8x8 + mov r6, r0 + movh [r0], m1 + movh [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movh [r0], m1 + movh [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movh [r0], m1 + movh [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movh [r0], m1 + movh [r0 + r1], m1 -cglobal intra_pred_ang4_12, 3,5,5 - xor r4, r4 - cmp r3m, byte 24 - mov r3, 8 - cmove r3, r4 + ; Do DC Filter + jz .end + lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 + add r5d, r4d ; r5d = DC * 3 + 2 + movd m1, r5d + pshuflw m1, m1, 0 ; m1 = pixDCx3 + pshufd m1, m1, 0 - movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] - pinsrb m0, [r2], 0 - palignr m1, m0, 1 ; [x x x x 4 3 2 1] - punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] - punpcklqdq m0, m0 - mova m2, m0 + ; filter top + pmovzxbw m2, [r2] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + movh [r6], m2 - lea r3, [ang_table + 20 * 16] - movh m3, [r3 + 7 * 16] ; [27] - movhps m3, [r3 + 2 * 16] ; [22] - movh m4, [r3 - 3 * 16] ; [17] - movhps m4, [r3 - 8 * 16] ; [12] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + ; filter top-left + movzx r5d, byte [r3] + add r4d, r5d + movzx r3d, byte [r2] + add r3d, r4d + shr r3d, 2 + mov [r6], r3b -cglobal intra_pred_ang4_13, 4,5,5 - xor r4, r4 - cmp r3m, byte 23 - mov r3, 8 - jz .next - xchg r3, r4 -.next: - movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] - pinsrb m1, [r2], 1 - palignr m0, m1, 1 ; [x x x 4 3 2 1 0] - 
palignr m2, m1, 2 ; [x x x x 4 3 2 1] - pinsrb m1, [r2 + r3 + 4], 0 - punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] - punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] - punpcklqdq m2, m0, m1 - punpcklqdq m0, m0 + ; filter left + add r6, r1 + pmovzxbw m2, [r2 + 17] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + pextrb [r6], m2, 0 + pextrb [r6 + r1], m2, 1 + pextrb [r6 + 2 * r1], m2, 2 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 3 + pextrb [r6 + r1 * 2], m2, 4 + pextrb [r6 + r1 * 4], m2, 6 + lea r1, [r1 * 3] + pextrb [r6 + r1], m2, 5 - lea r3, [ang_table + 21 * 16] - movh m3, [r3 + 2 * 16] ; [23] - movhps m3, [r3 - 7 * 16] ; [14] - movh m4, [r3 - 16 * 16] ; [ 5] - movhps m4, [r3 + 7 * 16] ; [28] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) +.end: + RET -cglobal intra_pred_ang4_14, 4,5,5 - xor r4, r4 - cmp r3m, byte 22 - mov r3, 8 - jz .next - xchg r3, r4 -.next: - movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] - pinsrb m2, [r2], 1 - palignr m0, m2, 1 ; [x x x 4 3 2 1 0] - palignr m1, m2, 2 ; [x x x x 4 3 2 1] - pinsrb m2, [r2 + r3 + 2], 0 - punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] - punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] - punpcklqdq m0, m0 - punpcklqdq m2, m2 +;-------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) +;-------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_dc16, 5, 7, 4 + lea r3, [r2 + 33] + inc r2 + pxor m0, m0 + movu m1, [r2] + movu m2, [r3] + psadbw m1, m0 + psadbw m2, m0 + paddw m1, m2 + pshufd m2, m1, 2 + paddw m1, m2 - lea r3, [ang_table + 19 * 16] - movh m3, [r3 + 0 * 16] ; [19] - movhps m3, [r3 - 13 * 16] ; [ 6] - movh m4, [r3 + 6 * 16] ; [25] - movhps m4, [r3 - 7 * 16] ; [12] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + movd r5d, m1 + add r5d, 16 + shr r5d, 5 ; sum = sum / 32 + movd m1, r5d 
+ pshufb m1, m0 ; m1 = byte [dc_val ...] -cglobal intra_pred_ang4_15, 4,5,5 - xor r4, r4 - cmp r3m, byte 21 - mov r3, 8 - jz .next - xchg r3, r4 -.next: - movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] - pinsrb m2, [r2], 1 - palignr m0, m2, 1 ; [x x x 4 3 2 1 0] - palignr m1, m2, 2 ; [x x x x 4 3 2 1] - pinsrb m2, [r2 + r3 + 2], 0 - pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] - pinsrb m3, [r2 + r3 + 4], 0 - punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] - punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] - punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] - punpcklqdq m0, m2 - punpcklqdq m2, m4 - - lea r3, [ang_table + 23 * 16] - movh m3, [r3 - 8 * 16] ; [15] - movhps m3, [r3 + 7 * 16] ; [30] - movh m4, [r3 - 10 * 16] ; [13] - movhps m4, [r3 + 5 * 16] ; [28] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - -cglobal intra_pred_ang4_16, 3,5,5 - xor r4, r4 - cmp r3m, byte 20 - mov r3, 8 - jz .next - xchg r3, r4 -.next: - movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] - pinsrb m2, [r2], 1 - palignr m0, m2, 1 ; [x x x 4 3 2 1 0] - palignr m1, m2, 2 ; [x x x x 4 3 2 1] - pinsrb m2, [r2 + r3 + 2], 0 - pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] - pinsrb m3, [r2 + r3 + 3], 0 - punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] - punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] - punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] - punpcklqdq m0, m2 - punpcklqdq m2, m4 - - lea r3, [ang_table + 19 * 16] - movh m3, [r3 - 8 * 16] ; [11] - movhps m3, [r3 + 3 * 16] ; [22] - movh m4, [r3 - 18 * 16] ; [ 1] - movhps m4, [r3 - 7 * 16] ; [12] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + test r4d, r4d -cglobal intra_pred_ang4_17, 3,5,5 - xor r4, r4 - cmp r3m, byte 19 - mov r3, 8 - jz .next - xchg r3, r4 -.next: - movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x] - pinsrb m3, [r2], 1 - palignr m0, m3, 1 ; [- - - 4 3 2 1 0] - palignr m1, m3, 2 ; [- - - - 4 3 2 1] - mova m4, m0 - punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] - pinsrb m3, [r2 + r3 + 1], 0 - punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x] - punpcklqdq m0, m1 + 
; store DC 16x16 + mov r6, r0 + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 + lea r0, [r0 + r1 * 2] + movu [r0], m1 + movu [r0 + r1], m1 - pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y] - pinsrb m2, [r2 + r3 + 2], 0 - pslldq m1, m2, 1 ; [4 3 2 1 0 x y z] - pinsrb m1, [r2 + r3 + 4], 0 - punpcklbw m1, m2 ; [1 0 0 x x y y z] - punpcklbw m2, m3 ; [2 1 1 0 0 x x y] - punpcklqdq m2, m1 + ; Do DC Filter + jz .end + lea r4d, [r5d * 2 + 2] ; r4d = DC * 2 + 2 + add r5d, r4d ; r5d = DC * 3 + 2 + movd m1, r5d + pshuflw m1, m1, 0 ; m1 = pixDCx3 + pshufd m1, m1, 0 - lea r3, [ang_table + 14 * 16] - movh m3, [r3 - 8 * 16] ; [ 6] - movhps m3, [r3 - 2 * 16] ; [12] - movh m4, [r3 + 4 * 16] ; [18] - movhps m4, [r3 + 10 * 16] ; [24] - jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) + ; filter top + pmovzxbw m2, [r2] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + movh [r6], m2 + pmovzxbw m3, [r2 + 8] + paddw m3, m1 + psraw m3, 2 + packuswb m3, m3 + movh [r6 + 8], m3 -cglobal intra_pred_ang4_18, 3,5,1 - mov r4d, [r2 + 8] - mov r3b, byte [r2] - mov [r2 + 8], r3b - mov r3d, [r2 + 8] - bswap r3d - movd m0, r3d + ; filter top-left + movzx r5d, byte [r3] + add r4d, r5d + movzx r3d, byte [r2] + add r3d, r4d + shr r3d, 2 + mov [r6], r3b - pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3] - lea r3, [r1 * 3] - movd [r0 + r3], m0 - psrldq m0, 1 - movd [r0 + r1 * 2], m0 - psrldq m0, 1 - movd [r0 + r1], m0 - psrldq m0, 1 - movd [r0], m0 - mov [r2 + 8], r4w - RET + ; filter left + add r6, r1 + pmovzxbw m2, [r2 + 33] + paddw m2, m1 + psraw m2, 2 + packuswb m2, m2 + pextrb [r6], m2, 0 + pextrb [r6 + r1], m2, 1 + pextrb [r6 + r1 * 
2], m2, 2 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 3 + pextrb [r6 + r1 * 2], m2, 4 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 5 + pextrb [r6 + r1 * 2], m2, 6 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m2, 7 -;----------------------------------------------------------------------------------------- -; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) -;----------------------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal intra_pred_ang8_2, 3,5,2 - lea r4, [r2 + 2] - add r2, 18 - cmp r3m, byte 34 - cmove r2, r4 - movu m0, [r2] - lea r4, [r1 * 3] + pmovzxbw m3, [r2 + 41] + paddw m3, m1 + psraw m3, 2 + packuswb m3, m3 + pextrb [r6 + r1 * 2], m3, 0 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m3, 1 + pextrb [r6 + r1 * 2], m3, 2 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m3, 3 + pextrb [r6 + r1 * 2], m3, 4 + lea r6, [r6 + r1 * 2] + pextrb [r6 + r1], m3, 5 + pextrb [r6 + r1 * 2], m3, 6 - movh [r0], m0 - palignr m1, m0, 1 - movh [r0 + r1], m1 - palignr m1, m0, 2 - movh [r0 + r1 * 2], m1 - palignr m1, m0, 3 - movh [r0 + r4], m1 - palignr m1, m0, 4 - lea r0, [r0 + r1 * 4] - movh [r0], m1 - palignr m1, m0, 5 - movh [r0 + r1], m1 - palignr m1, m0, 6 - movh [r0 + r1 * 2], m1 - palignr m1, m0, 7 - movh [r0 + r4], m1 +.end: RET +;--------------------------------------------------------------------------------------------- +; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter) +;--------------------------------------------------------------------------------------------- INIT_XMM sse4 -cglobal intra_pred_ang8_3, 3,5,8 - lea r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 33 - cmove r2, r4 - lea r3, [ang_table + 22 * 16] - lea r4, [ang_table + 8 * 16] - mova m3, [pw_1024] - - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - - punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 
11 11 10 10 9] - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - - pmaddubsw m4, m0, [r3 + 4 * 16] ; [26] - pmulhrsw m4, m3 - pmaddubsw m1, [r3 - 2 * 16] ; [20] - pmulhrsw m1, m3 - packuswb m4, m1 - - palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] - - pmaddubsw m5, [r3 - 8 * 16] ; [14] - pmulhrsw m5, m3 +cglobal intra_pred_dc32, 3, 5, 5 + lea r3, [r2 + 65] + inc r2 + pxor m0, m0 + movu m1, [r2] + movu m2, [r2 + 16] + movu m3, [r3] + movu m4, [r3 + 16] + psadbw m1, m0 + psadbw m2, m0 + psadbw m3, m0 + psadbw m4, m0 + paddw m1, m2 + paddw m3, m4 + paddw m1, m3 + pshufd m2, m1, 2 + paddw m1, m2 - palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + movd r4d, m1 + add r4d, 32 + shr r4d, 6 ; sum = sum / 64 + movd m1, r4d + pshufb m1, m0 ; m1 = byte [dc_val ...] - pmaddubsw m6, [r4] ; [ 8] - pmulhrsw m6, m3 - packuswb m5, m6 - - palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] - - pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2] - pmulhrsw m6, m3 - - pmaddubsw m1, [r3 + 6 * 16] ; [28] - pmulhrsw m1, m3 - packuswb m6, m1 - - palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] - - pmaddubsw m1, [r3] ; [22] - pmulhrsw m1, m3 +%rep 2 + ; store DC 16x16 + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 
16],m1 + lea r0, [r0 + 2 * r1] + movu [r0], m1 + movu [r0 + r1], m1 + movu [r0 + 16], m1 + movu [r0 + r1 + 16],m1 + lea r0, [r0 + 2 * r1] +%endrep - palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] + RET - pmaddubsw m2, [r3 - 6 * 16] ; [16] - pmulhrsw m2, m3 - packuswb m1, m2 - jmp .transpose8x8 +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar4, 3,3,7 + pmovzxbw m1, [r2 + 1] + pmovzxbw m2, [r2 + 9] + pshufhw m3, m1, 0 ; topRight + pshufd m3, m3, 0xAA + pshufhw m4, m2, 0 ; bottomLeft + pshufd m4, m4, 0xAA -ALIGN 16 -.transpose8x8: - jz .store + pmullw m3, [multi_2Row] ; (x + 1) * topRight + pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) * above[x] + mova m6, [pw_planar4_0] + paddw m3, [pw_4] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 - ; transpose 8x8 - punpckhbw m0, m4, m5 - punpcklbw m4, m5 - punpckhbw m2, m4, m0 - punpcklbw m4, m0 + pshuflw m5, m2, 0 + pmullw m5, m6 + paddw m5, m3 + paddw m3, m4 + psraw m5, 3 + packuswb m5, m5 + movd [r0], m5 - punpckhbw m0, m6, m1 - punpcklbw m6, m1 - punpckhbw m1, m6, m0 - punpcklbw m6, m0 + pshuflw m5, m2, 01010101b + pmullw m5, m6 + paddw m5, m3 + paddw m3, m4 + psraw m5, 3 + packuswb m5, m5 + movd [r0 + r1], m5 + lea r0, [r0 + 2 * r1] - punpckhdq m5, m4, m6 - punpckldq m4, m6 - punpckldq m6, m2, m1 - punpckhdq m2, m1 - mova m1, m2 + pshuflw m5, m2, 10101010b + pmullw m5, m6 + paddw m5, m3 + paddw m3, m4 + psraw m5, 3 + packuswb m5, m5 + movd [r0], m5 -.store: - lea r4, [r1 * 3] - movh [r0], m4 - movhps [r0 + r1], m4 - movh [r0 + r1 * 2], m5 - movhps [r0 + r4], m5 - add r0, r4 - movh [r0 + r1], m6 - movhps [r0 + r1 * 2], m6 - movh [r0 + r4], m1 - movhps [r0 + r1 * 4], m1 + pshuflw m5, m2, 11111111b + pmullw m5, m6 + paddw m5, m3 + paddw m3, m4 + 
psraw m5, 3 + packuswb m5, m5 + movd [r0 + r1], m5 RET -cglobal intra_pred_ang8_4, 3,5,8 - lea r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 32 - cmove r2, r4 - lea r3, [ang_table + 24 * 16] - lea r4, [ang_table + 10 * 16] - mova m3, [pw_1024] - - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - - punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - mova m5, m1 - - pmaddubsw m4, m0, [r3 - 3 * 16] ; [21] - pmulhrsw m4, m3 - pmaddubsw m1, [r4] ; [10] - pmulhrsw m1, m3 - packuswb m4, m1 +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar8, 3,3,7 + pmovzxbw m1, [r2 + 1] + pmovzxbw m2, [r2 + 17] - pmaddubsw m5, [r3 + 7 * 16] ; [31] - pmulhrsw m5, m3 + movd m3, [r2 + 9] ; topRight = above[8]; + movd m4, [r2 + 25] ; bottomLeft = left[8]; - palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + pxor m0, m0 + pshufb m3, m0 + pshufb m4, m0 + punpcklbw m3, m0 ; v_topRight + punpcklbw m4, m0 ; v_bottomLeft - pmaddubsw m6, [r3 - 4 * 16] ; [ 20] - pmulhrsw m6, m3 - packuswb m5, m6 + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) * above[x] + mova m6, [pw_planar8_0] + paddw m3, [pw_8] + paddw m3, m4 + paddw m3, m0 + psubw m4, m1 - palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] +%macro INTRA_PRED_PLANAR8 1 +%if (%1 < 4) + pshuflw m5, m2, 0x55 * %1 + pshufd m5, m5, 0 +%else + pshufhw m5, m2, 0x55 * (%1 - 4) + pshufd m5, m5, 0xAA +%endif + pmullw m5, m6 + paddw m5, m3 + paddw m3, m4 + psraw m5, 4 + packuswb m5, m5 + movh [r0], m5 + lea r0, [r0 + r1] 
+%endmacro - pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9] - pmulhrsw m6, m3 + INTRA_PRED_PLANAR8 0 + INTRA_PRED_PLANAR8 1 + INTRA_PRED_PLANAR8 2 + INTRA_PRED_PLANAR8 3 + INTRA_PRED_PLANAR8 4 + INTRA_PRED_PLANAR8 5 + INTRA_PRED_PLANAR8 6 + INTRA_PRED_PLANAR8 7 + RET - pmaddubsw m1, [r3 + 6 * 16] ; [30] - pmulhrsw m1, m3 - packuswb m6, m1 +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal intra_pred_planar16, 3,3,8 + pmovzxbw m2, [r2 + 1] + pmovzxbw m7, [r2 + 9] - palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] + movd m3, [r2 + 17] ; topRight = above[16] + movd m6, [r2 + 49] ; bottomLeft = left[16] - pmaddubsw m1, [r3 - 5 * 16] ; [19] - pmulhrsw m1, m3 + pxor m0, m0 + pshufb m3, m0 + pshufb m6, m0 + punpcklbw m3, m0 ; v_topRight + punpcklbw m6, m0 ; v_bottomLeft - palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 8] + pmullw m4, m3, [multiH] ; (x + 1) * topRight + pmullw m3, [multiL] ; (x + 1) * topRight + pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) * above[x] + pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) * above[x] + paddw m4, [pw_16] + paddw m3, [pw_16] + paddw m4, m6 + paddw m3, m6 + paddw m4, m5 + paddw m3, m1 + psubw m1, m6, m7 + psubw m6, m2 - pmaddubsw m2, [r4 - 2 * 16] ; [8] - pmulhrsw m2, m3 - packuswb m1, m2 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + pmovzxbw m2, [r2 + 33] + pmovzxbw m7, [r2 + 41] -cglobal intra_pred_ang8_5, 3,5,8 - lea r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 31 - cmove r2, r4 - lea r3, [ang_table + 17 * 16] - lea r4, [ang_table + 2 * 16] - mova m3, [pw_1024] - - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - - punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 
14 13 13 12 12 11 11 10 10 9] - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - mova m5, m1 +%macro INTRA_PRED_PLANAR16 1 +%if (%1 < 4) + pshuflw m5, m2, 0x55 * %1 + pshufd m5, m5, 0 +%else +%if (%1 < 8) + pshufhw m5, m2, 0x55 * (%1 - 4) + pshufd m5, m5, 0xAA +%else +%if (%1 < 12) + pshuflw m5, m7, 0x55 * (%1 - 8) + pshufd m5, m5, 0 +%else + pshufhw m5, m7, 0x55 * (%1 - 12) + pshufd m5, m5, 0xAA +%endif +%endif +%endif + pmullw m0, m5, [pw_planar8_0] + pmullw m5, [pw_planar16_0] + paddw m0, m4 + paddw m5, m3 + paddw m3, m6 + paddw m4, m1 + psraw m5, 5 + psraw m0, 5 + packuswb m5, m0 + movu [r0], m5 + lea r0, [r0 + r1] +%endmacro - pmaddubsw m4, m0, [r3] ; [17] - pmulhrsw m4, m3 - pmaddubsw m1, [r4] ; [2] - pmulhrsw m1, m3 - packuswb m4, m1 + INTRA_PRED_PLANAR16 0 + INTRA_PRED_PLANAR16 1 + INTRA_PRED_PLANAR16 2 + INTRA_PRED_PLANAR16 3 + INTRA_PRED_PLANAR16 4 + INTRA_PRED_PLANAR16 5 + INTRA_PRED_PLANAR16 6 + INTRA_PRED_PLANAR16 7 + INTRA_PRED_PLANAR16 8 + INTRA_PRED_PLANAR16 9 + INTRA_PRED_PLANAR16 10 + INTRA_PRED_PLANAR16 11 + INTRA_PRED_PLANAR16 12 + INTRA_PRED_PLANAR16 13 + INTRA_PRED_PLANAR16 14 + INTRA_PRED_PLANAR16 15 + RET - pmaddubsw m5, [r3 + 2 * 16] ; [19] - pmulhrsw m5, m3 +;--------------------------------------------------------------------------------------- +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel*srcPix, int, int filter) +;--------------------------------------------------------------------------------------- +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal intra_pred_planar32, 3,4,12 +%else +cglobal intra_pred_planar32, 3,4,8,0-(4*mmsize) + %define m8 [rsp + 0 * mmsize] + %define m9 [rsp + 1 * mmsize] + %define m10 [rsp + 2 * mmsize] + %define m11 [rsp + 3 * mmsize] +%endif + movd m3, [r2 + 33] ; topRight = above[32] - palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] - mova m1, m6 + pxor m7, m7 + pshufb m3, m7 + punpcklbw m3, m7 ; v_topRight - 
pmaddubsw m1, [r4 + 2 * 16] ; [4] - pmulhrsw m1, m3 - packuswb m5, m1 + pmullw m0, m3, [multiL] ; (x + 1) * topRight + pmullw m1, m3, [multiH] ; (x + 1) * topRight + pmullw m2, m3, [multiH2] ; (x + 1) * topRight + pmullw m3, [multiH3] ; (x + 1) * topRight - pmaddubsw m6, [r3 + 4 * 16] ; [21] - pmulhrsw m6, m3 + movd m6, [r2 + 97] ; bottomLeft = left[32] + pshufb m6, m7 + punpcklbw m6, m7 ; v_bottomLeft - palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + paddw m0, m6 + paddw m1, m6 + paddw m2, m6 + paddw m3, m6 + paddw m0, [pw_32] + paddw m1, [pw_32] + paddw m2, [pw_32] + paddw m3, [pw_32] - mova m7, m1 - pmaddubsw m7, [r4 + 4 * 16] ; [6] - pmulhrsw m7, m3 - packuswb m6, m7 + pmovzxbw m4, [r2 + 1] + pmullw m5, m4, [pw_planar32_1] + paddw m0, m5 + psubw m5, m6, m4 + mova m8, m5 - pmaddubsw m1, [r3 + 6 * 16] ; [23] - pmulhrsw m1, m3 + pmovzxbw m4, [r2 + 9] + pmullw m5, m4, [pw_planar32_1] + paddw m1, m5 + psubw m5, m6, m4 + mova m9, m5 - palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 8 8 9] + pmovzxbw m4, [r2 + 17] + pmullw m5, m4, [pw_planar32_1] + paddw m2, m5 + psubw m5, m6, m4 + mova m10, m5 - pmaddubsw m2, [r4 + 6 * 16] ; [8] - pmulhrsw m2, m3 - packuswb m1, m2 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + pmovzxbw m4, [r2 + 25] + pmullw m5, m4, [pw_planar32_1] + paddw m3, m5 + psubw m5, m6, m4 + mova m11, m5 + add r2, 65 ; (2 * blkSize + 1) -cglobal intra_pred_ang8_6, 3,5,8 - lea r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 30 - cmove r2, r4 - lea r3, [ang_table + 20 * 16] - lea r4, [ang_table + 8 * 16] - mova m7, [pw_1024] +%macro INTRA_PRED_PLANAR32 0 + movd m4, [r2] + pshufb m4, m7 + punpcklbw m4, m7 - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + pmullw m5, m4, [pw_planar32_L] + pmullw m6, m4, [pw_planar32_H] + paddw m5, m0 + paddw m6, m1 + paddw m0, m8 + paddw m1, m9 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r0], 
m5 - punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - mova m1, m0 + pmullw m5, m4, [pw_planar16_0] + pmullw m4, [pw_planar8_0] + paddw m5, m2 + paddw m4, m3 + paddw m2, m10 + paddw m3, m11 + psraw m5, 6 + psraw m4, 6 + packuswb m5, m4 + movu [r0 + 16], m5 - pmaddubsw m4, m0, [r3 - 7 * 16] ; [13] - pmulhrsw m4, m7 - pmaddubsw m1, [r3 + 6 * 16] ; [26] - pmulhrsw m1, m7 - packuswb m4, m1 + lea r0, [r0 + r1] + inc r2 +%endmacro - palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + mov r3, 4 +.loop: + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + INTRA_PRED_PLANAR32 + dec r3 + jnz .loop + RET - pmaddubsw m5, m6, [r4 - 1 * 16] ; [7] - pmulhrsw m5, m7 +;----------------------------------------------------------------------------------------- +; void intraPredAng4(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang4_2, 3,5,3 + lea r4, [r2 + 2] + add r2, 10 + cmp r3m, byte 34 + cmove r2, r4 - pmaddubsw m6, [r3] ; [20] - pmulhrsw m6, m7 - packuswb m5, m6 + movh m0, [r2] + movd [r0], m0 + palignr m1, m0, 1 + movd [r0 + r1], m1 + palignr m2, m0, 2 + movd [r0 + r1 * 2], m2 + lea r1, [r1 * 3] + psrldq m0, 3 + movd [r0 + r1], m0 + RET - palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] +INIT_XMM sse4 +cglobal intra_pred_ang4_3, 3,5,5 + mov r4, 1 + cmp r3m, byte 33 + mov r3, 9 + cmove r3, r4 - pmaddubsw m6, m1, [r4 - 7 * 16] ; [1] - pmulhrsw m6, m7 + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + palignr m2, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + palignr m3, m0, 6 ; [x x x x x x x x 
8 7 7 6 6 5 5 4] + punpcklqdq m0, m1 + punpcklqdq m2, m3 - mova m3, m1 - pmaddubsw m3, [r3 - 6 * 16] ; [14] - pmulhrsw m3, m7 - packuswb m6, m3 + lea r3, [ang_table + 20 * 16] + movh m3, [r3 + 6 * 16] ; [26] + movhps m3, [r3] ; [20] + movh m4, [r3 - 6 * 16] ; [14] + movhps m4, [r3 - 12 * 16] ; [ 8] + jmp .do_filter4x4 - pmaddubsw m1, [r3 + 7 * 16] ; [27] - pmulhrsw m1, m7 + ; NOTE: share path, input is m0=[1 0], m2=[3 2], m3,m4=coef, flag_z=no_transpose +ALIGN 16 +.do_filter4x4: + mova m1, [pw_1024] - palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] - - pmaddubsw m2, [r4] ; [8] - pmulhrsw m2, m7 - packuswb m1, m2 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - -cglobal intra_pred_ang8_7, 3,5,8 - lea r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 29 - cmove r2, r4 - lea r3, [ang_table + 24 * 16] - lea r4, [ang_table + 6 * 16] - mova m7, [pw_1024] + pmaddubsw m0, m3 + pmulhrsw m0, m1 + pmaddubsw m2, m4 + pmulhrsw m2, m1 + packuswb m0, m2 - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + ; NOTE: mode 33 doesn't reorde, UNSAFE but I don't use any instruction that affect eflag register before + jz .store - punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + ; transpose 4x4 + pshufb m0, [c_trans_4x4] - pmaddubsw m4, m0, [r4 + 3 * 16] ; [9] - pmulhrsw m4, m7 - pmaddubsw m3, m0, [r3 - 6 * 16] ; [18] - pmulhrsw m3, m7 - packuswb m4, m3 +.store: + ; TODO: use pextrd here after intrinsic ssse3 removed + movd [r0], m0 + pextrd [r0 + r1], m0, 1 + pextrd [r0 + r1 * 2], m0, 2 + lea r1, [r1 * 3] + pextrd [r0 + r1], m0, 3 + RET - pmaddubsw m5, m0, [r3 + 3 * 16] ; [27] - pmulhrsw m5, m7 +cglobal intra_pred_ang4_4, 3,5,5 + xor r4, r4 + inc r4 + cmp r3m, byte 32 + mov r3, 9 + cmove r3, r4 - palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + 
palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + punpcklqdq m0, m1 + punpcklqdq m2, m1, m3 - pmaddubsw m6, m1, [r4 - 2 * 16] ; [4] - pmulhrsw m6, m7 - packuswb m5, m6 + lea r3, [ang_table + 18 * 16] + movh m3, [r3 + 3 * 16] ; [21] + movhps m3, [r3 - 8 * 16] ; [10] + movh m4, [r3 + 13 * 16] ; [31] + movhps m4, [r3 + 2 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m6, m1, [r4 + 7 * 16] ; [13] - pmulhrsw m6, m7 +cglobal intra_pred_ang4_5, 3,5,5 + xor r4, r4 + inc r4 + cmp r3m, byte 31 + mov r3, 9 + cmove r3, r4 - mova m3, m1 - pmaddubsw m3, [r3 - 2 * 16] ; [22] - pmulhrsw m3, m7 - packuswb m6, m3 + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + palignr m3, m0, 4 ; [x x x x x x x x 7 6 6 5 5 4 4 3] + punpcklqdq m0, m1 + punpcklqdq m2, m1, m3 - pmaddubsw m1, [r3 + 7 * 16] ; [31] - pmulhrsw m1, m7 + lea r3, [ang_table + 10 * 16] + movh m3, [r3 + 7 * 16] ; [17] + movhps m3, [r3 - 8 * 16] ; [ 2] + movh m4, [r3 + 9 * 16] ; [19] + movhps m4, [r3 - 6 * 16] ; [ 4] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] +cglobal intra_pred_ang4_6, 3,5,5 + xor r4, r4 + inc r4 + cmp r3m, byte 30 + mov r3, 9 + cmove r3, r4 - pmaddubsw m2, [r4 + 2 * 16] ; [8] - pmulhrsw m2, m7 - packuswb m1, m2 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m2, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + punpcklqdq m0, m0 + punpcklqdq m2, m2 -cglobal intra_pred_ang8_8, 3,5,8 - lea 
r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 28 - cmove r2, r4 - lea r3, [ang_table + 23 * 16] - lea r4, [ang_table + 8 * 16] - mova m7, [pw_1024] + lea r3, [ang_table + 19 * 16] + movh m3, [r3 - 6 * 16] ; [13] + movhps m3, [r3 + 7 * 16] ; [26] + movh m4, [r3 - 12 * 16] ; [ 7] + movhps m4, [r3 + 1 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] +cglobal intra_pred_ang4_7, 3,5,5 + xor r4, r4 + inc r4 + cmp r3m, byte 29 + mov r3, 9 + cmove r3, r4 - punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m3, m0, 2 ; [x x x x x x x x 6 5 5 4 4 3 3 2] + punpcklqdq m2, m0, m3 + punpcklqdq m0, m0 - pmaddubsw m4, m0, [r4 - 3 * 16] ; [5] - pmulhrsw m4, m7 - pmaddubsw m3, m0, [r4 + 2 * 16] ; [10] - pmulhrsw m3, m7 - packuswb m4, m3 + lea r3, [ang_table + 20 * 16] + movh m3, [r3 - 11 * 16] ; [ 9] + movhps m3, [r3 - 2 * 16] ; [18] + movh m4, [r3 + 7 * 16] ; [27] + movhps m4, [r3 - 16 * 16] ; [ 4] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m5, m0, [r3 - 8 * 16] ; [15] - pmulhrsw m5, m7 +cglobal intra_pred_ang4_8, 3,5,5 + xor r4, r4 + inc r4 + cmp r3m, byte 28 + mov r3, 9 + cmove r3, r4 - pmaddubsw m6, m0, [r3 - 3 * 16] ; [20] - pmulhrsw m6, m7 - packuswb m5, m6 + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + punpcklqdq m0, m0 + mova m2, m0 - pmaddubsw m6, m0, [r3 + 2 * 16] ; [25] - pmulhrsw m6, m7 + lea r3, [ang_table + 13 * 16] + movh m3, [r3 - 8 * 16] ; [ 5] + movhps m3, [r3 - 3 * 16] ; [10] + movh m4, [r3 
+ 2 * 16] ; [15] + movhps m4, [r3 + 7 * 16] ; [20] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m0, [r3 + 7 * 16] ; [30] - pmulhrsw m0, m7 - packuswb m6, m0 +cglobal intra_pred_ang4_9, 3,5,5 + xor r4, r4 + inc r4 + cmp r3m, byte 27 + mov r3, 9 + cmove r3, r4 - pmaddubsw m1, m2, [r4 - 5 * 16] ; [3] - pmulhrsw m1, m7 + movh m0, [r2 + r3] ; [8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 8 7 6 5 4 3 2] + punpcklbw m0, m1 ; [x 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + punpcklqdq m0, m0 + mova m2, m0 - pmaddubsw m2, [r4] ; [8] - pmulhrsw m2, m7 - packuswb m1, m2 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + lea r3, [ang_table + 4 * 16] + movh m3, [r3 - 2 * 16] ; [ 2] + movhps m3, [r3 - 0 * 16] ; [ 4] + movh m4, [r3 + 2 * 16] ; [ 6] + movhps m4, [r3 + 4 * 16] ; [ 8] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang8_9, 3,5,8 - lea r4, [r2 + 1] - add r2, 17 - cmp r3m, byte 27 - cmove r2, r4 - lea r3, [ang_table + 10 * 16] - mova m7, [pw_1024] +cglobal intra_pred_ang4_10, 3,3,4 + movd m0, [r2 + 9] ; [8 7 6 5 4 3 2 1] + pshufb m0, [pb_unpackbd1] + pshufd m1, m0, 1 + movhlps m2, m0 + pshufd m3, m0, 3 + movd [r0 + r1], m1 + movd [r0 + r1 * 2], m2 + lea r1, [r1 * 3] + movd [r0 + r1], m3 + cmp r4m, byte 0 + jz .quit - movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + ; filter + pmovzxbw m0, m0 ; [-1 -1 -1 -1] + movh m1, [r2] ; [4 3 2 1 0] + pshufb m2, m1, [pb_0_8] ; [0 0 0 0] + pshufb m1, [pb_unpackbw1] ; [4 3 2 1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + packuswb m0, m0 +.quit: + movd [r0], m0 + RET - punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] +INIT_XMM sse4 +cglobal intra_pred_ang4_26, 3,4,3 + movd m0, [r2 + 1] ; [8 7 6 5 4 3 2 1] - pmaddubsw m4, m0, [r3 - 8 * 16] ; [2] - pmulhrsw m4, m7 - pmaddubsw m3, m0, [r3 - 6 * 16] ; [4] - pmulhrsw m3, m7 - packuswb m4, m3 + 
; store + movd [r0], m0 + movd [r0 + r1], m0 + movd [r0 + r1 * 2], m0 + lea r3, [r1 * 3] + movd [r0 + r3], m0 - pmaddubsw m5, m0, [r3 - 4 * 16] ; [6] - pmulhrsw m5, m7 + ; filter + cmp r4m, byte 0 + jz .quit - pmaddubsw m6, m0, [r3 - 2 * 16] ; [8] - pmulhrsw m6, m7 - packuswb m5, m6 - - pmaddubsw m6, m0, [r3] ; [10] - pmulhrsw m6, m7 - - pmaddubsw m2, m0, [r3 + 2 * 16] ; [12] - pmulhrsw m2, m7 - packuswb m6, m2 - - pmaddubsw m1, m0, [r3 + 4 * 16] ; [14] - pmulhrsw m1, m7 - - pmaddubsw m0, [r3 + 6 * 16] ; [16] - pmulhrsw m0, m7 - packuswb m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - -cglobal intra_pred_ang8_10, 3,6,5 - movh m0, [r2 + 17] - mova m4, [pb_unpackbq] - palignr m1, m0, 2 - pshufb m1, m4 - palignr m2, m0, 4 - pshufb m2, m4 - palignr m3, m0, 6 - pshufb m3, m4 - pshufb m0, m4 - - lea r5, [r1 * 3] - movhps [r0 + r1], m0 - movh [r0 + r1 * 2], m1 - movhps [r0 + r5], m1 - lea r3, [r0 + r1 * 4] - movh [r3], m2 - movhps [r3 + r1], m2 - movh [r3 + r1 * 2], m3 - movhps [r3 + r5], m3 - -; filter - cmp r4m, byte 0 - jz .quit - - pmovzxbw m0, m0 - movu m1, [r2] - palignr m2, m1, 1 - pshufb m1, m4 - pmovzxbw m1, m1 - pmovzxbw m2, m2 - psubw m2, m1 - psraw m2, 1 - paddw m0, m2 - packuswb m0, m0 - -.quit: - movh [r0], m0 - RET - -cglobal intra_pred_ang8_26, 3,6,3 - movu m2, [r2] - palignr m0, m2, 1 - lea r5, [r1 * 3] - movh [r0], m0 - movh [r0 + r1], m0 - movh [r0 + r1 * 2], m0 - movh [r0 + r5], m0 - lea r3, [r0 + r1 * 4] - movh [r3], m0 - movh [r3 + r1], m0 - movh [r3 + r1 * 2], m0 - movh [r3 + r5], m0 - -; filter - cmp r4m, byte 0 - jz .quit + pshufb m0, [pb_0_8] ; [ 1 1 1 1] + movh m1, [r2 + 8] ; [-4 -3 -2 -1 0] + pinsrb m1, [r2], 0 + pshufb m2, m1, [pb_0_8] ; [0 0 0 0] + pshufb m1, [pb_unpackbw1] ; [-4 -3 -2 -1] + psubw m1, m2 + psraw m1, 1 + paddw m0, m1 + packuswb m0, m0 - pshufb m2, [pb_unpackbq] - movhlps m1, m2 - pmovzxbw m2, m2 - movu m0, [r2 + 17] - pmovzxbw m1, m1 - pmovzxbw m0, m0 - psubw m0, m2 - psraw m0, 1 - 
paddw m1, m0 - packuswb m1, m1 - pextrb [r0], m1, 0 - pextrb [r0 + r1], m1, 1 - pextrb [r0 + r1 * 2], m1, 2 - pextrb [r0 + r5], m1, 3 - pextrb [r3], m1, 4 - pextrb [r3 + r1], m1, 5 - pextrb [r3 + r1 * 2], m1, 6 - pextrb [r3 + r5], m1, 7 + pextrb [r0], m0, 0 + pextrb [r0 + r1], m0, 1 + pextrb [r0 + r1 * 2], m0, 2 + pextrb [r0 + r3], m0, 3 .quit: RET -cglobal intra_pred_ang8_11, 3,5,8 - xor r4, r4 - cmp r3m, byte 25 - mov r3, 16 - cmove r3, r4 - - movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m0, [r2], 0 - palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - - punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] +cglobal intra_pred_ang4_11, 3,5,5 + xor r4, r4 + cmp r3m, byte 25 + mov r3, 8 + cmove r3, r4 - lea r3, [ang_table + 23 * 16] - mova m7, [pw_1024] + movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] + pinsrb m0, [r2], 0 + palignr m1, m0, 1 ; [x x x x 4 3 2 1] + punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] + punpcklqdq m0, m0 + mova m2, m0 - pmaddubsw m4, m0, [r3 + 7 * 16] ; [30] - pmulhrsw m4, m7 - pmaddubsw m3, m0, [r3 + 5 * 16] ; [28] - pmulhrsw m3, m7 - packuswb m4, m3 + lea r3, [ang_table + 24 * 16] - pmaddubsw m5, m0, [r3 + 3 * 16] ; [26] - pmulhrsw m5, m7 + movh m3, [r3 + 6 * 16] ; [24] + movhps m3, [r3 + 4 * 16] ; [26] + movh m4, [r3 + 2 * 16] ; [28] + movhps m4, [r3 + 0 * 16] ; [30] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m6, m0, [r3 + 1 * 16] ; [24] - pmulhrsw m6, m7 - packuswb m5, m6 +cglobal intra_pred_ang4_12, 3,5,5 + xor r4, r4 + cmp r3m, byte 24 + mov r3, 8 + cmove r3, r4 - pmaddubsw m6, m0, [r3 - 1 * 16] ; [22] - pmulhrsw m6, m7 + movh m0, [r2 + r3] ; [x x x 4 3 2 1 0] + pinsrb m0, [r2], 0 + palignr m1, m0, 1 ; [x x x x 4 3 2 1] + punpcklbw m0, m1 ; [x x x x x x x x 4 3 3 2 2 1 1 0] + punpcklqdq m0, m0 + mova m2, m0 - pmaddubsw m2, m0, [r3 - 3 * 16] ; [20] - pmulhrsw m2, m7 - packuswb m6, m2 + lea r3, [ang_table + 20 * 16] + movh m3, [r3 + 7 * 16] ; [27] + 
movhps m3, [r3 + 2 * 16] ; [22] + movh m4, [r3 - 3 * 16] ; [17] + movhps m4, [r3 - 8 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m1, m0, [r3 - 5 * 16] ; [18] - pmulhrsw m1, m7 +cglobal intra_pred_ang4_13, 4,5,5 + xor r4, r4 + cmp r3m, byte 23 + mov r3, 8 + jz .next + xchg r3, r4 +.next: + movh m1, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] + pinsrb m1, [r2], 1 + palignr m0, m1, 1 ; [x x x 4 3 2 1 0] + palignr m2, m1, 2 ; [x x x x 4 3 2 1] + pinsrb m1, [r2 + r3 + 4], 0 + punpcklbw m1, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m2 ; [4 3 3 2 2 1 1 0] + punpcklqdq m2, m0, m1 + punpcklqdq m0, m0 - pmaddubsw m0, [r3 - 7 * 16] ; [16] - pmulhrsw m0, m7 - packuswb m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + lea r3, [ang_table + 21 * 16] + movh m3, [r3 + 2 * 16] ; [23] + movhps m3, [r3 - 7 * 16] ; [14] + movh m4, [r3 - 16 * 16] ; [ 5] + movhps m4, [r3 + 7 * 16] ; [28] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) -cglobal intra_pred_ang8_12, 3,5,8 - xor r4, r4 - cmp r3m, byte 24 - mov r3, 16 +cglobal intra_pred_ang4_14, 4,5,5 + xor r4, r4 + cmp r3m, byte 22 + mov r3, 8 jz .next - xchg r3, r4 + xchg r3, r4 .next: + movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] + pinsrb m2, [r2], 1 + palignr m0, m2, 1 ; [x x x 4 3 2 1 0] + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + pinsrb m2, [r2 + r3 + 2], 0 + punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + punpcklqdq m0, m0 + punpcklqdq m2, m2 - movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m1, [r2], 0 - pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] - pinsrb m0, [r2 + r3 + 6], 0 + lea r3, [ang_table + 19 * 16] + movh m3, [r3 + 0 * 16] ; [19] + movhps m3, [r3 - 13 * 16] ; [ 6] + movh m4, [r3 + 6 * 16] ; [25] + movhps m4, [r3 - 7 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - lea r4, [ang_table + 22 * 
16] - mova m7, [pw_1024] +cglobal intra_pred_ang4_15, 4,5,5 + xor r4, r4 + cmp r3m, byte 21 + mov r3, 8 + jz .next + xchg r3, r4 +.next: + movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] + pinsrb m2, [r2], 1 + palignr m0, m2, 1 ; [x x x 4 3 2 1 0] + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + pinsrb m2, [r2 + r3 + 2], 0 + pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] + pinsrb m3, [r2 + r3 + 4], 0 + punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] + punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + punpcklqdq m0, m2 + punpcklqdq m2, m4 - punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] - punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] - palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + lea r3, [ang_table + 23 * 16] + movh m3, [r3 - 8 * 16] ; [15] + movhps m3, [r3 + 7 * 16] ; [30] + movh m4, [r3 - 10 * 16] ; [13] + movhps m4, [r3 + 5 * 16] ; [28] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m4, m2, [r4 + 5 * 16] ; [27] - pmulhrsw m4, m7 - pmaddubsw m3, m2, [r4] ; [22] - pmulhrsw m3, m7 - packuswb m4, m3 - - pmaddubsw m1, m0, [r4 + 7 * 16] ; [29] - pmulhrsw m1, m7 - - pmaddubsw m0, [r4 + 2 * 16] ; [24] - pmulhrsw m0, m7 - packuswb m1, m0 - - pmaddubsw m5, m2, [r4 - 5 * 16] ; [17] - pmulhrsw m5, m7 - - lea r4, [ang_table + 7 * 16] - pmaddubsw m6, m2, [r4 + 5 * 16] ; [12] - pmulhrsw m6, m7 - packuswb m5, m6 - - pmaddubsw m6, m2, [r4] ; [7] - pmulhrsw m6, m7 - - pmaddubsw m2, [r4 - 5 * 16] ; [2] - pmulhrsw m2, m7 - packuswb m6, m2 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - -cglobal intra_pred_ang8_13, 4,5,8 - xor r4, r4 - cmp r3m, byte 23 - mov r3, 16 +cglobal intra_pred_ang4_16, 3,5,5 + xor r4, r4 + cmp r3m, byte 20 + mov r3, 8 jz .next - xchg r3, r4 + xchg r3, r4 .next: + movh m2, [r2 + r4 - 1] ; [x x 4 3 2 1 0 x] + pinsrb m2, [r2], 1 + palignr m0, m2, 1 ; [x x x 4 3 2 1 0] + palignr m1, m2, 2 ; [x x x x 4 3 2 1] + pinsrb m2, [r2 + r3 + 2], 0 + 
pslldq m3, m2, 1 ; [x 4 3 2 1 0 x y] + pinsrb m3, [r2 + r3 + 3], 0 + punpcklbw m4, m3, m2 ; [2 1 1 0 0 x x y] + punpcklbw m2, m0 ; [3 2 2 1 1 0 0 x] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + punpcklqdq m0, m2 + punpcklqdq m2, m4 - movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m1, [r2], 0 - pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] - pinsrb m1, [r2 + r3 + 4], 0 - pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] - pinsrb m0, [r2 + r3 + 7], 0 - punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] - punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] - palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] - palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - - lea r4, [ang_table + 24 * 16] - mova m7, [pw_1024] - - pmaddubsw m4, m5, [r4 - 1 * 16] ; [23] - pmulhrsw m4, m7 + lea r3, [ang_table + 19 * 16] + movh m3, [r3 - 8 * 16] ; [11] + movhps m3, [r3 + 3 * 16] ; [22] + movh m4, [r3 - 18 * 16] ; [ 1] + movhps m4, [r3 - 7 * 16] ; [12] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m6, m1, [r4 + 4 * 16] ; [28] - pmulhrsw m6, m7 +cglobal intra_pred_ang4_17, 3,5,5 + xor r4, r4 + cmp r3m, byte 19 + mov r3, 8 + jz .next + xchg r3, r4 +.next: + movh m3, [r2 + r4 - 1] ; [- - 4 3 2 1 0 x] + pinsrb m3, [r2], 1 + palignr m0, m3, 1 ; [- - - 4 3 2 1 0] + palignr m1, m3, 2 ; [- - - - 4 3 2 1] + mova m4, m0 + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0] + pinsrb m3, [r2 + r3 + 1], 0 + punpcklbw m1, m3, m4 ; [3 2 2 1 1 0 0 x] + punpcklqdq m0, m1 - pmaddubsw m0, [r4] ; [24] - pmulhrsw m0, m7 + pslldq m2, m3, 1 ; [- 4 3 2 1 0 x y] + pinsrb m2, [r2 + r3 + 2], 0 + pslldq m1, m2, 1 ; [4 3 2 1 0 x y z] + pinsrb m1, [r2 + r3 + 4], 0 + punpcklbw m1, m2 ; [1 0 0 x x y y z] + punpcklbw m2, m3 ; [2 1 1 0 0 x x y] + punpcklqdq m2, m1 - lea r4, [ang_table + 13 * 16] - pmaddubsw m3, m5, [r4 + 1 * 16] ; [14] - pmulhrsw m3, m7 - packuswb m4, m3 + lea r3, [ang_table + 14 * 16] + movh m3, [r3 - 8 * 16] 
; [ 6] + movhps m3, [r3 - 2 * 16] ; [12] + movh m4, [r3 + 4 * 16] ; [18] + movhps m4, [r3 + 10 * 16] ; [24] + jmp mangle(private_prefix %+ _ %+ intra_pred_ang4_3 %+ SUFFIX %+ .do_filter4x4) - pmaddubsw m5, [r4 - 8 * 16] ; [5] - pmulhrsw m5, m7 - packuswb m5, m6 +cglobal intra_pred_ang4_18, 3,5,1 + mov r4d, [r2 + 8] + mov r3b, byte [r2] + mov [r2 + 8], r3b + mov r3d, [r2 + 8] + bswap r3d + movd m0, r3d - pmaddubsw m6, m1, [r4 + 6 * 16] ; [19] - pmulhrsw m6, m7 + pinsrd m0, [r2 + 1], 1 ; [- 3 2 1 0 -1 -2 -3] + lea r3, [r1 * 3] + movd [r0 + r3], m0 + psrldq m0, 1 + movd [r0 + r1 * 2], m0 + psrldq m0, 1 + movd [r0 + r1], m0 + psrldq m0, 1 + movd [r0], m0 + mov [r2 + 8], r4w + RET - pmaddubsw m2, m1, [r4 - 3 * 16] ; [10] - pmulhrsw m2, m7 - packuswb m6, m2 +;----------------------------------------------------------------------------------------- +; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal intra_pred_ang8_2, 3,5,2 + lea r4, [r2 + 2] + add r2, 18 + cmp r3m, byte 34 + cmove r2, r4 + movu m0, [r2] + lea r4, [r1 * 3] - pmaddubsw m1, [r4 - 12 * 16] ; [1] - pmulhrsw m1, m7 - packuswb m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + movh [r0], m0 + palignr m1, m0, 1 + movh [r0 + r1], m1 + palignr m1, m0, 2 + movh [r0 + r1 * 2], m1 + palignr m1, m0, 3 + movh [r0 + r4], m1 + palignr m1, m0, 4 + lea r0, [r0 + r1 * 4] + movh [r0], m1 + palignr m1, m0, 5 + movh [r0 + r1], m1 + palignr m1, m0, 6 + movh [r0 + r1 * 2], m1 + palignr m1, m0, 7 + movh [r0 + r4], m1 + RET -cglobal intra_pred_ang8_14, 4,5,8 - xor r4, r4 - cmp r3m, byte 22 - mov r3, 16 - jz .next - xchg r3, r4 -.next: +INIT_XMM sse4 +cglobal intra_pred_ang8_3, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 33 + cmove r2, r4 + lea r3, [ang_table + 22 * 16] + lea r4, [ang_table + 8 * 16] + mova m3, [pw_1024] - movu 
m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] - pinsrb m1, [r2], 2 - pinsrb m1, [r2 + r3 + 2], 1 - pinsrb m1, [r2 + r3 + 5], 0 - pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] - pinsrb m0, [r2 + r3 + 7], 0 - punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] - punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] - palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] - palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] - palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - lea r4, [ang_table + 24 * 16] - mova m3, [pw_1024] + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - pmaddubsw m4, m2, [r4 - 5 * 16] ; [19] + pmaddubsw m4, m0, [r3 + 4 * 16] ; [26] pmulhrsw m4, m3 + pmaddubsw m1, [r3 - 2 * 16] ; [20] + pmulhrsw m1, m3 + packuswb m4, m1 - pmaddubsw m0, [r4] ; [24] - pmulhrsw m0, m3 + palignr m5, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] - pmaddubsw m5, m6, [r4 + 1 * 16] ; [25] + pmaddubsw m5, [r3 - 8 * 16] ; [14] pmulhrsw m5, m3 - lea r4, [ang_table + 12 * 16] - pmaddubsw m6, [r4] ; [12] + palignr m6, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + + pmaddubsw m6, [r4] ; [ 8] pmulhrsw m6, m3 packuswb m5, m6 - pmaddubsw m6, m1, [r4 + 19 * 16] ; [31] + palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] + + pmaddubsw m6, m1, [r4 - 6 * 16] ; [ 2] pmulhrsw m6, m3 - pmaddubsw m2, [r4 - 6 * 16] ; [6] - pmulhrsw m2, m3 - packuswb m4, m2 + pmaddubsw m1, [r3 + 6 * 16] ; [28] + pmulhrsw m1, m3 + packuswb m6, m1 - pmaddubsw m2, m1, [r4 + 6 * 16] ; [18] - pmulhrsw m2, m3 - packuswb m6, m2 + palignr m1, m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] - pmaddubsw m1, [r4 - 7 * 16] ; [5] + pmaddubsw m1, [r3] ; [22] pmulhrsw m1, m3 - packuswb 
m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - -cglobal intra_pred_ang8_15, 4,5,8 - xor r4, r4 - cmp r3m, byte 21 - mov r3, 16 - jz .next - xchg r3, r4 -.next: - movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m1, [r2], 0 - movu m2, [r2 + r3] - pshufb m2, [c_mode16_15] - palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] - pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] - pinsrb m0, [r2 + r3 + 8], 0 - punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] - punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] - palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] - palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] - palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] - palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + palignr m2, m0, 12 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] - lea r4, [ang_table + 23 * 16] + pmaddubsw m2, [r3 - 6 * 16] ; [16] + pmulhrsw m2, m3 + packuswb m1, m2 + jmp .transpose8x8 + +ALIGN 16 +.transpose8x8: + jz .store + + ; transpose 8x8 + punpckhbw m0, m4, m5 + punpcklbw m4, m5 + punpckhbw m2, m4, m0 + punpcklbw m4, m0 + + punpckhbw m0, m6, m1 + punpcklbw m6, m1 + punpckhbw m1, m6, m0 + punpcklbw m6, m0 + + punpckhdq m5, m4, m6 + punpckldq m4, m6 + punpckldq m6, m2, m1 + punpckhdq m2, m1 + mova m1, m2 + +.store: + lea r4, [r1 * 3] + movh [r0], m4 + movhps [r0 + r1], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r4], m5 + add r0, r4 + movh [r0 + r1], m6 + movhps [r0 + r1 * 2], m6 + movh [r0 + r4], m1 + movhps [r0 + r1 * 4], m1 + RET + +cglobal intra_pred_ang8_4, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 32 + cmove r2, r4 + lea r3, [ang_table + 24 * 16] + lea r4, [ang_table + 10 * 16] mova m3, [pw_1024] - pmaddubsw m4, [r4 - 8 * 16] ; [15] - pmulhrsw m4, m3 + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - pmaddubsw m2, m5, [r4 + 7 * 16] ; [30] - 
pmulhrsw m2, m3 - packuswb m4, m2 + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + mova m5, m1 - pmaddubsw m5, [r4 - 10 * 16] ; [13] + pmaddubsw m4, m0, [r3 - 3 * 16] ; [21] + pmulhrsw m4, m3 + pmaddubsw m1, [r4] ; [10] + pmulhrsw m1, m3 + packuswb m4, m1 + + pmaddubsw m5, [r3 + 7 * 16] ; [31] pmulhrsw m5, m3 - pmaddubsw m2, m6, [r4 + 5 * 16] ; [28] - pmulhrsw m2, m3 - packuswb m5, m2 + palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] - pmaddubsw m2, m1, [r4 + 3 * 16] ; [26] - pmulhrsw m2, m3 + pmaddubsw m6, [r3 - 4 * 16] ; [ 20] + pmulhrsw m6, m3 + packuswb m5, m6 - pmaddubsw m0, [r4 + 1 * 16] ; [24] - pmulhrsw m0, m3 + palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] - lea r4, [ang_table + 11 * 16] - pmaddubsw m6, [r4] ; [11] + pmaddubsw m6, m1, [r4 - 1 * 16] ; [ 9] pmulhrsw m6, m3 - packuswb m6, m2 - pmaddubsw m1, [r4 - 2 * 16] ; [9] + pmaddubsw m1, [r3 + 6 * 16] ; [30] pmulhrsw m1, m3 - packuswb m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + packuswb m6, m1 -cglobal intra_pred_ang8_16, 4,5,8 - xor r4, r4 - cmp r3m, byte 20 - mov r3, 16 - jz .next - xchg r3, r4 -.next: + palignr m1, m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] - movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m1, [r2], 0 - movu m2, [r2 + r3] - pshufb m2, [c_mode16_16] - palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] - pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] - pinsrb m0, [r2 + r3 + 8], 0 - punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] - punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e] - palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] - palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] - palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] - palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 
a] - palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + pmaddubsw m1, [r3 - 5 * 16] ; [19] + pmulhrsw m1, m3 - lea r4, [ang_table + 22 * 16] - mova m7, [pw_1024] + palignr m2, m0, 10 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 8] - pmaddubsw m3, m5, [r4] ; [22] - pmulhrsw m3, m7 + pmaddubsw m2, [r4 - 2 * 16] ; [8] + pmulhrsw m2, m3 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - pmaddubsw m0, [r4 + 2 * 16] ; [24] - pmulhrsw m0, m7 +cglobal intra_pred_ang8_5, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 31 + cmove r2, r4 + lea r3, [ang_table + 17 * 16] + lea r4, [ang_table + 2 * 16] + mova m3, [pw_1024] - lea r4, [ang_table + 9 * 16] + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - pmaddubsw m4, [r4 + 2 * 16] ; [11] - pmulhrsw m4, m7 - packuswb m4, m3 + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + mova m5, m1 - pmaddubsw m2, [r4 + 3 * 16] ; [12] - pmulhrsw m2, m7 + pmaddubsw m4, m0, [r3] ; [17] + pmulhrsw m4, m3 + pmaddubsw m1, [r4] ; [2] + pmulhrsw m1, m3 + packuswb m4, m1 - pmaddubsw m5, [r4 - 8 * 16] ; [1] - pmulhrsw m5, m7 - packuswb m5, m2 + pmaddubsw m5, [r3 + 2 * 16] ; [19] + pmulhrsw m5, m3 - mova m2, m6 - pmaddubsw m6, [r4 + 14 * 16] ; [23] - pmulhrsw m6, m7 + palignr m6, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + mova m1, m6 - pmaddubsw m2, [r4 - 7 * 16] ; [2] - pmulhrsw m2, m7 - packuswb m6, m2 + pmaddubsw m1, [r4 + 2 * 16] ; [4] + pmulhrsw m1, m3 + packuswb m5, m1 - pmaddubsw m1, [r4 + 4 * 16] ; [13] - pmulhrsw m1, m7 - packuswb m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + pmaddubsw m6, [r3 + 4 * 16] ; [21] + pmulhrsw m6, m3 -cglobal intra_pred_ang8_17, 4,5,8 - xor r4, r4 - cmp r3m, byte 19 - mov r3, 16 - jz .next - xchg 
r3, r4 -.next: + palignr m1, m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] - movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m2, [r2], 0 - movu m1, [r2 + r3] - pshufb m1, [c_mode16_17] - palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] - pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f] - pinsrb m0, [r2 + r3 + 7], 0 - punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f] + mova m7, m1 + pmaddubsw m7, [r4 + 4 * 16] ; [6] + pmulhrsw m7, m3 + packuswb m6, m7 - palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] - palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] - palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + pmaddubsw m1, [r3 + 6 * 16] ; [23] + pmulhrsw m1, m3 - lea r4, [ang_table + 17 * 16] - mova m3, [pw_1024] + palignr m2, m0, 8 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 8 8 9] - pmaddubsw m2, [r4 - 5 * 16] ; [12] + pmaddubsw m2, [r4 + 6 * 16] ; [8] pmulhrsw m2, m3 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - pmaddubsw m4, [r4 - 11 * 16] ; [6] - pmulhrsw m4, m3 - packuswb m4, m2 +cglobal intra_pred_ang8_6, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 30 + cmove r2, r4 + lea r3, [ang_table + 20 * 16] + lea r4, [ang_table + 8 * 16] + mova m7, [pw_1024] - pmaddubsw m5, [r4 + 1 * 16] ; [18] - pmulhrsw m5, m3 + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] - pmaddubsw m2, [r4 + 7 * 16] ; [24] - pmulhrsw m2, m3 - packuswb m5, m2 + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + mova m1, m0 - palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] - mova m2, m6 - pmaddubsw m6, [r4 + 13 * 16] ; [30] - pmulhrsw m6, m3 + pmaddubsw m4, m0, [r3 - 7 * 16] ; [13] + pmulhrsw m4, m7 + 
pmaddubsw m1, [r3 + 6 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m2, [r4 - 13 * 16] ; [4] - pmulhrsw m2, m3 - packuswb m6, m2 + palignr m6, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - palignr m1, m0, 2 ; [3 2 2 1 1 0 0 a a b b c c d d e] - pmaddubsw m1, [r4 - 7 * 16] ; [10] - pmulhrsw m1, m3 + pmaddubsw m5, m6, [r4 - 1 * 16] ; [7] + pmulhrsw m5, m7 - pmaddubsw m0, [r4 - 1 * 16] ; [16] - pmulhrsw m0, m3 - packuswb m1, m0 - jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + pmaddubsw m6, [r3] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 -cglobal intra_pred_ang8_18, 4,4,1 - movu m0, [r2 + 16] - pinsrb m0, [r2], 0 - pshufb m0, [pb_swap8] - movhps m0, [r2 + 1] - lea r2, [r0 + r1 * 4] - lea r3, [r1 * 3] - movh [r2 + r3], m0 - psrldq m0, 1 - movh [r2 + r1 * 2], m0 - psrldq m0, 1 - movh [r2 + r1], m0 - psrldq m0, 1 - movh [r2], m0 - psrldq m0, 1 - movh [r0 + r3], m0 - psrldq m0, 1 - movh [r0 + r1 * 2], m0 - psrldq m0, 1 - movh [r0 + r1], m0 - psrldq m0, 1 - movh [r0], m0 - RET + palignr m1, m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] -%macro TRANSPOSE_STORE_8x8 6 - %if %2 == 1 - ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32 - punpckhbw m0, %3, %4 - punpcklbw %3, %4 - punpckhbw %4, %3, m0 - punpcklbw %3, m0 + pmaddubsw m6, m1, [r4 - 7 * 16] ; [1] + pmulhrsw m6, m7 - punpckhbw m0, %5, m1 - punpcklbw %5, %6 - punpckhbw %6, %5, m0 - punpcklbw %5, m0 + mova m3, m1 + pmaddubsw m3, [r3 - 6 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m6, m3 - punpckhdq m0, %3, %5 - punpckldq %3, %5 - punpckldq %5, %4, %6 - punpckhdq %4, %6 + pmaddubsw m1, [r3 + 7 * 16] ; [27] + pmulhrsw m1, m7 - movh [r0 + + %1 * 8], %3 - movhps [r0 + r1 + %1 * 8], %3 - movh [r0 + r1*2 + %1 * 8], m0 - movhps [r0 + r5 + %1 * 8], m0 - movh [r6 + %1 * 8], %5 - movhps [r6 + r1 + %1 * 8], %5 - movh [r6 + r1*2 + %1 * 8], %4 - movhps [r6 + r5 + %1 * 8], %4 - %else - ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32 - movh [r0 ], %3 - 
movhps [r0 + r1 ], %3 - movh [r0 + r1 * 2], %4 - movhps [r0 + r5 ], %4 - lea r0, [r0 + r1 * 4] - movh [r0 ], %5 - movhps [r0 + r1 ], %5 - movh [r0 + r1 * 2], %6 - movhps [r0 + r5 ], %6 - lea r0, [r0 + r1 * 4] - %endif -%endmacro + palignr m2, m0, 6 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] -;------------------------------------------------------------------------------------------ -; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) -;------------------------------------------------------------------------------------------ -INIT_XMM ssse3 -cglobal intra_pred_ang16_2, 3,5,3 - lea r4, [r2 + 2] - add r2, 34 - cmp r3m, byte 34 - cmove r2, r4 - movu m0, [r2] - movu m1, [r2 + 16] - movu [r0], m0 - palignr m2, m1, m0, 1 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 2 - movu [r0], m2 - palignr m2, m1, m0, 3 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 4 - movu [r0], m2 - palignr m2, m1, m0, 5 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 6 - movu [r0], m2 - palignr m2, m1, m0, 7 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 8 - movu [r0], m2 - palignr m2, m1, m0, 9 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 10 - movu [r0], m2 - palignr m2, m1, m0, 11 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 12 - movu [r0], m2 - palignr m2, m1, m0, 13 - movu [r0 + r1], m2 - lea r0, [r0 + r1 * 2] - palignr m2, m1, m0, 14 - movu [r0], m2 - palignr m2, m1, m0, 15 - movu [r0 + r1], m2 - RET + pmaddubsw m2, [r4] ; [8] + pmulhrsw m2, m7 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) -INIT_XMM sse4 -cglobal intra_pred_ang16_3, 3,7,8 - add r2, 32 - lea r3, [ang_table + 16 * 16] - mov r4d, 2 - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride +cglobal intra_pred_ang8_7, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 29 + cmove r2, r4 + lea 
r3, [ang_table + 24 * 16] + lea r4, [ang_table + 6 * 16] mova m7, [pw_1024] -.loop: - movu m0, [r2 + 1] - palignr m1, m0, 1 + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m1, m2, m0, 2 + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] + pmaddubsw m4, m0, [r4 + 3 * 16] ; [9] pmulhrsw m4, m7 - pmaddubsw m1, [r3 + 4 * 16] ; [20] - pmulhrsw m1, m7 - packuswb m4, m1 - - palignr m5, m2, m0, 4 + pmaddubsw m3, m0, [r3 - 6 * 16] ; [18] + pmulhrsw m3, m7 + packuswb m4, m3 - pmaddubsw m5, [r3 - 2 * 16] ; [14] + pmaddubsw m5, m0, [r3 + 3 * 16] ; [27] pmulhrsw m5, m7 - palignr m6, m2, m0, 6 + palignr m1, m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - pmaddubsw m6, [r3 - 8 * 16] ; [ 8] + pmaddubsw m6, m1, [r4 - 2 * 16] ; [4] pmulhrsw m6, m7 packuswb m5, m6 - palignr m1, m2, m0, 8 - - pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmaddubsw m6, m1, [r4 + 7 * 16] ; [13] pmulhrsw m6, m7 - pmaddubsw m1, [r3 + 12 * 16] ; [28] - pmulhrsw m1, m7 - packuswb m6, m1 - - palignr m1, m2, m0, 10 + mova m3, m1 + pmaddubsw m3, [r3 - 2 * 16] ; [22] + pmulhrsw m3, m7 + packuswb m6, m3 - pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmaddubsw m1, [r3 + 7 * 16] ; [31] pmulhrsw m1, m7 - palignr m2, m0, 12 + palignr m2, m0, 4 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] - pmaddubsw m2, [r3] ; [16] + pmaddubsw m2, [r4 + 2 * 16] ; [8] pmulhrsw m2, m7 packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 +cglobal intra_pred_ang8_8, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 28 + cmove r2, r4 + lea r3, [ang_table + 23 * 16] + lea r4, [ang_table + 8 * 16] + mova m7, [pw_1024] - movu m0, [r2 + 8] - palignr m1, m0, 1 + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr 
m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m5, m2, m0, 2 + punpckhbw m2, m0, m1 ; [x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m2, m0, 2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] - pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + pmaddubsw m4, m0, [r4 - 3 * 16] ; [5] pmulhrsw m4, m7 - pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m3, m0, [r4 + 2 * 16] ; [10] + pmulhrsw m3, m7 + packuswb m4, m3 - pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmaddubsw m5, m0, [r3 - 8 * 16] ; [15] pmulhrsw m5, m7 - palignr m6, m2, m0, 4 - - pmaddubsw m6, [r3 + 8 * 16] ; [24] + pmaddubsw m6, m0, [r3 - 3 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - palignr m1, m2, m0, 6 - - pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmaddubsw m6, m0, [r3 + 2 * 16] ; [25] pmulhrsw m6, m7 - palignr m1, m2, m0, 8 + pmaddubsw m0, [r3 + 7 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 - pmaddubsw m1, [r3 - 4 * 16] ; [12] + pmaddubsw m1, m2, [r4 - 5 * 16] ; [3] pmulhrsw m1, m7 - packuswb m6, m1 - palignr m1, m2, m0, 10 + pmaddubsw m2, [r4] ; [8] + pmulhrsw m2, m7 + packuswb m1, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - pmaddubsw m1, [r3 - 10 * 16] ; [06] - pmulhrsw m1, m7 - packuswb m1, m1 +cglobal intra_pred_ang8_9, 3,5,8 + lea r4, [r2 + 1] + add r2, 17 + cmp r3m, byte 27 + cmove r2, r4 + lea r3, [ang_table + 10 * 16] + mova m7, [pw_1024] - movhps m1, [r2 + 14] ; [00] + movu m0, [r2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m0, 1 ; [x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + punpcklbw m0, m1 ; [9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec r4 - jnz .loop - RET + pmaddubsw m4, m0, [r3 - 8 * 16] ; [2] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r3 - 6 * 16] ; [4] + pmulhrsw m3, m7 + packuswb m4, m3 -INIT_XMM 
sse4 -cglobal intra_pred_ang16_33, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 2 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] + pmaddubsw m5, m0, [r3 - 4 * 16] ; [6] + pmulhrsw m5, m7 -.loop: - movu m0, [r2 + 1] - palignr m1, m0, 1 + pmaddubsw m6, m0, [r3 - 2 * 16] ; [8] + pmulhrsw m6, m7 + packuswb m5, m6 - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m1, m2, m0, 2 + pmaddubsw m6, m0, [r3] ; [10] + pmulhrsw m6, m7 - pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] - pmulhrsw m4, m7 - pmaddubsw m1, [r3 + 4 * 16] ; [20] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m2, m0, [r3 + 2 * 16] ; [12] + pmulhrsw m2, m7 + packuswb m6, m2 - palignr m5, m2, m0, 4 + pmaddubsw m1, m0, [r3 + 4 * 16] ; [14] + pmulhrsw m1, m7 - pmaddubsw m5, [r3 - 2 * 16] ; [14] - pmulhrsw m5, m7 + pmaddubsw m0, [r3 + 6 * 16] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - palignr m6, m2, m0, 6 +cglobal intra_pred_ang8_10, 3,6,5 + movh m0, [r2 + 17] + mova m4, [pb_unpackbq] + palignr m1, m0, 2 + pshufb m1, m4 + palignr m2, m0, 4 + pshufb m2, m4 + palignr m3, m0, 6 + pshufb m3, m4 + pshufb m0, m4 - pmaddubsw m6, [r3 - 8 * 16] ; [ 8] - pmulhrsw m6, m7 - packuswb m5, m6 + lea r5, [r1 * 3] + movhps [r0 + r1], m0 + movh [r0 + r1 * 2], m1 + movhps [r0 + r5], m1 + lea r3, [r0 + r1 * 4] + movh [r3], m2 + movhps [r3 + r1], m2 + movh [r3 + r1 * 2], m3 + movhps [r3 + r5], m3 - palignr m1, m2, m0, 8 +; filter + cmp r4m, byte 0 + jz .quit - pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] - pmulhrsw m6, m7 + pmovzxbw m0, m0 + movu m1, [r2] + palignr m2, m1, 1 + pshufb m1, m4 + pmovzxbw m1, m1 + pmovzxbw m2, m2 + psubw m2, m1 + psraw m2, 1 + paddw m0, m2 + packuswb m0, m0 - pmaddubsw m1, [r3 + 12 * 16] ; [28] - pmulhrsw m1, m7 - packuswb m6, m1 +.quit: + movh [r0], m0 + RET - palignr m1, m2, m0, 10 +cglobal intra_pred_ang8_26, 3,6,3 + movu m2, [r2] + palignr m0, m2, 1 + lea r5, [r1 * 3] + movh [r0], m0 + movh [r0 + r1], m0 + movh 
[r0 + r1 * 2], m0 + movh [r0 + r5], m0 + lea r3, [r0 + r1 * 4] + movh [r3], m0 + movh [r3 + r1], m0 + movh [r3 + r1 * 2], m0 + movh [r3 + r5], m0 - pmaddubsw m1, [r3 + 6 * 16] ; [22] - pmulhrsw m1, m7 +; filter + cmp r4m, byte 0 + jz .quit - palignr m2, m0, 12 + pshufb m2, [pb_unpackbq] + movhlps m1, m2 + pmovzxbw m2, m2 + movu m0, [r2 + 17] + pmovzxbw m1, m1 + pmovzxbw m0, m0 + psubw m0, m2 + psraw m0, 1 + paddw m1, m0 + packuswb m1, m1 + pextrb [r0], m1, 0 + pextrb [r0 + r1], m1, 1 + pextrb [r0 + r1 * 2], m1, 2 + pextrb [r0 + r5], m1, 3 + pextrb [r3], m1, 4 + pextrb [r3 + r1], m1, 5 + pextrb [r3 + r1 * 2], m1, 6 + pextrb [r3 + r5], m1, 7 +.quit: + RET - pmaddubsw m2, [r3] ; [16] - pmulhrsw m2, m7 - packuswb m1, m2 +cglobal intra_pred_ang8_11, 3,5,8 + xor r4, r4 + cmp r3m, byte 25 + mov r3, 16 + cmove r3, r4 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + movu m0, [r2 + r3] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m0, [r2], 0 + palignr m1, m0, 1 ; [x 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - movu m0, [r2 + 8] - palignr m1, m0, 1 + punpcklbw m0, m1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m5, m2, m0, 2 + lea r3, [ang_table + 23 * 16] + mova m7, [pw_1024] - pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + pmaddubsw m4, m0, [r3 + 7 * 16] ; [30] pmulhrsw m4, m7 - pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m3, m0, [r3 + 5 * 16] ; [28] + pmulhrsw m3, m7 + packuswb m4, m3 - pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmaddubsw m5, m0, [r3 + 3 * 16] ; [26] pmulhrsw m5, m7 - palignr m6, m2, m0, 4 - - pmaddubsw m6, [r3 + 8 * 16] ; [24] + pmaddubsw m6, m0, [r3 + 1 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 - palignr m1, m2, m0, 6 - - pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmaddubsw m6, m0, [r3 - 1 * 16] ; [22] pmulhrsw m6, m7 - palignr m1, m2, m0, 8 - - pmaddubsw m1, [r3 - 4 * 16] ; [12] - pmulhrsw m1, m7 - packuswb m6, m1 - - palignr m1, m2, m0, 10 + pmaddubsw m2, m0, [r3 - 3 * 16] ; [20] 
+ pmulhrsw m2, m7 + packuswb m6, m2 - pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmaddubsw m1, m0, [r3 - 5 * 16] ; [18] pmulhrsw m1, m7 - packuswb m1, m1 - movh m2, [r2 + 14] ; [00] + pmaddubsw m0, [r3 - 7 * 16] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - movh [r0 ], m4 - movhps [r0 + r1 ], m4 - movh [r0 + r1 * 2], m5 - movhps [r0 + r5 ], m5 - lea r0, [r0 + r1 * 4] - movh [r0 ], m6 - movhps [r0 + r1 ], m6 - movh [r0 + r1 * 2], m1 - movh [r0 + r5 ], m2 +cglobal intra_pred_ang8_12, 3,5,8 + xor r4, r4 + cmp r3m, byte 24 + mov r3, 16 + jz .next + xchg r3, r4 +.next: - lea r0, [r6 + 8] - add r2, 8 - dec r4 - jnz .loop - RET + movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m1, [r2], 0 + pslldq m0, m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] + pinsrb m0, [r2 + r3 + 6], 0 -INIT_XMM sse4 -cglobal intra_pred_ang16_4, 3,7,8 - add r2, 32 - lea r3, [ang_table + 16 * 16] - mov r4d, 2 - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + lea r4, [ang_table + 22 * 16] mova m7, [pw_1024] -.loop: - movu m0, [r2 + 1] - palignr m1, m0, 1 - - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m1, m2, m0, 2 - mova m5, m1 + punpckhbw m2, m0, m1 ; [15 14 14 13 13 12 12 11 11 10 10 9 9 8 8 7] + punpcklbw m0, m1 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m2, m0, 2 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] + pmaddubsw m4, m2, [r4 + 5 * 16] ; [27] pmulhrsw m4, m7 - pmaddubsw m1, [r3 - 6 * 16] ; [10] + pmaddubsw m3, m2, [r4] ; [22] + pmulhrsw m3, m7 + packuswb m4, m3 + + pmaddubsw m1, m0, [r4 + 7 * 16] ; [29] pmulhrsw m1, m7 - packuswb m4, m1 - pmaddubsw m5, [r3 + 15 * 16] ; [31] - pmulhrsw m5, m7 + pmaddubsw m0, [r4 + 2 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 - palignr m6, m2, m0, 4 + pmaddubsw m5, m2, [r4 - 5 * 16] ; [17] + pmulhrsw m5, m7 - pmaddubsw m6, [r3 + 4 * 16] ; [ 20] + lea r4, [ang_table + 7 * 16] + 
pmaddubsw m6, m2, [r4 + 5 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - palignr m1, m2, m0, 6 - - pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] + pmaddubsw m6, m2, [r4] ; [7] pmulhrsw m6, m7 - pmaddubsw m1, [r3 + 14 * 16] ; [30] - pmulhrsw m1, m7 - packuswb m6, m1 - - palignr m1, m2, m0, 8 - - pmaddubsw m1, [r3 + 3 * 16] ; [19] - pmulhrsw m1, m7 - - palignr m2, m0, 10 + pmaddubsw m2, [r4 - 5 * 16] ; [2] + pmulhrsw m2, m7 + packuswb m6, m2 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] - pmulhrsw m3, m7 - packuswb m1, m3 +cglobal intra_pred_ang8_13, 4,5,8 + xor r4, r4 + cmp r3m, byte 23 + mov r3, 16 + jz .next + xchg r3, r4 +.next: - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m1, [r2], 0 + pslldq m1, 1 ; [14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 a] + pinsrb m1, [r2 + r3 + 4], 0 + pslldq m0, m1, 1 ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] + pinsrb m0, [r2 + r3 + 7], 0 + punpckhbw m5, m0, m1 ; [14 13 13 12 12 11 11 10 10 9 9 8 8 7 7 6] + punpcklbw m0, m1 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m1, m5, m0, 2 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m5, m0, 4 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] - pmulhrsw m4, m7 + lea r4, [ang_table + 24 * 16] + mova m7, [pw_1024] - movu m0, [r2 + 6] - palignr m1, m0, 1 + pmaddubsw m4, m5, [r4 - 1 * 16] ; [23] + pmulhrsw m4, m7 - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m1, m2, m0, 2 + pmaddubsw m6, m1, [r4 + 4 * 16] ; [28] + pmulhrsw m6, m7 - pmaddubsw m1, [r3 + 2 * 16] ; [18] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m0, [r4] ; [24] + pmulhrsw m0, m7 - palignr m5, m2, m0, 4 - mova m6, m5 + lea r4, [ang_table + 13 * 16] + pmaddubsw m3, m5, [r4 + 1 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m4, m3 - pmaddubsw m5, [r3 - 9 * 16] ; [07] + pmaddubsw m5, [r4 - 8 * 16] ; [5] pmulhrsw m5, m7 - - pmaddubsw m6, [r3 + 12 * 16] ; [28] - pmulhrsw m6, m7 
packuswb m5, m6 - palignr m6, m2, m0, 6 - - pmaddubsw m6, [r3 + 16] ; [17] + pmaddubsw m6, m1, [r4 + 6 * 16] ; [19] pmulhrsw m6, m7 - palignr m1, m2, m0, 8 - palignr m2, m0, 10 - - pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] - pmulhrsw m3, m7 - packuswb m6, m3 - - pmaddubsw m1, [r3 + 11 * 16] ; [27] - pmulhrsw m1, m7 - - pmaddubsw m2, [r3] ; [16] + pmaddubsw m2, m1, [r4 - 3 * 16] ; [10] pmulhrsw m2, m7 - packuswb m1, m2 - - TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 - - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec r4 - jnz .loop - RET + packuswb m6, m2 -INIT_XMM sse4 -cglobal intra_pred_ang16_32, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 2 - lea r5, [r1 * 3] ; r5 -> 3 * stride - mov r6, r0 - mova m7, [pw_1024] + pmaddubsw m1, [r4 - 12 * 16] ; [1] + pmulhrsw m1, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) -.loop: - movu m0, [r2 + 1] - palignr m1, m0, 1 +cglobal intra_pred_ang8_14, 4,5,8 + xor r4, r4 + cmp r3m, byte 22 + mov r3, 16 + jz .next + xchg r3, r4 +.next: - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m1, m2, m0, 2 - mova m5, m1 + movu m1, [r2 + r4 - 2] ; [13 12 11 10 9 8 7 6 5 4 3 2 1 0 a b] + pinsrb m1, [r2], 2 + pinsrb m1, [r2 + r3 + 2], 1 + pinsrb m1, [r2 + r3 + 5], 0 + pslldq m0, m1, 1 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] + pinsrb m0, [r2 + r3 + 7], 0 + punpckhbw m2, m0, m1 ; [13 12 12 11 11 10 10 9 9 8 8 7 7 6 6 5] + punpcklbw m0, m1 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + palignr m1, m2, m0, 2 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m6, m2, m0, 4 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m2, m0, 6 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + lea r4, [ang_table + 24 * 16] + mova m3, [pw_1024] - pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] - pmulhrsw m4, m7 - pmaddubsw m1, [r3 - 6 * 16] ; [10] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m4, m2, [r4 - 5 * 16] ; [19] + pmulhrsw m4, m3 - pmaddubsw m5, [r3 + 15 * 16] ; [31] - pmulhrsw m5, m7 + pmaddubsw m0, [r4] ; [24] + 
pmulhrsw m0, m3 - palignr m6, m2, m0, 4 + pmaddubsw m5, m6, [r4 + 1 * 16] ; [25] + pmulhrsw m5, m3 - pmaddubsw m6, [r3 + 4 * 16] ; [ 20] - pmulhrsw m6, m7 + lea r4, [ang_table + 12 * 16] + pmaddubsw m6, [r4] ; [12] + pmulhrsw m6, m3 packuswb m5, m6 - palignr m1, m2, m0, 6 + pmaddubsw m6, m1, [r4 + 19 * 16] ; [31] + pmulhrsw m6, m3 - pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] - pmulhrsw m6, m7 + pmaddubsw m2, [r4 - 6 * 16] ; [6] + pmulhrsw m2, m3 + packuswb m4, m2 - pmaddubsw m1, [r3 + 14 * 16] ; [30] - pmulhrsw m1, m7 - packuswb m6, m1 + pmaddubsw m2, m1, [r4 + 6 * 16] ; [18] + pmulhrsw m2, m3 + packuswb m6, m2 - palignr m1, m2, m0, 8 + pmaddubsw m1, [r4 - 7 * 16] ; [5] + pmulhrsw m1, m3 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - pmaddubsw m1, [r3 + 3 * 16] ; [19] - pmulhrsw m1, m7 +cglobal intra_pred_ang8_15, 4,5,8 + xor r4, r4 + cmp r3m, byte 21 + mov r3, 16 + jz .next + xchg r3, r4 +.next: - palignr m2, m0, 10 + movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m1, [r2], 0 + movu m2, [r2 + r3] + pshufb m2, [c_mode16_15] + palignr m1, m2, 13 ; [12 11 10 9 8 7 6 5 4 3 2 1 0 a b c] + pslldq m0, m1, 1 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] + pinsrb m0, [r2 + r3 + 8], 0 + punpckhbw m4, m0, m1 ; [12 11 11 10 10 9 9 8 8 7 7 6 6 5 5 4] + punpcklbw m0, m1 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] + palignr m1, m4, m0, 2 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + palignr m6, m4, m0, 4 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m5, m4, m0, 6 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m4, m0, 8 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] - pmulhrsw m3, m7 - packuswb m1, m3 + lea r4, [ang_table + 23 * 16] + mova m3, [pw_1024] - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + pmaddubsw m4, [r4 - 8 * 16] ; [15] + pmulhrsw m4, m3 - pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] - pmulhrsw m4, m7 + pmaddubsw m2, m5, [r4 + 7 * 16] ; [30] + pmulhrsw m2, m3 + packuswb m4, m2 - movu m0, [r2 + 6] 
- palignr m1, m0, 1 + pmaddubsw m5, [r4 - 10 * 16] ; [13] + pmulhrsw m5, m3 - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - palignr m1, m2, m0, 2 + pmaddubsw m2, m6, [r4 + 5 * 16] ; [28] + pmulhrsw m2, m3 + packuswb m5, m2 - pmaddubsw m1, [r3 + 2 * 16] ; [18] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m2, m1, [r4 + 3 * 16] ; [26] + pmulhrsw m2, m3 - palignr m5, m2, m0, 4 - mova m6, m5 + pmaddubsw m0, [r4 + 1 * 16] ; [24] + pmulhrsw m0, m3 - pmaddubsw m5, [r3 - 9 * 16] ; [07] - pmulhrsw m5, m7 + lea r4, [ang_table + 11 * 16] + pmaddubsw m6, [r4] ; [11] + pmulhrsw m6, m3 + packuswb m6, m2 - pmaddubsw m6, [r3 + 12 * 16] ; [28] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m1, [r4 - 2 * 16] ; [9] + pmulhrsw m1, m3 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) - palignr m6, m2, m0, 6 +cglobal intra_pred_ang8_16, 4,5,8 + xor r4, r4 + cmp r3m, byte 20 + mov r3, 16 + jz .next + xchg r3, r4 +.next: - pmaddubsw m6, [r3 + 16] ; [17] - pmulhrsw m6, m7 + movu m1, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m1, [r2], 0 + movu m2, [r2 + r3] + pshufb m2, [c_mode16_16] + palignr m1, m2, 12 ; [11 10 9 8 7 6 5 4 3 2 1 0 a b c d] + pslldq m0, m1, 1 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] + pinsrb m0, [r2 + r3 + 8], 0 + punpckhbw m4, m0, m1 ; [11 10 10 9 9 8 8 7 7 6 6 5 5 4 4 3] + punpcklbw m0, m1 ; [3 2 2 1 1 0 0 a a b b c c d d e] + palignr m1, m4, m0, 2 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] + palignr m6, m4, m0, 4 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + palignr m2, m4, m0, 6 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m5, m4, m0, 8 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m4, m0, 10 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - palignr m1, m2, m0, 8 - palignr m2, m0, 10 + lea r4, [ang_table + 22 * 16] + mova m7, [pw_1024] - pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] + pmaddubsw m3, m5, [r4] ; [22] pmulhrsw m3, m7 - packuswb m6, m3 - pmaddubsw m1, [r3 + 11 * 16] ; [27] - pmulhrsw m1, m7 + pmaddubsw m0, [r4 + 2 * 16] ; 
[24] + pmulhrsw m0, m7 - pmaddubsw m2, [r3] ; [16] + lea r4, [ang_table + 9 * 16] + + pmaddubsw m4, [r4 + 2 * 16] ; [11] + pmulhrsw m4, m7 + packuswb m4, m3 + + pmaddubsw m2, [r4 + 3 * 16] ; [12] pmulhrsw m2, m7 - packuswb m1, m2 - TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + pmaddubsw m5, [r4 - 8 * 16] ; [1] + pmulhrsw m5, m7 + packuswb m5, m2 - lea r0, [r6 + 8] - add r2, 8 - dec r4 - jnz .loop + mova m2, m6 + pmaddubsw m6, [r4 + 14 * 16] ; [23] + pmulhrsw m6, m7 + + pmaddubsw m2, [r4 - 7 * 16] ; [2] + pmulhrsw m2, m7 + packuswb m6, m2 + + pmaddubsw m1, [r4 + 4 * 16] ; [13] + pmulhrsw m1, m7 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_17, 4,5,8 + xor r4, r4 + cmp r3m, byte 19 + mov r3, 16 + jz .next + xchg r3, r4 +.next: + + movu m2, [r2 + r4] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m2, [r2], 0 + movu m1, [r2 + r3] + pshufb m1, [c_mode16_17] + palignr m2, m1, 11 ; [10 9 8 7 6 5 4 3 2 1 0 a b c d e] + pslldq m0, m2, 1 ; [9 8 7 6 5 4 3 2 1 0 a b c d e f] + pinsrb m0, [r2 + r3 + 7], 0 + punpckhbw m1, m0, m2 ; [10 9 9 8 8 7 7 6 6 5 5 4 4 3 3 2] + punpcklbw m0, m2 ; [2 1 1 0 0 a a b b c c d d e e f] + + palignr m5, m1, m0, 8 ; [6 5 5 4 4 3 3 2 2 1 1 0 0 a a b] + palignr m2, m1, m0, 10 ; [7 6 6 5 5 4 4 3 3 2 2 1 1 0 0 a] + palignr m4, m1, m0, 12 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + lea r4, [ang_table + 17 * 16] + mova m3, [pw_1024] + + pmaddubsw m2, [r4 - 5 * 16] ; [12] + pmulhrsw m2, m3 + + pmaddubsw m4, [r4 - 11 * 16] ; [6] + pmulhrsw m4, m3 + packuswb m4, m2 + + pmaddubsw m5, [r4 + 1 * 16] ; [18] + pmulhrsw m5, m3 + + palignr m2, m1, m0, 6 ; [5 4 4 3 3 2 2 1 1 0 0 a a b b c] + pmaddubsw m2, [r4 + 7 * 16] ; [24] + pmulhrsw m2, m3 + packuswb m5, m2 + + palignr m6, m1, m0, 4 ; [4 3 3 2 2 1 1 0 0 a a b b c c d] + mova m2, m6 + pmaddubsw m6, [r4 + 13 * 16] ; [30] + pmulhrsw m6, m3 + + pmaddubsw m2, [r4 - 13 * 16] ; [4] + pmulhrsw m2, m3 + packuswb m6, m2 + + palignr m1, m0, 2 ; 
[3 2 2 1 1 0 0 a a b b c c d d e] + pmaddubsw m1, [r4 - 7 * 16] ; [10] + pmulhrsw m1, m3 + + pmaddubsw m0, [r4 - 1 * 16] ; [16] + pmulhrsw m0, m3 + packuswb m1, m0 + jmp mangle(private_prefix %+ _ %+ intra_pred_ang8_3 %+ SUFFIX %+ .transpose8x8) + +cglobal intra_pred_ang8_18, 4,4,1 + movu m0, [r2 + 16] + pinsrb m0, [r2], 0 + pshufb m0, [pb_swap8] + movhps m0, [r2 + 1] + lea r2, [r0 + r1 * 4] + lea r3, [r1 * 3] + movh [r2 + r3], m0 + psrldq m0, 1 + movh [r2 + r1 * 2], m0 + psrldq m0, 1 + movh [r2 + r1], m0 + psrldq m0, 1 + movh [r2], m0 + psrldq m0, 1 + movh [r0 + r3], m0 + psrldq m0, 1 + movh [r0 + r1 * 2], m0 + psrldq m0, 1 + movh [r0 + r1], m0 + psrldq m0, 1 + movh [r0], m0 + RET + +%macro TRANSPOSE_STORE_8x8 6 + %if %2 == 1 + ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32 + punpckhbw m0, %3, %4 + punpcklbw %3, %4 + punpckhbw %4, %3, m0 + punpcklbw %3, m0 + + punpckhbw m0, %5, m1 + punpcklbw %5, %6 + punpckhbw %6, %5, m0 + punpcklbw %5, m0 + + punpckhdq m0, %3, %5 + punpckldq %3, %5 + punpckldq %5, %4, %6 + punpckhdq %4, %6 + + movh [r0 + + %1 * 8], %3 + movhps [r0 + r1 + %1 * 8], %3 + movh [r0 + r1*2 + %1 * 8], m0 + movhps [r0 + r5 + %1 * 8], m0 + movh [r6 + %1 * 8], %5 + movhps [r6 + r1 + %1 * 8], %5 + movh [r6 + r1*2 + %1 * 8], %4 + movhps [r6 + r5 + %1 * 8], %4 + %else + ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32 + movh [r0 ], %3 + movhps [r0 + r1 ], %3 + movh [r0 + r1 * 2], %4 + movhps [r0 + r5 ], %4 + lea r0, [r0 + r1 * 4] + movh [r0 ], %5 + movhps [r0 + r1 ], %5 + movh [r0 + r1 * 2], %6 + movhps [r0 + r5 ], %6 + lea r0, [r0 + r1 * 4] + %endif +%endmacro + +;------------------------------------------------------------------------------------------ +; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;------------------------------------------------------------------------------------------ +INIT_XMM ssse3 +cglobal intra_pred_ang16_2, 3,5,3 + lea r4, [r2 + 2] + add r2, 34 + cmp 
r3m, byte 34 + cmove r2, r4 + movu m0, [r2] + movu m1, [r2 + 16] + movu [r0], m0 + palignr m2, m1, m0, 1 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 2 + movu [r0], m2 + palignr m2, m1, m0, 3 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 4 + movu [r0], m2 + palignr m2, m1, m0, 5 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 6 + movu [r0], m2 + palignr m2, m1, m0, 7 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 8 + movu [r0], m2 + palignr m2, m1, m0, 9 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 10 + movu [r0], m2 + palignr m2, m1, m0, 11 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 12 + movu [r0], m2 + palignr m2, m1, m0, 13 + movu [r0 + r1], m2 + lea r0, [r0 + r1 * 2] + palignr m2, m1, m0, 14 + movu [r0], m2 + palignr m2, m1, m0, 15 + movu [r0 + r1], m2 RET INIT_XMM sse4 -cglobal intra_pred_ang16_5, 3,7,8 +cglobal intra_pred_ang16_3, 3,7,8 add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 @@ -2572,75 +3535,92 @@ mova m7, [pw_1024] .loop: - movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m0, [r2 + 1] + palignr m1, m0, 1 - palignr m5, m2, m3, 2 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 - pmaddubsw m4, m3, [r3 + 16] ; [17] + pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] pmulhrsw m4, m7 - pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmaddubsw m1, [r3 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m4, m1 - palignr m6, m2, m3, 4 + palignr m5, m2, m0, 4 - pmaddubsw m5, [r3 + 3 * 16] ; [19] + pmaddubsw m5, [r3 - 2 * 16] ; [14] pmulhrsw m5, m7 - pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] - pmulhrsw m1, m7 - packuswb m5, m1 - palignr m1, m2, m3, 6 + palignr m6, m2, m0, 6 - pmaddubsw m6, [r3 + 5 * 16] ; [21] + pmaddubsw m6, [r3 - 8 * 
16] ; [ 8] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] - pmulhrsw m0, m7 - packuswb m6, m0 + packuswb m5, m6 - palignr m0, m2, m3, 8 + palignr m1, m2, m0, 8 - pmaddubsw m1, [r3 + 7 * 16] ; [23] - pmulhrsw m1, m7 - pmaddubsw m0, [r3 - 8 * 16] ; [8] - pmulhrsw m0, m7 - packuswb m1, m0 + pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmulhrsw m6, m7 - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + pmaddubsw m1, [r3 + 12 * 16] ; [28] + pmulhrsw m1, m7 + packuswb m6, m1 - palignr m4, m2, m3, 8 - palignr m5, m2, m3, 10 + palignr m1, m2, m0, 10 - pmaddubsw m4, [r3 + 9 * 16] ; [25] - pmulhrsw m4, m7 - pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] + pmaddubsw m1, [r3 + 6 * 16] ; [22] pmulhrsw m1, m7 - packuswb m4, m1 - palignr m6, m2, m3, 12 + palignr m2, m0, 12 - pmaddubsw m5, [r3 + 11 * 16] ; [27] - pmulhrsw m5, m7 - pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 + + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + movu m0, [r2 + 8] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + + pmaddubsw m4, m0, [r3 - 6 * 16] ; [10] + pmulhrsw m4, m7 + pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] pmulhrsw m1, m7 - packuswb m5, m1 + packuswb m4, m1 - palignr m1, m2, m3, 14 + pmaddubsw m5, [r3 + 14 * 16] ; [30] + pmulhrsw m5, m7 - pmaddubsw m6, [r3 + 13 * 16] ; [29] + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] - pmulhrsw m0, m7 - packuswb m6, m0 + packuswb m5, m6 - pmaddubsw m1, [r3 + 15 * 16] ; [31] + palignr m1, m2, m0, 6 + + pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmulhrsw m6, m7 + + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 - 4 * 16] ; [12] pmulhrsw m1, m7 - pmaddubsw m2, [r3] ; [16] - pmulhrsw m2, m7 - packuswb m1, m2 + packuswb m6, m1 + + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m1, m1 + + movhps m1, [r2 + 14] ; [00] TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 @@ -2652,85 
+3632,110 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_31, 3,7,8 +cglobal intra_pred_ang16_33, 3,7,8 lea r3, [ang_table + 16 * 16] mov r4d, 2 - lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r5, [r1 * 3] mov r6, r0 mova m7, [pw_1024] .loop: - movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m0, [r2 + 1] + palignr m1, m0, 1 - palignr m5, m2, m3, 2 + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 - pmaddubsw m4, m3, [r3 + 16] ; [17] + pmaddubsw m4, m0, [r3 + 10 * 16] ; [26] pmulhrsw m4, m7 - pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmaddubsw m1, [r3 + 4 * 16] ; [20] pmulhrsw m1, m7 packuswb m4, m1 - palignr m6, m2, m3, 4 + palignr m5, m2, m0, 4 - pmaddubsw m5, [r3 + 3 * 16] ; [19] + pmaddubsw m5, [r3 - 2 * 16] ; [14] pmulhrsw m5, m7 - pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] - pmulhrsw m1, m7 - packuswb m5, m1 - palignr m1, m2, m3, 6 + palignr m6, m2, m0, 6 - pmaddubsw m6, [r3 + 5 * 16] ; [21] + pmaddubsw m6, [r3 - 8 * 16] ; [ 8] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] - pmulhrsw m0, m7 - packuswb m6, m0 + packuswb m5, m6 - palignr m0, m2, m3, 8 + palignr m1, m2, m0, 8 - pmaddubsw m1, [r3 + 7 * 16] ; [23] + pmaddubsw m6, m1, [r3 - 14 * 16] ; [ 2] + pmulhrsw m6, m7 + + pmaddubsw m1, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 - pmaddubsw m0, [r3 - 8 * 16] ; [8] - pmulhrsw m0, m7 - packuswb m1, m0 + packuswb m6, m1 + + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 + 6 * 16] ; [22] + pmulhrsw m1, m7 + + palignr m2, m0, 12 + + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - palignr m4, m2, m3, 8 - palignr m5, m2, m3, 10 + movu m0, [r2 + 8] + palignr m1, m0, 1 - pmaddubsw m4, [r3 + 9 * 16] ; [25] + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m5, m2, m0, 2 + + pmaddubsw m4, m0, [r3 - 6 
* 16] ; [10] pmulhrsw m4, m7 - pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] + pmaddubsw m1, m5, [r3 - 12 * 16] ; [04] pmulhrsw m1, m7 packuswb m4, m1 - palignr m6, m2, m3, 12 - - pmaddubsw m5, [r3 + 11 * 16] ; [27] + pmaddubsw m5, [r3 + 14 * 16] ; [30] pmulhrsw m5, m7 - pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] - pmulhrsw m1, m7 - packuswb m5, m1 - palignr m1, m2, m3, 14 + palignr m6, m2, m0, 4 - pmaddubsw m6, [r3 + 13 * 16] ; [29] + pmaddubsw m6, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] - pmulhrsw m0, m7 - packuswb m6, m0 + packuswb m5, m6 - pmaddubsw m1, [r3 + 15 * 16] ; [31] + palignr m1, m2, m0, 6 + + pmaddubsw m6, m1, [r3 + 2 * 16] ; [18] + pmulhrsw m6, m7 + + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 - 4 * 16] ; [12] pmulhrsw m1, m7 - pmaddubsw m2, [r3] ; [16] - pmulhrsw m2, m7 - packuswb m1, m2 + packuswb m6, m1 - TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + palignr m1, m2, m0, 10 + + pmaddubsw m1, [r3 - 10 * 16] ; [06] + pmulhrsw m1, m7 + packuswb m1, m1 + + movh m2, [r2 + 14] ; [00] + + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r5 ], m5 + lea r0, [r0 + r1 * 4] + movh [r0 ], m6 + movhps [r0 + r1 ], m6 + movh [r0 + r1 * 2], m1 + movh [r0 + r5 ], m2 lea r0, [r6 + 8] add r2, 8 @@ -2739,7 +3744,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_6, 3,7,8 +cglobal intra_pred_ang16_4, 3,7,8 add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 @@ -2748,70 +3753,90 @@ mova m7, [pw_1024] .loop: - movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m0, [r2 + 1] + palignr m1, m0, 1 - pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + + pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] pmulhrsw m4, m7 - pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] + 
pmaddubsw m1, [r3 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m4, m1 - palignr m6, m2, m3, 2 - - pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] + pmaddubsw m5, [r3 + 15 * 16] ; [31] pmulhrsw m5, m7 - pmaddubsw m6, [r3 + 4 * 16] ; [20] + + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 4 * 16] ; [ 20] pmulhrsw m6, m7 packuswb m5, m6 - palignr m1, m2, m3, 4 + palignr m1, m2, m0, 6 - pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] + pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] - pmulhrsw m0, m7 - packuswb m6, m0 - palignr m0, m2, m3, 6 + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 - pmaddubsw m1, [r3 + 11 * 16] ; [27] + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 + 3 * 16] ; [19] pmulhrsw m1, m7 - pmaddubsw m0, [r3 - 8 * 16] ; [8] - pmulhrsw m0, m7 - packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + palignr m2, m0, 10 - palignr m4, m2, m3, 6 - palignr m6, m2, m3, 8 + pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 - pmaddubsw m4, [r3 + 5 * 16] ; [21] + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] pmulhrsw m4, m7 - pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + + movu m0, [r2 + 6] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + + pmaddubsw m1, [r3 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m4, m1 - pmaddubsw m5, m6, [r3 - 16] ; [15] + palignr m5, m2, m0, 4 + mova m6, m5 + + pmaddubsw m5, [r3 - 9 * 16] ; [07] pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - palignr m0, m2, m3, 10 + palignr m6, m2, m0, 6 - pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] + pmaddubsw m6, [r3 + 16] ; [17] pmulhrsw m6, m7 - pmaddubsw m0, [r3 + 6 * 16] ; [22] - pmulhrsw m0, m7 - packuswb m6, m0 - palignr m2, m3, 12 + palignr m1, m2, m0, 8 + palignr m2, m0, 10 - pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m6, m3 + + pmaddubsw m1, [r3 
+ 11 * 16] ; [27] pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] pmulhrsw m2, m7 packuswb m1, m2 @@ -2826,7 +3851,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_30, 3,7,8 +cglobal intra_pred_ang16_32, 3,7,8 lea r3, [ang_table + 16 * 16] mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride @@ -2834,70 +3859,91 @@ mova m7, [pw_1024] .loop: - movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m0, [r2 + 1] + palignr m1, m0, 1 - pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + mova m5, m1 + + + pmaddubsw m4, m0, [r3 + 5 * 16] ; [21] pmulhrsw m4, m7 - pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] + pmaddubsw m1, [r3 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m4, m1 - palignr m6, m2, m3, 2 - - pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] + pmaddubsw m5, [r3 + 15 * 16] ; [31] pmulhrsw m5, m7 - pmaddubsw m6, [r3 + 4 * 16] ; [20] + + palignr m6, m2, m0, 4 + + pmaddubsw m6, [r3 + 4 * 16] ; [ 20] pmulhrsw m6, m7 packuswb m5, m6 - palignr m1, m2, m3, 4 + palignr m1, m2, m0, 6 - pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] + pmaddubsw m6, m1, [r3 - 7 * 16] ; [ 9] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] - pmulhrsw m0, m7 - packuswb m6, m0 - palignr m0, m2, m3, 6 + pmaddubsw m1, [r3 + 14 * 16] ; [30] + pmulhrsw m1, m7 + packuswb m6, m1 - pmaddubsw m1, [r3 + 11 * 16] ; [27] + palignr m1, m2, m0, 8 + + pmaddubsw m1, [r3 + 3 * 16] ; [19] pmulhrsw m1, m7 - pmaddubsw m0, [r3 - 8 * 16] ; [8] - pmulhrsw m0, m7 - packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + palignr m2, m0, 10 - palignr m4, m2, m3, 6 - palignr m6, m2, m3, 8 + pmaddubsw m3, m2, [r3 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 - pmaddubsw m4, [r3 + 5 * 16] ; [21] + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m2, [r3 + 13 * 16] ; [29] 
pmulhrsw m4, m7 - pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + + movu m0, [r2 + 6] + palignr m1, m0, 1 + + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + palignr m1, m2, m0, 2 + + pmaddubsw m1, [r3 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m4, m1 - pmaddubsw m5, m6, [r3 - 16] ; [15] + palignr m5, m2, m0, 4 + mova m6, m5 + + pmaddubsw m5, [r3 - 9 * 16] ; [07] pmulhrsw m5, m7 + pmaddubsw m6, [r3 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - palignr m0, m2, m3, 10 + palignr m6, m2, m0, 6 - pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] + pmaddubsw m6, [r3 + 16] ; [17] pmulhrsw m6, m7 - pmaddubsw m0, [r3 + 6 * 16] ; [22] - pmulhrsw m0, m7 - packuswb m6, m0 - palignr m2, m3, 12 + palignr m1, m2, m0, 8 + palignr m2, m0, 10 - pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmaddubsw m3, m1, [r3 - 10 * 16] ; [06] + pmulhrsw m3, m7 + packuswb m6, m3 + + pmaddubsw m1, [r3 + 11 * 16] ; [27] pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] pmulhrsw m2, m7 packuswb m1, m2 @@ -2911,43 +3957,47 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_7, 3,7,8 +cglobal intra_pred_ang16_5, 3,7,8 add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] + palignr m5, m2, m3, 2 + + pmaddubsw m4, m3, [r3 + 16] ; [17] pmulhrsw m4, m7 - pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] - pmulhrsw m0, m7 - packuswb m4, m0 + pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 - palignr m1, m2, m3, 2 + palignr m6, m2, 
m3, 4 - pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] + pmaddubsw m5, [r3 + 3 * 16] ; [19] pmulhrsw m5, m7 - pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] + pmulhrsw m1, m7 + packuswb m5, m1 - pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] + palignr m1, m2, m3, 6 + + pmaddubsw m6, [r3 + 5 * 16] ; [21] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] + pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] pmulhrsw m0, m7 packuswb m6, m0 - palignr m0, m2, m3, 4 + palignr m0, m2, m3, 8 - pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmaddubsw m1, [r3 + 7 * 16] ; [23] pmulhrsw m1, m7 pmaddubsw m0, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 @@ -2955,31 +4005,32 @@ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - palignr m1, m2, m3, 4 + palignr m4, m2, m3, 8 + palignr m5, m2, m3, 10 - pmaddubsw m4, m1, [r3 + 16] ; [17] + pmaddubsw m4, [r3 + 9 * 16] ; [25] pmulhrsw m4, m7 - pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m4, m1 - palignr m0, m2, m3, 6 + palignr m6, m2, m3, 12 - pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] + pmaddubsw m5, [r3 + 11 * 16] ; [27] pmulhrsw m5, m7 - pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m5, m1 - pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] + palignr m1, m2, m3, 14 + + pmaddubsw m6, [r3 + 13 * 16] ; [29] pmulhrsw m6, m7 - pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] pmulhrsw m0, m7 packuswb m6, m0 - palignr m2, m3, 8 - - pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] + pmaddubsw m1, [r3 + 15 * 16] ; [31] pmulhrsw m1, m7 pmaddubsw m2, [r3] ; [16] pmulhrsw m2, m7 @@ -2995,7 +4046,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_29, 3,7,8 +cglobal intra_pred_ang16_31, 3,7,8 lea r3, [ang_table + 16 * 16] mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride @@ -3004,33 +4055,37 @@ .loop: movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr 
m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + movu m1, [r2 + 2] ;[17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[17 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] + palignr m5, m2, m3, 2 + + pmaddubsw m4, m3, [r3 + 16] ; [17] pmulhrsw m4, m7 - pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] - pmulhrsw m0, m7 - packuswb m4, m0 + pmaddubsw m1, m5, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 - palignr m1, m2, m3, 2 + palignr m6, m2, m3, 4 - pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] + pmaddubsw m5, [r3 + 3 * 16] ; [19] pmulhrsw m5, m7 - pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m1, m6, [r3 - 12 * 16] ; [4] + pmulhrsw m1, m7 + packuswb m5, m1 - pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] + palignr m1, m2, m3, 6 + + pmaddubsw m6, [r3 + 5 * 16] ; [21] pmulhrsw m6, m7 - pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] + pmaddubsw m0, m1, [r3 - 10 * 16] ; [6] pmulhrsw m0, m7 packuswb m6, m0 - palignr m0, m2, m3, 4 + palignr m0, m2, m3, 8 - pmaddubsw m1, [r3 + 15 * 16] ; [31] + pmaddubsw m1, [r3 + 7 * 16] ; [23] pmulhrsw m1, m7 pmaddubsw m0, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 @@ -3038,31 +4093,32 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - palignr m1, m2, m3, 4 + palignr m4, m2, m3, 8 + palignr m5, m2, m3, 10 - pmaddubsw m4, m1, [r3 + 16] ; [17] + pmaddubsw m4, [r3 + 9 * 16] ; [25] pmulhrsw m4, m7 - pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmaddubsw m1, m5, [r3 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m4, m1 - palignr m0, m2, m3, 6 + palignr m6, m2, m3, 12 - pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] + pmaddubsw m5, [r3 + 11 * 16] ; [27] pmulhrsw m5, m7 - pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m1, m6, [r3 - 4 * 16] ; [12] + pmulhrsw m1, m7 + packuswb m5, m1 - pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] + palignr m1, 
m2, m3, 14 + + pmaddubsw m6, [r3 + 13 * 16] ; [29] pmulhrsw m6, m7 - pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] pmulhrsw m0, m7 packuswb m6, m0 - palignr m2, m3, 8 - - pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] + pmaddubsw m1, [r3 + 15 * 16] ; [31] pmulhrsw m1, m7 pmaddubsw m2, [r3] ; [16] pmulhrsw m2, m7 @@ -3077,7 +4133,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_8, 3,7,8 +cglobal intra_pred_ang16_6, 3,7,8 add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 @@ -3086,63 +4142,73 @@ mova m7, [pw_1024] .loop: - movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] + pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] pmulhrsw m4, m7 - pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] - pmulhrsw m2, m7 - packuswb m4, m2 + pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] + palignr m6, m2, m3, 2 + + pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] pmulhrsw m5, m7 - pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] + pmaddubsw m6, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] + palignr m1, m2, m3, 4 + + pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] pmulhrsw m6, m7 - pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] - pmulhrsw m2, m7 - packuswb m6, m2 + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 - palignr m2, m0, m1, 2 - palignr m3, m0, m1, 4 + palignr m0, m2, m3, 6 - pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmaddubsw m1, [r3 + 11 * 16] ; [27] pmulhrsw m1, m7 - pmaddubsw m0, 
m2, [r3 - 8 * 16] ; [8] + pmaddubsw m0, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] + palignr m4, m2, m3, 6 + palignr m6, m2, m3, 8 + + pmaddubsw m4, [r3 + 5 * 16] ; [21] pmulhrsw m4, m7 - pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] - pmulhrsw m5, m7 - packuswb m4, m5 + pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] + pmaddubsw m5, m6, [r3 - 16] ; [15] pmulhrsw m5, m7 - pmaddubsw m2, [r3 + 12 * 16] ; [28] - pmulhrsw m2, m7 - packuswb m5, m2 + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 - pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] + palignr m0, m2, m3, 10 + + pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] - pmulhrsw m1, m7 - packuswb m6, m1 + pmaddubsw m0, [r3 + 6 * 16] ; [22] + pmulhrsw m0, m7 + packuswb m6, m0 - pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] + palignr m2, m3, 12 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] pmulhrsw m1, m7 - pmaddubsw m3, [r3] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 @@ -3154,7 +4220,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_28, 3,7,8 +cglobal intra_pred_ang16_30, 3,7,8 lea r3, [ang_table + 16 * 16] mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride @@ -3162,63 +4228,73 @@ mova m7, [pw_1024] .loop: - movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] - punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m1, 
[r3 - 11 * 16] ; [5] + pmaddubsw m4, m3, [r3 - 3 * 16] ; [13] pmulhrsw m4, m7 - pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] - pmulhrsw m2, m7 - packuswb m4, m2 + pmaddubsw m1, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] + palignr m6, m2, m3, 2 + + pmaddubsw m5, m6, [r3 - 9 * 16] ; [7] pmulhrsw m5, m7 - pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] + pmaddubsw m6, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] + palignr m1, m2, m3, 4 + + pmaddubsw m6, m1, [r3 - 15 * 16] ; [1] pmulhrsw m6, m7 - pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] - pmulhrsw m2, m7 - packuswb m6, m2 + pmaddubsw m0, m1, [r3 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m6, m0 - palignr m2, m0, m1, 2 - palignr m3, m0, m1, 4 + palignr m0, m2, m3, 6 - pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] + pmaddubsw m1, [r3 + 11 * 16] ; [27] pmulhrsw m1, m7 - pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] + pmaddubsw m0, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] + palignr m4, m2, m3, 6 + palignr m6, m2, m3, 8 + + pmaddubsw m4, [r3 + 5 * 16] ; [21] pmulhrsw m4, m7 - pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] - pmulhrsw m5, m7 - packuswb m4, m5 + pmaddubsw m1, m6, [r3 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] + pmaddubsw m5, m6, [r3 - 16] ; [15] pmulhrsw m5, m7 - pmaddubsw m2, [r3 + 12 * 16] ; [28] - pmulhrsw m2, m7 - packuswb m5, m2 + pmaddubsw m6, [r3 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 - pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] + palignr m0, m2, m3, 10 + + pmaddubsw m6, m0, [r3 - 7 * 16] ; [9] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] - pmulhrsw m1, m7 - packuswb m6, m1 + pmaddubsw m0, [r3 + 6 * 16] ; [22] + pmulhrsw m0, m7 + packuswb m6, m0 - pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] + palignr m2, m3, 12 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] pmulhrsw m1, m7 - 
pmaddubsw m3, [r3] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 @@ -3229,68 +4305,79 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_9, 3,7,8 +cglobal intra_pred_ang16_7, 3,7,8 add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: - movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] + pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] pmulhrsw m4, m7 - pmaddubsw m0, m2, [r3 - 12 * 16] ; [4] + pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m4, m0 - pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] + palignr m1, m2, m3, 2 + + pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] pmulhrsw m5, m7 - pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] + pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] + pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] pmulhrsw m6, m7 - pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] + pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] + palignr m0, m2, m3, 4 + + pmaddubsw m1, [r3 + 15 * 16] ; [31] pmulhrsw m1, m7 - pmaddubsw m0, m2, [r3] ; [16] + pmaddubsw m0, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] + palignr m1, m2, m3, 4 + + pmaddubsw m4, m1, [r3 + 16] ; [17] pmulhrsw m4, m7 - 
pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] - pmulhrsw m5, m7 - packuswb m4, m5 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] + palignr m0, m2, m3, 6 + + pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] pmulhrsw m5, m7 - pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] + pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] + pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] pmulhrsw m6, m7 - pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] - pmulhrsw m1, m7 - packuswb m6, m1 + pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 - pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] - pmulhrsw m1, m7 - packuswb m1, m1 + palignr m2, m3, 8 - punpcklqdq m1, m3 ; [00] + pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] + pmulhrsw m1, m7 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 @@ -3302,7 +4389,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_27, 3,7,8 +cglobal intra_pred_ang16_29, 3,7,8 lea r3, [ang_table + 16 * 16] mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride @@ -3311,66 +4398,71 @@ .loop: movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] - punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] + palignr m1, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m2, m3, m1 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m3, m1 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m3, [r3 - 14 * 16] ; [2] + pmaddubsw m4, m3, [r3 - 7 * 16] ; [9] pmulhrsw m4, m7 - pmaddubsw m0, m3, [r3 - 12 * 16] ; [4] + pmaddubsw m0, m3, [r3 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m4, m0 - pmaddubsw m5, m3, [r3 - 10 * 16] ; [6] + palignr m1, m2, m3, 2 + + pmaddubsw m5, m3, [r3 + 11 * 16] ; [27] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 - 8 * 16] ; [8] + pmaddubsw m6, m1, [r3 - 12 * 16] ; [4] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r3 - 6 * 16] ; 
[10] + pmaddubsw m6, m1, [r3 - 3 * 16] ; [13] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r3 - 4 * 16] ; [12] + pmaddubsw m0, m1, [r3 + 6 * 16] ; [22] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r3 - 2 * 16] ; [14] + palignr m0, m2, m3, 4 + + pmaddubsw m1, [r3 + 15 * 16] ; [31] pmulhrsw m1, m7 - pmaddubsw m0, m3, [r3] ; [16] + pmaddubsw m0, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r3 + 2 * 16] ; [18] + palignr m1, m2, m3, 4 + + pmaddubsw m4, m1, [r3 + 16] ; [17] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r3 + 4 * 16] ; [20] - pmulhrsw m5, m7 - packuswb m4, m5 + pmaddubsw m1, [r3 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, m3, [r3 + 6 * 16] ; [22] + palignr m0, m2, m3, 6 + + pmaddubsw m5, m0, [r3 - 13 * 16] ; [03] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmaddubsw m6, m0, [r3 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r3 + 10 * 16] ; [26] + pmaddubsw m6, m0, [r3 + 5 * 16] ; [21] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r3 + 12 * 16] ; [28] - pmulhrsw m1, m7 - packuswb m6, m1 + pmaddubsw m0, [r3 + 14 * 16] ; [30] + pmulhrsw m0, m7 + packuswb m6, m0 - pmaddubsw m1, m3, [r3 + 14 * 16] ; [30] + palignr m2, m3, 8 + + pmaddubsw m1, m2, [r3 - 9 * 16] ; [07] pmulhrsw m1, m7 - packuswb m1, m1 + pmaddubsw m2, [r3] ; [16] + pmulhrsw m2, m7 + packuswb m1, m2 - movh [r0 ], m4 - movhps [r0 + r1 ], m4 - movh [r0 + r1 * 2], m5 - movhps [r0 + r5 ], m5 - lea r0, [r0 + r1 * 4] - movh [r0 ], m6 - movhps [r0 + r1 ], m6 - movh [r0 + r1 * 2], m1 - movh [r0 + r5 ], m2 + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 lea r0, [r6 + 8] add r2, 8 @@ -3379,647 +4471,675 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_10, 5,6,8 - lea r5, [r1 * 3] - pxor m7, m7 - - movu m0, [r2 + 1 + 32] - palignr m1, m0, 1 - pshufb m1, m7 - palignr m2, m0, 2 - pshufb m2, m7 - palignr m3, m0, 3 - pshufb m3, m7 - palignr m4, m0, 4 - pshufb m4, m7 - palignr m5, m0, 5 - pshufb m5, m7 - 
palignr m6, m0, 6 - pshufb m6, m7 - - movu [r0 + r1], m1 - movu [r0 + r1 * 2], m2 - movu [r0 + r5], m3 - lea r3, [r0 + r1 * 4] - movu [r3], m4 - movu [r3 + r1], m5 - movu [r3 + r1 * 2], m6 - - palignr m1, m0, 7 - pshufb m1, m7 - movhlps m2, m0 - pshufb m2, m7 - palignr m3, m0, 9 - pshufb m3, m7 - palignr m4, m0, 10 - pshufb m4, m7 - palignr m5, m0, 11 - pshufb m5, m7 - palignr m6, m0, 12 - pshufb m6, m7 - - movu [r3 + r5], m1 - lea r3, [r3 + r1 * 4] - movu [r3], m2 - movu [r3 + r1], m3 - movu [r3 + r1 * 2], m4 - movu [r3 + r5], m5 - lea r3, [r3 + r1 * 4] - movu [r3], m6 - - palignr m1, m0, 13 - pshufb m1, m7 - palignr m2, m0, 14 - pshufb m2, m7 - palignr m3, m0, 15 - pshufb m3, m7 - pshufb m0, m7 - - movu [r3 + r1], m1 - movu [r3 + r1 * 2], m2 - movu [r3 + r5], m3 - -; filter - cmp r4w, byte 0 - jz .quit - pmovzxbw m0, m0 - mova m1, m0 - movu m2, [r2] - movu m3, [r2 + 1] - - pshufb m2, m7 - pmovzxbw m2, m2 - movhlps m4, m3 - pmovzxbw m3, m3 - pmovzxbw m4, m4 - psubw m3, m2 - psubw m4, m2 - psraw m3, 1 - psraw m4, 1 - paddw m0, m3 - paddw m1, m4 - packuswb m0, m1 -.quit: - movu [r0], m0 - RET - -INIT_XMM sse4 -%if ARCH_X86_64 == 1 -cglobal intra_pred_ang16_26, 3,8,5 - mov r7, r4mp - %define bfilter r7w -%else -cglobal intra_pred_ang16_26, 5,7,5,0-4 - %define bfilter dword[rsp] - mov bfilter, r4 -%endif - movu m0, [r2 + 1] - - lea r4, [r1 * 3] - lea r3, [r0 + r1 * 4] - lea r5, [r3 + r1 * 4] - lea r6, [r5 + r1 * 4] - - movu [r0], m0 - movu [r0 + r1], m0 - movu [r0 + r1 * 2], m0 - movu [r0 + r4], m0 - movu [r3], m0 - movu [r3 + r1], m0 - movu [r3 + r1 * 2], m0 - movu [r3 + r4], m0 - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - - movu [r6], m0 - movu [r6 + r1], m0 - movu [r6 + r1 * 2], m0 - movu [r6 + r4], m0 - -; filter - cmp bfilter, byte 0 - jz .quit - - pxor m4, m4 - pshufb m0, m4 - pmovzxbw m0, m0 - mova m1, m0 - movu m2, [r2 + 32] - pinsrb m2, [r2], 0 - movu m3, [r2 + 1 + 32] - - pshufb m2, m4 - pmovzxbw m2, m2 - movhlps m4, 
m3 - pmovzxbw m3, m3 - pmovzxbw m4, m4 - psubw m3, m2 - psubw m4, m2 - psraw m3, 1 - psraw m4, 1 - paddw m0, m3 - paddw m1, m4 - packuswb m0, m1 - - pextrb [r0], m0, 0 - pextrb [r0 + r1], m0, 1 - pextrb [r0 + r1 * 2], m0, 2 - pextrb [r0 + r4], m0, 3 - pextrb [r3], m0, 4 - pextrb [r3 + r1], m0, 5 - pextrb [r3 + r1 * 2], m0, 6 - pextrb [r3 + r4], m0, 7 - pextrb [r5], m0, 8 - pextrb [r5 + r1], m0, 9 - pextrb [r5 + r1 * 2], m0, 10 - pextrb [r5 + r4], m0, 11 - pextrb [r6], m0, 12 - pextrb [r6 + r1], m0, 13 - pextrb [r6 + r1 * 2], m0, 14 - pextrb [r6 + r4], m0, 15 -.quit: - RET - -INIT_XMM sse4 -cglobal intra_pred_ang16_11, 3,7,8 +cglobal intra_pred_ang16_8, 3,7,8 + add r2, 32 lea r3, [ang_table + 16 * 16] + mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] - movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m3, [r2], 0 - mova m2, m3 - palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] +.loop: + movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 10 10 9] + punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] + pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] pmulhrsw m4, m7 - pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] - pmulhrsw m0, m7 - packuswb m4, m0 + pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] + pmulhrsw m2, m7 + packuswb m4, m2 - pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] + pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] + pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] - pmulhrsw m0, m7 - packuswb m6, m0 + pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] + pmulhrsw m2, m7 + packuswb 
m6, m2 - pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] + palignr m2, m0, m1, 2 + palignr m3, m0, m1, 4 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] pmulhrsw m1, m7 - pmaddubsw m0, m3, [r3] ; [16] + pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] + pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] + pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] + pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m2, [r3 + 12 * 16] ; [28] + pmulhrsw m2, m7 + packuswb m5, m2 - pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] + pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] + pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] + pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] pmulhrsw m1, m7 - packuswb m1, m1 - punpcklqdq m1, m2 ;[00] + pmaddubsw m3, [r3] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 lea r0, [r6 + r1 * 4] lea r6, [r6 + r1 * 8] + add r2, 8 + dec r4 + jnz .loop + RET - movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - mova m2, m3 - palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] +INIT_XMM sse4 +cglobal intra_pred_ang16_28, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] - pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] - pmulhrsw m4, m7 - pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] - pmulhrsw m0, m7 - packuswb m4, m0 +.loop: + movu m1, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m3, m1, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpckhbw m0, m1, m3 ;[x 16 16 15 15 14 14 13 13 12 12 11 11 
10 10 9] + punpcklbw m1, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmaddubsw m4, m1, [r3 - 11 * 16] ; [5] + pmulhrsw m4, m7 + pmaddubsw m2, m1, [r3 - 6 * 16] ; [10] + pmulhrsw m2, m7 + packuswb m4, m2 + + pmaddubsw m5, m1, [r3 - 1 * 16] ; [15] + pmulhrsw m5, m7 + pmaddubsw m6, m1, [r3 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] + pmaddubsw m6, m1, [r3 + 9 * 16] ; [25] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] - pmulhrsw m0, m7 - packuswb m6, m0 + pmaddubsw m2, m1, [r3 + 14 * 16] ; [30] + pmulhrsw m2, m7 + packuswb m6, m2 - pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] + palignr m2, m0, m1, 2 + palignr m3, m0, m1, 4 + + pmaddubsw m1, m2, [r3 - 13 * 16] ; [3] pmulhrsw m1, m7 - pmaddubsw m0, m3, [r3] ; [16] + pmaddubsw m0, m2, [r3 - 8 * 16] ; [8] pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] + pmaddubsw m4, m2, [r3 - 3 * 16] ; [13] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] + pmaddubsw m5, m2, [r3 + 2 * 16] ; [18] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] + pmaddubsw m5, m2, [r3 + 7 * 16] ; [23] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] - pmulhrsw m6, m7 - packuswb m5, m6 + pmaddubsw m2, [r3 + 12 * 16] ; [28] + pmulhrsw m2, m7 + packuswb m5, m2 - pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] + pmaddubsw m6, m3, [r3 - 15 * 16] ; [01] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] + pmaddubsw m1, m3, [r3 - 10 * 16] ; [06] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] + pmaddubsw m1, m3, [r3 - 5 * 16] ; [11] pmulhrsw m1, m7 - packuswb m1, m1 - punpcklqdq m1, m2 ;[00] + pmaddubsw m3, [r3] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 - TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + 
lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop RET INIT_XMM sse4 -cglobal intra_pred_ang16_25, 3,7,8 +cglobal intra_pred_ang16_9, 3,7,8 + add r2, 32 lea r3, [ang_table + 16 * 16] mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride - mov r6, r0 + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] .loop: - movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - mova m2, m3 - palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + movu m2, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m3, m2, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpcklbw m2, m3 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] + pmaddubsw m4, m2, [r3 - 14 * 16] ; [2] pmulhrsw m4, m7 - pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] + pmaddubsw m0, m2, [r3 - 12 * 16] ; [4] pmulhrsw m0, m7 packuswb m4, m0 - pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] + pmaddubsw m5, m2, [r3 - 10 * 16] ; [6] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmaddubsw m6, m2, [r3 - 8 * 16] ; [8] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] + pmaddubsw m6, m2, [r3 - 6 * 16] ; [10] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] + pmaddubsw m0, m2, [r3 - 4 * 16] ; [12] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] + pmaddubsw m1, m2, [r3 - 2 * 16] ; [14] pmulhrsw m1, m7 - pmaddubsw m0, m3, [r3] ; [16] + pmaddubsw m0, m2, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] + pmaddubsw m4, m2, [r3 + 2 * 16] ; [18] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] + pmaddubsw m5, m2, [r3 + 4 * 16] ; [20] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] + pmaddubsw m5, m2, [r3 + 6 * 16] ; [22] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] + pmaddubsw m6, m2, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 
packuswb m5, m6 - pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] + pmaddubsw m6, m2, [r3 + 10 * 16] ; [26] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] + pmaddubsw m1, m2, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] + pmaddubsw m1, m2, [r3 + 14 * 16] ; [30] pmulhrsw m1, m7 packuswb m1, m1 - movh [r0 ], m4 - movhps [r0 + r1 ], m4 - movh [r0 + r1 * 2], m5 - movhps [r0 + r5 ], m5 - lea r0, [r0 + r1 * 4] - movh [r0 ], m6 - movhps [r0 + r1 ], m6 - movh [r0 + r1 * 2], m1 - movh [r0 + r5 ], m2 + punpcklqdq m1, m3 ; [00] - lea r0, [r6 + 8] + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] add r2, 8 dec r4 jnz .loop RET INIT_XMM sse4 -cglobal intra_pred_ang16_12, 4,7,8 - lea r4, [ang_table + 16 * 16] +cglobal intra_pred_ang16_27, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 2 lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mov r6, r0 mova m7, [pw_1024] - movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - pinsrb m3, [r2], 0 - punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] - punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] - movu m2, [r2] - pshufb m2, [c_mode16_12] - - palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] +.loop: + movu m3, [r2 + 1] ;[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + palignr m2, m3, 1 ;[x 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + punpcklbw m3, m2 ;[9 8 8 7 7 6 6 5 5 4 4 3 3 2 2 1] - pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] + pmaddubsw m4, m3, [r3 - 14 * 16] ; [2] pmulhrsw m4, m7 - pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m0, m3, [r3 - 12 * 16] ; [4] + pmulhrsw m0, m7 + packuswb m4, m0 - pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] + pmaddubsw m5, m3, [r3 - 10 * 16] ; [6] pmulhrsw m5, m7 - pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r3 - 8 * 16] ; [8] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] + 
pmaddubsw m6, m3, [r3 - 6 * 16] ; [10] pmulhrsw m6, m7 - pmaddubsw m0, [r4 - 14 * 16] ; [2] + pmaddubsw m0, m3, [r3 - 4 * 16] ; [12] pmulhrsw m0, m7 packuswb m6, m0 - palignr m3, m2, 15 - - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r3 - 2 * 16] ; [14] pmulhrsw m1, m7 - pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmaddubsw m0, m3, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m3, [r3 + 2 * 16] ; [18] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r3 + 4 * 16] ; [20] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] + pmaddubsw m5, m3, [r3 + 6 * 16] ; [22] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r3 + 10 * 16] ; [26] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r3 + 12 * 16] ; [28] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmaddubsw m1, m3, [r3 + 14 * 16] ; [30] pmulhrsw m1, m7 - pmaddubsw m3, [r4] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 - - TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + packuswb m1, m1 - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r5 ], m5 + lea r0, [r0 + r1 * 4] + movh [r0 ], m6 + movhps [r0 + r1 ], m6 + movh [r0 + r1 * 2], m1 + movh [r0 + r5 ], m2 - movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] - punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + RET - pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] - pmulhrsw m4, m7 - pmaddubsw m5, 
m3, [r4 + 6 * 16] ; [22] - pmulhrsw m5, m7 - packuswb m4, m5 +INIT_XMM sse4 +cglobal intra_pred_ang16_10, 5,6,8 + lea r5, [r1 * 3] + pxor m7, m7 - pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] - pmulhrsw m6, m7 - packuswb m5, m6 + movu m0, [r2 + 1 + 32] + palignr m1, m0, 1 + pshufb m1, m7 + palignr m2, m0, 2 + pshufb m2, m7 + palignr m3, m0, 3 + pshufb m3, m7 + palignr m4, m0, 4 + pshufb m4, m7 + palignr m5, m0, 5 + pshufb m5, m7 + palignr m6, m0, 6 + pshufb m6, m7 - pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] - pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] - pmulhrsw m0, m7 - packuswb m6, m0 + movu [r0 + r1], m1 + movu [r0 + r1 * 2], m2 + movu [r0 + r5], m3 + lea r3, [r0 + r1 * 4] + movu [r3], m4 + movu [r3 + r1], m5 + movu [r3 + r1 * 2], m6 - palignr m3, m2, 14 + palignr m1, m0, 7 + pshufb m1, m7 + movhlps m2, m0 + pshufb m2, m7 + palignr m3, m0, 9 + pshufb m3, m7 + palignr m4, m0, 10 + pshufb m4, m7 + palignr m5, m0, 11 + pshufb m5, m7 + palignr m6, m0, 12 + pshufb m6, m7 - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] - pmulhrsw m1, m7 - pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] - pmulhrsw m0, m7 - packuswb m1, m0 + movu [r3 + r5], m1 + lea r3, [r3 + r1 * 4] + movu [r3], m2 + movu [r3 + r1], m3 + movu [r3 + r1 * 2], m4 + movu [r3 + r5], m5 + lea r3, [r3 + r1 * 4] + movu [r3], m6 - TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 + palignr m1, m0, 13 + pshufb m1, m7 + palignr m2, m0, 14 + pshufb m2, m7 + palignr m3, m0, 15 + pshufb m3, m7 + pshufb m0, m7 - pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] - pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] - pmulhrsw m5, m7 - packuswb m4, m5 + movu [r3 + r1], m1 + movu [r3 + r1 * 2], m2 + movu [r3 + r5], m3 - pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] - pmulhrsw m6, m7 - packuswb m5, m6 +; filter + cmp r4w, byte 0 + jz .quit + pmovzxbw m0, m0 + mova m1, m0 + movu m2, [r2] + movu m3, [r2 + 1] - pslldq m2, 1 - palignr m3, 
m2, 14 + pshufb m2, m7 + pmovzxbw m2, m2 + movhlps m4, m3 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + psubw m3, m2 + psubw m4, m2 + psraw m3, 1 + psraw m4, 1 + paddw m0, m3 + paddw m1, m4 + packuswb m0, m1 +.quit: + movu [r0], m0 + RET - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] - pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] - pmulhrsw m1, m7 - packuswb m6, m1 +INIT_XMM sse4 +%if ARCH_X86_64 == 1 +cglobal intra_pred_ang16_26, 3,8,5 + mov r7, r4mp + %define bfilter r7w +%else +cglobal intra_pred_ang16_26, 5,7,5,0-4 + %define bfilter dword[rsp] + mov bfilter, r4 +%endif + movu m0, [r2 + 1] - pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] - pmulhrsw m1, m7 - pmaddubsw m3, [r4] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 + lea r4, [r1 * 3] + lea r3, [r0 + r1 * 4] + lea r5, [r3 + r1 * 4] + lea r6, [r5 + r1 * 4] - TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r4], m0 + movu [r3], m0 + movu [r3 + r1], m0 + movu [r3 + r1 * 2], m0 + movu [r3 + r4], m0 + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + + movu [r6], m0 + movu [r6 + r1], m0 + movu [r6 + r1 * 2], m0 + movu [r6 + r4], m0 + +; filter + cmp bfilter, byte 0 + jz .quit + + pxor m4, m4 + pshufb m0, m4 + pmovzxbw m0, m0 + mova m1, m0 + movu m2, [r2 + 32] + pinsrb m2, [r2], 0 + movu m3, [r2 + 1 + 32] + + pshufb m2, m4 + pmovzxbw m2, m2 + movhlps m4, m3 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + psubw m3, m2 + psubw m4, m2 + psraw m3, 1 + psraw m4, 1 + paddw m0, m3 + paddw m1, m4 + packuswb m0, m1 + + pextrb [r0], m0, 0 + pextrb [r0 + r1], m0, 1 + pextrb [r0 + r1 * 2], m0, 2 + pextrb [r0 + r4], m0, 3 + pextrb [r3], m0, 4 + pextrb [r3 + r1], m0, 5 + pextrb [r3 + r1 * 2], m0, 6 + pextrb [r3 + r4], m0, 7 + pextrb [r5], m0, 8 + pextrb [r5 + r1], m0, 9 + pextrb [r5 + r1 * 2], m0, 10 + pextrb [r5 + r4], m0, 11 + pextrb [r6], m0, 12 + pextrb [r6 + r1], m0, 13 + pextrb [r6 + r1 * 2], m0, 14 + pextrb [r6 + r4], m0, 15 +.quit: RET 
INIT_XMM sse4 -cglobal intra_pred_ang16_24, 4,7,8 - lea r4, [ang_table + 16 * 16] +cglobal intra_pred_ang16_11, 3,7,8 + lea r3, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride - mov r6, r0 + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride mova m7, [pw_1024] - movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] - punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] - movu m2, [r2 + 32] - pshufb m2, [c_mode16_12] - - palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + movu m3, [r2 + 32] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m3, [r2], 0 + mova m2, m3 + palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] + pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] pmulhrsw m4, m7 - pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] - pmulhrsw m1, m7 - packuswb m4, m1 + pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] + pmulhrsw m0, m7 + packuswb m4, m0 - pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] + pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] pmulhrsw m5, m7 - pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] + pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] pmulhrsw m6, m7 - pmaddubsw m0, [r4 - 14 * 16] ; [2] + pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] pmulhrsw m0, m7 packuswb m6, m0 - palignr m3, m2, 15 - - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] pmulhrsw m1, m7 - pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmaddubsw m0, m3, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 7 * 16] ; 
[09] + pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] pmulhrsw m6, m7 packuswb m5, m6 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] pmulhrsw m1, m7 - pmaddubsw m3, [r4] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 + packuswb m1, m1 + punpcklqdq m1, m2 ;[00] - TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 - lea r0, [r6 + 8] + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] - movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] - punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] + movu m3, [r2 + 40] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + mova m2, m3 + palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] + pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] - pmulhrsw m5, m7 - packuswb m4, m5 + pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] + pmulhrsw m0, m7 + packuswb m4, m0 - pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] + pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] + pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] + pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] pmulhrsw m0, m7 packuswb m6, m0 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] 
pmulhrsw m1, m7 - pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmaddubsw m0, m3, [r3] ; [16] pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] + pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] + pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] pmulhrsw m1, m7 - pmaddubsw m3, [r4] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 + packuswb m1, m1 + punpcklqdq m1, m2 ;[00] - TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 -cglobal intra_pred_ang16_13, 4,7,8 +cglobal intra_pred_ang16_25, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 2 + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] + +.loop: + movu m3, [r2] ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + mova m2, m3 + palignr m1, m3, 1 ;[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + punpcklbw m3, m1 ;[8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, m3, [r3 + 14 * 16] ; [30] + pmulhrsw m4, m7 + pmaddubsw m0, m3, [r3 + 12 * 16] ; [28] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, m3, [r3 + 10 * 16] ; [26] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 + 6 * 16] ; [22] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r3 + 4 * 16] ; [20] + pmulhrsw m0, m7 + packuswb m6, m0 + 
+ pmaddubsw m1, m3, [r3 + 2 * 16] ; [18] + pmulhrsw m1, m7 + pmaddubsw m0, m3, [r3] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r3 - 2 * 16] ; [14] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r3 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r3 - 6 * 16] ; [10] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r3 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pmaddubsw m6, m3, [r3 - 10 * 16] ; [06] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r3 - 12 * 16] ; [04] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r3 - 14 * 16] ; [02] + pmulhrsw m1, m7 + packuswb m1, m1 + + movh [r0 ], m4 + movhps [r0 + r1 ], m4 + movh [r0 + r1 * 2], m5 + movhps [r0 + r5 ], m5 + lea r0, [r0 + r1 * 4] + movh [r0 ], m6 + movhps [r0 + r1 ], m6 + movh [r0 + r1 * 2], m1 + movh [r0 + r5 ], m2 + + lea r0, [r6 + 8] + add r2, 8 + dec r4 + jnz .loop + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_12, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride @@ -4027,70 +5147,62 @@ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m3, [r2], 0 - punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] - pshufb m2, [c_mode16_13] + pshufb m2, [c_mode16_12] - palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] + pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] pmulhrsw m4, m7 - pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] - pmulhrsw m0, m7 - packuswb m4, m0 + pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] pmulhrsw m5, m7 - - palignr m3, m2, 15 - - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw 
m6, m0, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m0, [r4 - 14 * 16] ; [2] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - - palignr m3, m2, 14 + palignr m3, m2, 15 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 16] ; [15] + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4106,64 +5218,56 @@ punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] - pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] + pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] pmulhrsw m5, m7 - - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 
packuswb m5, m6 - pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - - pslldq m2, 1 - palignr m3, m2, 14 + palignr m3, m2, 14 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 16] ; [15] + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4173,77 +5277,69 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_23, 4,7,8 +cglobal intra_pred_ang16_24, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpckhbw m0, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] - pshufb m2, [c_mode16_13] + pshufb m2, [c_mode16_12] - palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] 
+ palignr m0, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] + pmaddubsw m4, m0, [r4 + 11 * 16] ; [27] pmulhrsw m4, m7 - pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] - pmulhrsw m0, m7 - packuswb m4, m0 + pmaddubsw m1, m0, [r4 + 6 * 16] ; [22] + pmulhrsw m1, m7 + packuswb m4, m1 - pmaddubsw m5, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m0, [r4 + 1 * 16] ; [17] pmulhrsw m5, m7 - - palignr m3, m2, 15 - - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m6, m0, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m6, m0, [r4 - 9 * 16] ; [7] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m0, [r4 - 14 * 16] ; [2] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - - palignr m3, m2, 14 + palignr m3, m2, 15 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 16] ; [15] + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4258,64 +5354,56 @@ punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] 
movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] - pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] + pmaddubsw m4, m3, [r4 + 11 * 16] ; [27] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m3, [r4 + 1 * 16] ; [17] pmulhrsw m5, m7 - - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m6, m3, [r4 - 9 * 16] ; [7] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m0, m3, [r4 - 14 * 16] ; [2] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - - pslldq m2, 1 - palignr m3, m2, 14 + palignr m3, m2, 14 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 16] ; [15] + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m5, m3, [r4 - 7 * 16] ; [09] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] + pmaddubsw m1, m3, [r4 + 5 * 16] ; [21] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4325,7 +5413,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_14, 4,7,8 +cglobal 
intra_pred_ang16_13, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride @@ -4336,36 +5424,34 @@ punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] - pshufb m2, [c_mode16_14] + pshufb m2, [c_mode16_13] palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 - pmaddubsw m5, [r4 - 10 * 16] ; [06] + pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 - packuswb m4, m5 palignr m3, m2, 15 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 - pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] @@ -4374,35 +5460,31 @@ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] - pmulhrsw m5, m7 - packuswb m4, m5 - - pmaddubsw m5, m3, [r4 + 16] ; [17] + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 - pmaddubsw 
m1, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4418,30 +5500,28 @@ punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] - pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 @@ -4453,35 +5533,31 @@ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] - pmulhrsw m5, m7 - packuswb m4, m5 - - pmaddubsw m5, m3, [r4 + 16] ; [17] + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m1, 
m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4491,7 +5567,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_22, 4,7,8 +cglobal intra_pred_ang16_23, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 @@ -4501,36 +5577,34 @@ punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] - pshufb m2, [c_mode16_14] + pshufb m2, [c_mode16_13] palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m5, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 - pmaddubsw m5, [r4 - 10 * 16] ; [06] + pmaddubsw m0, m5, [r4 - 2 * 16] ; [14] + pmulhrsw m0, m7 + packuswb m4, m0 + + pmaddubsw m5, [r4 - 11 * 16] ; [05] pmulhrsw m5, m7 - packuswb m4, m5 palignr m3, m2, 15 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 - pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] @@ -4539,35 +5613,31 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] - pmulhrsw m5, m7 - packuswb m4, m5 - - pmaddubsw m5, m3, [r4 + 16] ; [17] + pmaddubsw m5, m3, [r4 + 13 * 
16] ; [29] pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4582,30 +5652,28 @@ punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] - pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] + pmaddubsw m4, m3, [r4 + 7 * 16] ; [23] pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmaddubsw m6, m3, [r4 + 3 * 16] ; [19] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m0, m3, [r4 - 6 * 16] ; [10] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 @@ -4617,35 +5685,31 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m5, m7 + packuswb m4, m5 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] - pmulhrsw m5, m7 - packuswb m4, m5 - - pmaddubsw m5, m3, [r4 + 16] ; [17] + pmaddubsw m5, m3, [r4 + 13 * 16] ; [29] pmulhrsw m5, m7 - pmaddubsw m6, 
m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] pmulhrsw m6, m7 - pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m1, m3, [r4 - 14 * 16] ; [02] pmulhrsw m1, m7 packuswb m6, m1 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmaddubsw m1, m3, [r4 + 9 * 16] ; [25] pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 @@ -4655,7 +5719,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_15, 4,7,8 +cglobal intra_pred_ang16_14, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride @@ -4663,42 +5727,36 @@ movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] pinsrb m3, [r2], 0 - punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] - pshufb m2, [c_mode16_15] + pshufb m2, [c_mode16_14] - palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, [r4 - 16] ; [15] + pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - - palignr m3, m2, 15 - - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmaddubsw m5, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] - pmulhrsw m5, m7 - - palignr m3, m2, 14 + palignr m3, m2, 15 - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] - pmulhrsw m6, m7 - - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 - 
pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 @@ -4710,42 +5768,36 @@ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 - - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] - pmulhrsw m6, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - pslldq m2, 1 palignr m3, m2, 14 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 @@ -4758,38 +5810,32 @@ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] - pmaddubsw m4, m3, [r4 - 16] ; [15] + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] - pmulhrsw m5, m7 - - pslldq m2, 1 - palignr m3, m2, 14 + palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - 
pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] - pmulhrsw m6, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 @@ -4801,42 +5847,36 @@ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 - - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] - pmulhrsw m6, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - pslldq m2, 1 palignr m3, m2, 14 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 @@ -4845,50 +5885,43 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_21, 4,7,8 +cglobal intra_pred_ang16_22, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 mova m7, [pw_1024] movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpckhbw m5, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] - pinsrb m2, [r2], 0 - pshufb m2, [c_mode16_15] + pshufb m2, [c_mode16_14] - palignr m4, m3, 1 ; [8 7 
7 6 6 5 5 4 4 3 3 2 2 1 1 0] + palignr m5, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, [r4 - 16] ; [15] + pmaddubsw m4, m5, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - - palignr m3, m2, 15 - - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmaddubsw m5, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] - pmulhrsw m5, m7 - - palignr m3, m2, 14 + palignr m3, m2, 15 - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] - pmulhrsw m6, m7 - - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 @@ -4900,42 +5933,36 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 - - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] - pmulhrsw m6, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - pslldq m2, 1 palignr m3, m2, 14 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 
packuswb m1, m3 @@ -4947,38 +5974,32 @@ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 x x x x x x x] - pmaddubsw m4, m3, [r4 - 16] ; [15] + pmaddubsw m4, m3, [r4 + 3 * 16] ; [19] pmulhrsw m4, m7 - - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] + pmaddubsw m5, m3, [r4 - 10 * 16] ; [06] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] - pmulhrsw m5, m7 - - pslldq m2, 1 - palignr m3, m2, 14 + palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] - pmulhrsw m6, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m6, m3, [r4 + 15 * 16] ; [31] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 + 2 * 16] ; [18] pmulhrsw m0, m7 packuswb m6, m0 - pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] pmulhrsw m1, m7 pslldq m2, 1 @@ -4990,42 +6011,36 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m5, m3, [r4 + 16] ; [17] pmulhrsw m5, m7 - - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] - pmulhrsw m6, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + 
pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] - pmulhrsw m1, m7 - pslldq m2, 1 palignr m3, m2, 14 + pmaddubsw m1, m3, [r4 + 13 * 16] ; [29] + pmulhrsw m1, m7 pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 @@ -5034,7 +6049,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_16, 4,7,8 +cglobal intra_pred_ang16_15, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride @@ -5045,43 +6060,42 @@ punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] - pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] + pshufb m2, [c_mode16_15] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, [r4 - 5 * 16] ; [11] + pmaddubsw m4, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 15 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] - pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 - pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 - pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] @@ -5090,42 +6104,40 @@ TRANSPOSE_STORE_8x8 
0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 - pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 - 16] ; [15] + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 - pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 - pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] @@ -5140,42 +6152,38 @@ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] - pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmaddubsw 
m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] - pmulhrsw m6, m7 - - pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pslldq m2, 1 @@ -5187,39 +6195,37 @@ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 - 16] ; [15] + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 @@ -5233,7 +6239,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_20, 4,7,8 +cglobal intra_pred_ang16_21, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 @@ -5244,43 +6250,42 @@ punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pinsrb m2, [r2], 0 - pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] + pshufb m2, [c_mode16_15] + 
palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, [r4 - 5 * 16] ; [11] + pmaddubsw m4, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 15 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] - pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 - pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 - pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] @@ -5289,42 +6294,40 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 - pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] - palignr m3, m2, 14 - - 
pmaddubsw m6, m3, [r4 - 16] ; [15] + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 - pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 - pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] + pslldq m2, 1 palignr m3, m2, 14 pmaddubsw m3, [r4] ; [16] @@ -5338,42 +6341,38 @@ movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0 0 0 0 0 0 0 15L] - pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmaddubsw m4, m3, [r4 - 16] ; [15] pmulhrsw m4, m7 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m5, m3, [r4 + 14 * 16] ; [30] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmaddubsw m5, m3, [r4 - 3 * 16] ; [13] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m6, m3, [r4 + 12 * 16] ; [28] pmulhrsw m6, m7 packuswb m5, m6 + pmaddubsw m6, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] - pmulhrsw m6, m7 - - pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmaddubsw m0, m3, [r4 + 10 * 16] ; [26] pmulhrsw m0, m7 packuswb m6, m0 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmaddubsw m1, m3, [r4 - 7 * 16] ; [09] pmulhrsw m1, m7 pslldq m2, 1 @@ -5385,39 +6384,37 @@ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmaddubsw m4, m3, [r4 - 9 * 16] ; [07] pmulhrsw m4, m7 
pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 + pmaddubsw m5, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] - pmulhrsw m5, m7 - pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m6, m3, [r4 + 4 * 16] ; [20] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m6, m3, [r4 - 16] ; [15] + pmaddubsw m6, m3, [r4 - 13 * 16] ; [03] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmaddubsw m1, m3, [r4 + 2 * 16] ; [18] pmulhrsw m1, m7 packuswb m6, m1 - pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmaddubsw m1, m3, [r4 - 15 * 16] ; [01] pmulhrsw m1, m7 pslldq m2, 1 @@ -5431,7 +6428,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_17, 4,7,8 +cglobal intra_pred_ang16_16, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride @@ -5442,97 +6439,90 @@ punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2] - pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] + pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, [r4 - 10 * 16] ; [06] + pmaddubsw m4, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, m2, 15 - pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 - pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] - pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmaddubsw m6, m3, [r4 - 4 * 16] ; 
[12] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] + pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] pmulhrsw m0, m7 packuswb m6, m0 - pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] + pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 - pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] + pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] palignr m3, m2, 14 - pmaddubsw m0, m3, [r4] ; [16] + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] - palignr m3, m2, 14 - - pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] pmulhrsw m4, m7 - pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] + pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] - pmulhrsw m5, m7 - - pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] + pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 - pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] + pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] palignr m3, m2, 14 
- pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m6, m3, [r4 - 16] ; [15] pmulhrsw m6, m7 - pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] + pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 - pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] - pmulhrsw m1, m7 - pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 @@ -5544,97 +6534,92 @@ movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] + palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] - pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] pmulhrsw m6, m7 - pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] pmulhrsw m0, m7 packuswb m6, m0 pslldq m2, 1 
palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] pmulhrsw m1, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m0, m3, [r4] ; [16] + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] pmulhrsw m0, m7 packuswb m1, m0 TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 - pslldq m2, 1 - palignr m3, m2, 14 - - pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] pmulhrsw m4, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] pmulhrsw m5, m7 packuswb m4, m5 - pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] - pmulhrsw m5, m7 - pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] pmulhrsw m6, m7 packuswb m5, m6 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmaddubsw m6, m3, [r4 - 16] ; [15] pmulhrsw m6, m7 pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] pmulhrsw m1, m7 packuswb m6, m1 + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + pslldq m2, 1 palignr m3, m2, 14 - pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] - pmulhrsw m1, m7 - pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmaddubsw m3, [r4] ; [16] pmulhrsw m3, m7 packuswb m1, m3 @@ -5642,7 +6627,7 @@ RET INIT_XMM sse4 -cglobal intra_pred_ang16_19, 4,7,8 +cglobal intra_pred_ang16_20, 4,7,8 lea r4, [ang_table + 16 * 16] lea r5, [r1 * 3] ; r5 -> 3 * stride mov r6, r0 @@ -5653,28 +6638,226 @@ punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] movu m2, [r2 + 32] pinsrb m2, [r2], 0 - pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] + pshufb m2, [c_mode16_16] ; [2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8] palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] - pmaddubsw m4, [r4 - 10 * 16] ; [06] + pmaddubsw m4, [r4 - 5 * 16] ; [11] pmulhrsw m4, m7 palignr m3, 
m2, 15 - pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] pmulhrsw m5, m7 packuswb m4, m5 - palignr m3, m2, 14 - - pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] pmulhrsw m5, m7 - pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] - pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] palignr m3, m2, 14 - pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [3, 5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 ; [5, 6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [6, 8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m4, m7 + + pslldq m2, 1 ; [8, 9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 ; [9, 11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [11, 12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 16] ; [15] + pmulhrsw m6, m7 + + pslldq m2, 1 ; [12, 14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + 
pslldq m2, 1 ; [14, 15, 0, 2, 3, 5, 6, 8, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + palignr m2, m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 2, 3, 5, 6, 8, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + + pmaddubsw m4, m3, [r4 - 5 * 16] ; [11] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 15 * 16] ; [01] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 7 * 16] ; [23] + pmulhrsw m6, m7 + + pmaddubsw m0, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 3 * 16] ; [13] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pmaddubsw m4, m3, [r4 - 13 * 16] ; [03] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 9 * 16] ; [25] + pmulhrsw m5, m7 + pmaddubsw m6, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 16] ; [15] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + packuswb m6, m1 + + pmaddubsw m1, m3, [r4 - 11 * 16] ; [05] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + 
+ pmaddubsw m3, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + RET + +INIT_XMM sse4 +cglobal intra_pred_ang16_17, 4,7,8 + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] + + movu m3, [r2 + 32] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + pinsrb m3, [r2], 0 + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2] + pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m5, m7 + + pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, x] + pinsrb m2, [r2 + 5], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] pmulhrsw m6, m7 packuswb m5, m6 @@ -5700,7 +6883,7 @@ pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] palignr m3, m2, 14 @@ -5747,15 +6930,16 @@ pmulhrsw m3, m7 packuswb m1, m3 - TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 - lea r0, [r6 + 8] + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] - movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m1, [r2 + 1 + 32] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] - palignr m2, m2, 6 ; [x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] - movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + palignr m2, 
m2, 6 ; [x, x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 1, 2, 4, 5, x, x, x] pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] pmulhrsw m4, m7 @@ -5801,7 +6985,7 @@ pmulhrsw m0, m7 packuswb m1, m0 - TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1 pslldq m2, 1 palignr m3, m2, 14 @@ -5848,43 +7032,253 @@ pmulhrsw m3, m7 packuswb m1, m3 - TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1 RET INIT_XMM sse4 -cglobal intra_pred_ang16_18, 4,5,3 - movu m0, [r2] - movu m1, [r2 + 32] - mova m2, [c_mode16_18] - pshufb m1, m2 +cglobal intra_pred_ang16_19, 4,7,8 + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] - lea r2, [r1 * 2] - lea r3, [r1 * 3] - lea r4, [r1 * 4] - movu [r0], m0 - palignr m2, m0, m1, 15 - movu [r0 + r1], m2 - palignr m2, m0, m1, 14 - movu [r0 + r2], m2 - palignr m2, m0, m1, 13 - movu [r0 + r3], m2 - lea r0, [r0 + r4] - palignr m2, m0, m1, 12 - movu [r0], m2 - palignr m2, m0, m1, 11 - movu [r0 + r1], m2 - palignr m2, m0, m1, 10 - movu [r0 + r2], m2 - palignr m2, m0, m1, 9 - movu [r0 + r3], m2 - lea r0, [r0 + r4] - palignr m2, m0, m1, 8 - movu [r0], m2 - palignr m2, m0, m1, 7 - movu [r0 + r1], m2 - palignr m2, m0, m1, 6 - movu [r0 + r2], m2 - palignr m2, m0, m1, 5 + movu m3, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + punpckhbw m4, m3, m3 ; [15 15 14 14 13 13 12 12 11 11 10 10 9 9 8 8] + punpcklbw m3, m3 ; [7 7 6 6 5 5 4 4 3 3 2 2 1 1 0 0] + movu m2, [r2 + 32] + pinsrb m2, [r2], 0 + pshufb m2, [c_mode16_17] ; [1, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4] + palignr m4, m3, 1 ; [8 7 7 6 6 5 5 4 4 3 3 2 2 1 1 0] + + pmaddubsw m4, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 15 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m5, m7 + + pslldq m2, 1 ; [2, 4, 5, 6, 7, 9, 10, 11, 
12, 14, 15, 0, 1, 2, 4, x] + pinsrb m2, [r2 + 5 + 32], 0 ; [2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 ; [5, 6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + + pslldq m2, 1 ; [6, 7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pslldq m2, 1 ; [7, 9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m4, m7 + + pslldq m2, 1 ; [9, 10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m5, m7 + + pslldq m2, 1 ; [10, 11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 ; [11, 12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m6, m7 + + pslldq m2, 1 ; [12, 14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 ; [14, 15, 0, 1, 2, 4, 5, x, x, x, x, x, x, x, x, x] + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + + lea r0, [r6 + 8] + + 
movu m1, [r2 + 1] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + pslldq m3, m1, 1 ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 x] + punpckhbw m3, m1 ; [16 15 15 14 14 13 13 12 12 11 11 10 10 9 9 8] + palignr m2, m2, 6 ; [x, x, x, x, x, 14, 15, 0, 1, 2, 4, 5, x, x, x] + movlhps m2, m1 ; [8 7 6 5 4 3 2 1 0, 2, 3, 5, 6, 8, x, x] + + pmaddubsw m4, m3, [r4 - 10 * 16] ; [06] + pmulhrsw m4, m7 + + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 - 4 * 16] ; [12] + pmulhrsw m5, m7 + packuswb m4, m5 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 2 * 16] ; [18] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 8 * 16] ; [24] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 + 14 * 16] ; [30] + pmulhrsw m6, m7 + pmaddubsw m0, m3, [r4 - 12 * 16] ; [04] + pmulhrsw m0, m7 + packuswb m6, m0 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 - 6 * 16] ; [10] + pmulhrsw m1, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m0, m3, [r4] ; [16] + pmulhrsw m0, m7 + packuswb m1, m0 + + TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m4, m3, [r4 + 6 * 16] ; [22] + pmulhrsw m4, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m5, m3, [r4 + 12 * 16] ; [28] + pmulhrsw m5, m7 + packuswb m4, m5 + + pmaddubsw m5, m3, [r4 - 14 * 16] ; [02] + pmulhrsw m5, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 8 * 16] ; [08] + pmulhrsw m6, m7 + packuswb m5, m6 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m6, m3, [r4 - 2 * 16] ; [14] + pmulhrsw m6, m7 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 4 * 16] ; [20] + pmulhrsw m1, m7 + packuswb m6, m1 + + pslldq m2, 1 + palignr m3, m2, 14 + + pmaddubsw m1, m3, [r4 + 10 * 16] ; [26] + pmulhrsw m1, m7 + pmaddubsw m3, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + + TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1 + RET + +INIT_XMM sse4 +cglobal 
intra_pred_ang16_18, 4,5,3 + movu m0, [r2] + movu m1, [r2 + 32] + mova m2, [c_mode16_18] + pshufb m1, m2 + + lea r2, [r1 * 2] + lea r3, [r1 * 3] + lea r4, [r1 * 4] + movu [r0], m0 + palignr m2, m0, m1, 15 + movu [r0 + r1], m2 + palignr m2, m0, m1, 14 + movu [r0 + r2], m2 + palignr m2, m0, m1, 13 + movu [r0 + r3], m2 + lea r0, [r0 + r4] + palignr m2, m0, m1, 12 + movu [r0], m2 + palignr m2, m0, m1, 11 + movu [r0 + r1], m2 + palignr m2, m0, m1, 10 + movu [r0 + r2], m2 + palignr m2, m0, m1, 9 + movu [r0 + r3], m2 + lea r0, [r0 + r4] + palignr m2, m0, m1, 8 + movu [r0], m2 + palignr m2, m0, m1, 7 + movu [r0 + r1], m2 + palignr m2, m0, m1, 6 + movu [r0 + r2], m2 + palignr m2, m0, m1, 5 movu [r0 + r3], m2 lea r0, [r0 + r4] palignr m2, m0, m1, 4 @@ -7693,24337 +9087,4432 @@ palignr m2, m0, 14 pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] pmulhrsw m6, m7 - pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] - pmulhrsw m1, m7 - packuswb m6, m1 - pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] - pmulhrsw m1, m7 - pmaddubsw m3, m2, [r4 - 16 * 16] ; [00] - pmulhrsw m3, m7 - packuswb m1, m3 - TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 -%endmacro - -%macro MODE_13_23 2 - movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] - punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] - palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] - pmaddubsw m4, m0, [r4 + 7 * 16] ; [23] - pmulhrsw m4, m7 - pmaddubsw m3, m0, [r4 - 2 * 16] ; [14] - pmulhrsw m3, m7 - packuswb m4, m3 - pmaddubsw m5, m0, [r4 - 11 * 16] ; [05] - pmulhrsw m5, m7 - pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] - pmulhrsw m6, m7 - packuswb m5, m6 - pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] - pmulhrsw m6, m7 - pmaddubsw m3, m2, [r4 - 6 * 16] ; [10] - pmulhrsw m3, m7 - packuswb m6, m3 - pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] - pmulhrsw m1, m7 - movu 
m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1] - palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - punpckhbw m0, m2, m3 - punpcklbw m2, m3 - palignr m0, m2, 2 - pmaddubsw m3, m0, [r4 + 8 * 16] ; [24] - pmulhrsw m3, m7 - packuswb m1, m3 - mova m3, m0 - TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 16] ; [15] - pmulhrsw m4, m7 - pmaddubsw m5, m3, [r4 - 10 * 16] ; [6] - pmulhrsw m5, m7 - packuswb m4, m5 - pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] - pmulhrsw m5, m7 - pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] - pmulhrsw m6, m7 - packuswb m5, m6 - pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] - pmulhrsw m6, m7 - pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] - pmulhrsw m1, m7 - packuswb m6, m1 - movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] - punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] - palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] - pmaddubsw m1, m0, [r4 + 9 * 16] ; [25] - pmulhrsw m1, m7 - pmaddubsw m3, m0, [r4] ; [16] - pmulhrsw m3, m7 - packuswb m1, m3 - mova m3, m0 - TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 - pmaddubsw m4, m3, [r4 - 9 * 16] ; [7] - pmulhrsw m4, m7 - pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] - pmulhrsw m3, m7 - packuswb m4, m3 - pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] - pmulhrsw m5, m7 - pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] - pmulhrsw m6, m7 - packuswb m5, m6 - pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] - pmulhrsw m6, m7 - movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] - punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] - palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 
5, 5, 4, 4, 3, 3, 2, 2, 1] - pmaddubsw m3, m0, [r4 + 10 * 16] ; [26] - pmulhrsw m3, m7 - packuswb m6, m3 - pmaddubsw m1, m0, [r4 + 16] ; [17] - pmulhrsw m1, m7 - pmaddubsw m3, m0, [r4 - 8 * 16] ; [8] - pmulhrsw m3, m7 - packuswb m1, m3 - TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 - pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] - pmulhrsw m4, m7 - pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] - pmulhrsw m5, m7 - packuswb m4, m5 - pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] - pmulhrsw m5, m7 - pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] - pmulhrsw m6, m7 - packuswb m5, m6 - movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - %if ((%1 & %2) == 1) - pinsrb m2, [r3], 0 - %endif - palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] - pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] - pmulhrsw m6, m7 - pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] - pmulhrsw m1, m7 - packuswb m6, m1 - pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] - pmulhrsw m1, m7 - movu m0, [pb_fact0] - pshufb m2, m0 - pmovzxbw m2, m2 - packuswb m1, m2 - TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 -%endmacro - -INIT_XMM sse4 -cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize) -%define above [rsp + 0 * mmsize] - mov r3, r2 - add r2, 64 - lea r4, [ang_table + 16 * 16] - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride - mova m7, [pw_1024] - - MODE_13_23_ROW0 1 - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 7 - - MODE_13_23 1, 1 - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - mov r3, 2 -.loop: - MODE_13_23 1, 0 - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec r3 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_14, 3,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2] - movu m1, [r2 + 15] - pshufb m0, 
[c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] - pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] - pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] - palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] - mova [rsp], m0 - movu m0, [r2 + 1 + 64] - movu m1, [r2 + 1 + 16 + 64] - movu [rsp + 13], m0 - movu [rsp + 13 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 13] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 4] - palignr m0, m7, 3 - mova m1, m0 - palignr m2, m7, 2 - mova m3, m2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m4 - PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24 - - ; Row[8 - 15] - movu m7, [r2 - 7] - palignr m0, m7, 3 - palignr m1, m7, 2 - mova m2, m1 - mova m3, m1 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16 - - ; Row[16 - 23] - movu m7, [r2 - 10] - palignr m0, m7, 3 - palignr m1, m7, 2 - mova m2, m1 - palignr m3, m7, 1 - mova m4, m3 - mova m5, m3 - mova m6, m7 - PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8 - - ; Row[24 - 31] - movu m7, [r2 - 13] - palignr m0, m7, 2 - mova m1, m0 - mova m2, m0 - palignr m3, m7, 1 - mova m4, m3 - mova m5, m7 - mova m6, m7 - PROC32_8x8 3, 1, 27,14,1,20,7,26,13,0 - - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_15, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2] - movu m1, [r2 + 15] - pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] - pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 
30] - mova [rsp], m1 - movu [rsp + 8], m0 - movu m0, [r2 + 1 + 64] - movu m1, [r2 + 1 + 16 + 64] - movu [rsp + 17], m0 - movu [rsp + 17 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 17] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 5] - palignr m0, m7, 4 - palignr m1, m7, 3 - mova m2, m1 - palignr m3, m7, 2 - mova m4, m3 - palignr m5, m7, 1 - mova m6, m5 - PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24 - - ; Row[8 - 15] - movu m7, [r2 - 9] - palignr m0, m7, 4 - palignr m1, m7, 3 - mova m2, m1 - palignr m3, m7, 2 - mova m4, m3 - palignr m5, m7, 1 - mova m6, m5 - PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16 - - ; Row[16 - 23] - movu m7, [r2 - 13] - palignr m0, m7, 3 - mova m1, m0 - palignr m2, m7, 2 - mova m3, m2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8 - - ; Row[24 - 31] - movu m7, [r2 - 17] - palignr m0, m7, 3 - mova m1, m0 - palignr m2, m7, 2 - mova m3, m2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0 - - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_16, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2] - movu m1, [r2 + 15] - pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] - pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] - mova [rsp], m1 - movu [rsp + 10], m0 - movu m0, [r2 + 1 + 64] - movu m1, [r2 + 1 + 16 + 64] - movu [rsp + 21], m0 - movu [rsp + 21 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 21] 
; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 6] - palignr m0, m7, 5 - palignr m1, m7, 4 - mova m2, m1 - palignr m3, m7, 3 - palignr m4, m7, 2 - mova m5, m4 - palignr m6, m7, 1 - PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24 - - ; Row[8 - 15] - movu m7, [r2 - 11] - palignr m0, m7, 5 - palignr m1, m7, 4 - palignr m2, m7, 3 - mova m3, m2 - palignr m4, m7, 2 - palignr m5, m7, 1 - mova m6, m5 - PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16 - - ; Row[16 - 23] - movu m7, [r2 - 16] - palignr m0, m7, 4 - mova m1, m0 - palignr m2, m7, 3 - palignr m3, m7, 2 - mova m4, m3 - palignr m5, m7, 1 - mova m6, m7 - PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8 - - ; Row[24 - 31] - movu m7, [r2 - 21] - palignr m0, m7, 4 - palignr m1, m7, 3 - mova m2, m1 - palignr m3, m7, 2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0 - - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_17, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2] - movu m1, [r2 + 16] - pshufb m0, [c_mode32_17_0] - pshufb m1, [c_mode32_17_0] - mova [rsp ], m1 - movu [rsp + 13], m0 - movu m0, [r2 + 1 + 64] - movu m1, [r2 + 1 + 16 + 64] - movu [rsp + 26], m0 - movu [rsp + 26 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 25] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - 
; Row[0 - 7] - movu m7, [r2 - 6] - palignr m0, m7, 6 - palignr m1, m7, 5 - palignr m2, m7, 4 - palignr m3, m7, 3 - palignr m4, m7, 2 - mova m5, m4 - palignr m6, m7, 1 - PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16 - - ; Row[7 - 15] - movu m7, [r2 - 12] - palignr m0, m7, 5 - palignr m1, m7, 4 - mova m2, m1 - palignr m3, m7, 3 - palignr m4, m7, 2 - palignr m5, m7, 1 - mova m6, m7 - PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0 - - ; Row[16 - 23] - movu m7, [r2 - 19] - palignr m0, m7, 6 - palignr m1, m7, 5 - palignr m2, m7, 4 - palignr m3, m7, 3 - palignr m4, m7, 2 - mova m5, m4 - palignr m6, m7, 1 - PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16 - - ; Row[24 - 31] - movu m7, [r2 - 25] - palignr m0, m7, 5 - palignr m1, m7, 4 - mova m2, m1 - palignr m3, m7, 3 - palignr m4, m7, 2 - palignr m5, m7, 1 - mova m6, m7 - PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0 - - lea r0, [r6 + r1 * 4] - lea r6, [r6 + r1 * 8] - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_18, 4,5,5 - movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] - movu m1, [r2 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16] - movu m2, [r2 + 1 + 64] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] - movu m3, [r2 + 17 + 64] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] - - lea r2, [r1 * 2] - lea r3, [r1 * 3] - lea r4, [r1 * 4] - - movu [r0], m0 - movu [r0 + 16], m1 - - pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] - pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 19 30 31 32] - - palignr m4, m0, m2, 15 - movu [r0 + r1], m4 - palignr m4, m1, m0, 15 - movu [r0 + r1 + 16], m4 - palignr m4, m0, m2, 14 - movu [r0 + r2], m4 - palignr m4, m1, m0, 14 - movu [r0 + r2 + 16], m4 - palignr m4, m0, m2, 13 - movu [r0 + r3], m4 - palignr m4, m1, m0, 13 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - palignr m4, m0, m2, 12 - movu [r0], m4 - palignr m4, m1, m0, 12 - movu [r0 + 16], m4 - palignr m4, m0, m2, 11 - movu [r0 + r1], m4 - 
palignr m4, m1, m0, 11 - movu [r0 + r1 + 16], m4 - palignr m4, m0, m2, 10 - movu [r0 + r2], m4 - palignr m4, m1, m0, 10 - movu [r0 + r2 + 16], m4 - palignr m4, m0, m2, 9 - movu [r0 + r3], m4 - palignr m4, m1, m0, 9 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - palignr m4, m0, m2, 8 - movu [r0], m4 - palignr m4, m1, m0, 8 - movu [r0 + 16], m4 - palignr m4, m0, m2, 7 - movu [r0 + r1], m4 - palignr m4, m1, m0, 7 - movu [r0 + r1 + 16], m4 - palignr m4, m0, m2, 6 - movu [r0 + r2], m4 - palignr m4, m1, m0, 6 - movu [r0 + r2 + 16], m4 - palignr m4, m0, m2, 5 - movu [r0 + r3], m4 - palignr m4, m1, m0, 5 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - palignr m4, m0, m2, 4 - movu [r0], m4 - palignr m4, m1, m0, 4 - movu [r0 + 16], m4 - palignr m4, m0, m2, 3 - movu [r0 + r1], m4 - palignr m4, m1, m0, 3 - movu [r0 + r1 + 16], m4 - palignr m4, m0, m2, 2 - movu [r0 + r2], m4 - palignr m4, m1, m0, 2 - movu [r0 + r2 + 16], m4 - palignr m4, m0, m2, 1 - movu [r0 + r3], m4 - palignr m4, m1, m0, 1 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - movu [r0], m2 - movu [r0 + 16], m0 - palignr m4, m2, m3, 15 - movu [r0 + r1], m4 - palignr m4, m0, m2, 15 - movu [r0 + r1 + 16], m4 - palignr m4, m2, m3, 14 - movu [r0 + r2], m4 - palignr m4, m0, m2, 14 - movu [r0 + r2 + 16], m4 - palignr m4, m2, m3, 13 - movu [r0 + r3], m4 - palignr m4, m0, m2, 13 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - palignr m4, m2, m3, 12 - movu [r0], m4 - palignr m4, m0, m2, 12 - movu [r0 + 16], m4 - palignr m4, m2, m3, 11 - movu [r0 + r1], m4 - palignr m4, m0, m2, 11 - movu [r0 + r1 + 16], m4 - palignr m4, m2, m3, 10 - movu [r0 + r2], m4 - palignr m4, m0, m2, 10 - movu [r0 + r2 + 16], m4 - palignr m4, m2, m3, 9 - movu [r0 + r3], m4 - palignr m4, m0, m2, 9 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - palignr m4, m2, m3, 8 - movu [r0], m4 - palignr m4, m0, m2, 8 - movu [r0 + 16], m4 - palignr m4, m2, m3, 7 - movu [r0 + r1], m4 - palignr m4, m0, m2, 7 - movu [r0 + r1 + 16], m4 - palignr m4, 
m2, m3, 6 - movu [r0 + r2], m4 - palignr m4, m0, m2, 6 - movu [r0 + r2 + 16], m4 - palignr m4, m2, m3, 5 - movu [r0 + r3], m4 - palignr m4, m0, m2, 5 - movu [r0 + r3 + 16], m4 - - lea r0, [r0 + r4] - - palignr m4, m2, m3, 4 - movu [r0], m4 - palignr m4, m0, m2, 4 - movu [r0 + 16], m4 - palignr m4, m2, m3, 3 - movu [r0 + r1], m4 - palignr m4, m0, m2, 3 - movu [r0 + r1 + 16], m4 - palignr m4, m2, m3, 2 - movu [r0 + r2], m4 - palignr m4, m0, m2, 2 - movu [r0 + r2 + 16], m4 - palignr m4, m2, m3, 1 - movu [r0 + r3], m4 - palignr m4, m0, m2, 1 - movu [r0 + r3 + 16], m4 - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_19, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2 + 64] - pinsrb m0, [r2], 0 - movu m1, [r2 + 16 + 64] - pshufb m0, [c_mode32_17_0] - pshufb m1, [c_mode32_17_0] - mova [rsp ], m1 - movu [rsp + 13], m0 - movu m0, [r2 + 1] - movu m1, [r2 + 1 + 16] - movu [rsp + 26], m0 - movu [rsp + 26 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 25] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0] ; r6 -> r0 - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 6] - palignr m0, m7, 6 - palignr m1, m7, 5 - palignr m2, m7, 4 - palignr m3, m7, 3 - palignr m4, m7, 2 - mova m5, m4 - palignr m6, m7, 1 - PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16 - - ; Row[7 - 15] - movu m7, [r2 - 12] - palignr m0, m7, 5 - palignr m1, m7, 4 - mova m2, m1 - palignr m3, m7, 3 - palignr m4, m7, 2 - palignr m5, m7, 1 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0 - - ; Row[16 - 23] - movu m7, [r2 - 19] - palignr m0, m7, 6 - palignr m1, m7, 5 - palignr m2, m7, 4 - palignr m3, m7, 3 - palignr m4, m7, 2 - mova m5, m4 - palignr m6, m7, 1 - lea r0, 
[r0 + r1 * 4] - PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16 - - ; Row[24 - 31] - movu m7, [r2 - 25] - palignr m0, m7, 5 - palignr m1, m7, 4 - mova m2, m1 - palignr m3, m7, 3 - palignr m4, m7, 2 - palignr m5, m7, 1 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0 - - add r6, 8 - mov r0, r6 - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_20, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2 + 64] - pinsrb m0, [r2], 0 - movu m1, [r2 + 15 + 64] - pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] - pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] - mova [rsp], m1 - movu [rsp + 10], m0 - movu m0, [r2 + 1] - movu m1, [r2 + 1 + 16] - movu [rsp + 21], m0 - movu [rsp + 21 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 21] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0] ; r6 -> r0 - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 6] - palignr m0, m7, 5 - palignr m1, m7, 4 - mova m2, m1 - palignr m3, m7, 3 - palignr m4, m7, 2 - mova m5, m4 - palignr m6, m7, 1 - PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24 - - ; Row[8 - 15] - movu m7, [r2 - 11] - palignr m0, m7, 5 - palignr m1, m7, 4 - palignr m2, m7, 3 - mova m3, m2 - palignr m4, m7, 2 - palignr m5, m7, 1 - mova m6, m5 - lea r0, [r0 + r1 * 4] - PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16 - - ; Row[16 - 23] - movu m7, [r2 - 16] - palignr m0, m7, 4 - mova m1, m0 - palignr m2, m7, 3 - palignr m3, m7, 2 - mova m4, m3 - palignr m5, m7, 1 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8 - - ; Row[24 - 31] - movu m7, [r2 - 21] - palignr m0, m7, 4 - 
palignr m1, m7, 3 - mova m2, m1 - palignr m3, m7, 2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0 - - add r6, 8 - mov r0, r6 - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_21, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2 + 64] - pinsrb m0, [r2], 0 - movu m1, [r2 + 15 + 64] - pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] - pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] - mova [rsp], m1 - movu [rsp + 8], m0 - movu m0, [r2 + 1] - movu m1, [r2 + 1 + 16] - movu [rsp + 17], m0 - movu [rsp + 17 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 17] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0] ; r6 -> r0 - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 5] - palignr m0, m7, 4 - palignr m1, m7, 3 - mova m2, m1 - palignr m3, m7, 2 - mova m4, m3 - palignr m5, m7, 1 - mova m6, m5 - PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24 - - ; Row[8 - 15] - movu m7, [r2 - 9] - palignr m0, m7, 4 - palignr m1, m7, 3 - mova m2, m1 - palignr m3, m7, 2 - mova m4, m3 - palignr m5, m7, 1 - mova m6, m5 - lea r0, [r0 + r1 * 4] - PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16 - - ; Row[16 - 23] - movu m7, [r2 - 13] - palignr m0, m7, 3 - mova m1, m0 - palignr m2, m7, 2 - mova m3, m2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8 - - ; Row[24 - 31] - movu m7, [r2 - 17] - palignr m0, m7, 3 - mova m1, m0 - palignr m2, m7, 2 - mova m3, m2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 3, 0, 
23,6,21,4,19,2,17,0 - - add r6, 8 - mov r0, r6 - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_22, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2 + 64] - pinsrb m0, [r2], 0 - movu m1, [r2 + 15 + 64] - pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] - pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] - pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] - palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] - mova [rsp], m0 - movu m0, [r2 + 1] - movu m1, [r2 + 1 + 16] - movu [rsp + 13], m0 - movu [rsp + 13 + 16], m1 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 13] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0] ; r6 -> r0 - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2 - 4] - palignr m0, m7, 3 - mova m1, m0 - palignr m2, m7, 2 - mova m3, m2 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m4 - PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24 - - ; Row[8 - 15] - movu m7, [r2 - 7] - palignr m0, m7, 3 - palignr m1, m7, 2 - mova m2, m1 - mova m3, m1 - palignr m4, m7, 1 - mova m5, m4 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16 - - ; Row[16 - 23] - movu m7, [r2 - 10] - palignr m0, m7, 3 - palignr m1, m7, 2 - mova m2, m1 - palignr m3, m7, 1 - mova m4, m3 - mova m5, m3 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8 - - ; Row[24 - 31] - movu m7, [r2 - 13] - palignr m0, m7, 2 - mova m1, m0 - mova m2, m0 - palignr m3, m7, 1 - mova m4, m3 - mova m5, m7 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0 - - add r6, 8 - mov r0, r6 - add r2, 8 - dec byte 
[rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_23, 4,7,8,0-(1*mmsize) -%define above [rsp + 0 * mmsize] - lea r3, [r2 + 64] - lea r4, [ang_table + 16 * 16] - lea r5, [r1 * 3] ; r5 -> 3 * stride - mov r6, r0 - mova m7, [pw_1024] - - MODE_13_23_ROW0 0 - add r6, 8 - mov r0, r6 - add r2, 7 - mov r3, 3 -.loop: - MODE_13_23 0, 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r3 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize) - %define above [rsp + 0 * mmsize] - lea r3, [r2 + 64] - lea r4, [ang_table + 16 * 16] - lea r5, [r1 * 3] ; r5 -> 3 * stride - mov r6, r0 - mova m7, [pw_1024] - - MODE_12_24_ROW0 0 - add r6, 8 - mov r0, r6 - add r2, 7 - mov r3, 3 -.loop: - MODE_12_24 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r3 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_25, 4,7,8 - ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line - mov r6, rsp - sub rsp, 64+gprsize - and rsp, ~63 - mov [rsp+64], r6 - - ; collect reference pixel - movu m0, [r2 + 16 + 64] - pxor m1, m1 - pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] - mova [rsp], m0 - movu m0, [r2] - movu m1, [r2 + 16] - movu m2, [r2 + 32] - movu [rsp + 1], m0 - movu [rsp + 1 + 16], m1 - movu [rsp + 1 + 32], m2 - mov [rsp + 63], byte 4 - - ; filter - lea r2, [rsp + 1] ; r2 -> [0] - lea r3, [c_shuf8_0] ; r3 -> shuffle8 - lea r4, [ang_table] ; r4 -> ang_table - lea r5, [r1 * 3] ; r5 -> 3 * stride - lea r6, [r0] ; r6 -> r0 - mova m5, [pw_1024] ; m5 -> 1024 - mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - -.loop: - ; Row[0 - 7] - movu m7, [r2] - mova m0, m7 - mova m1, m7 - mova m2, m7 - mova m3, m7 - mova m4, m7 - mova m5, m7 - mova m6, m7 - PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16 - - ; Row[8 - 15] - movu m7, [r2] - mova m0, m7 - mova m1, m7 - mova m2, m7 - mova m3, m7 - mova m4, m7 - mova m5, m7 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0 - - ; Row[16 - 23] - movu m7, [r2 - 
1] - mova m0, m7 - mova m1, m7 - mova m2, m7 - mova m3, m7 - mova m4, m7 - mova m5, m7 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16 - - ; Row[24 - 31] - movu m7, [r2 - 1] - mova m0, m7 - mova m1, m7 - mova m2, m7 - mova m3, m7 - mova m4, m7 - mova m5, m7 - mova m6, m7 - lea r0, [r0 + r1 * 4] - PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0 - - add r6, 8 - mov r0, r6 - add r2, 8 - dec byte [rsp + 63] - jnz .loop - mov rsp, [rsp+64] - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize) -%define m8 [rsp + 0 * mmsize] -%define m9 [rsp + 1 * mmsize] - mov r6, 2 - movu m0, [r2 + 64] - pinsrb m0, [r2], 0 - movu m1, [r2 + 1 + 64] - mova m8, m0 - mova m9, m1 - mov r3d, r4d - lea r4, [r1 * 3] - -.loop: - movu m0, [r2 + 1] - - movu [r0], m0 - movu [r0 + r1], m0 - movu [r0 + r1 * 2], m0 - movu [r0 + r4], m0 - lea r5, [r0 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r0 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - lea r5, [r5 + r1 * 4] - movu [r5], m0 - movu [r5 + r1], m0 - movu [r5 + r1 * 2], m0 - movu [r5 + r4], m0 - -; filter - cmp r3d, byte 0 - jz .quit - - pxor m4, m4 - pshufb m0, m4 - 
pmovzxbw m0, m0 - mova m1, m0 - movu m2, m8 - movu m3, m9 - - pshufb m2, m4 - pmovzxbw m2, m2 - movhlps m4, m3 - pmovzxbw m3, m3 - pmovzxbw m4, m4 - psubw m3, m2 - psubw m4, m2 - psraw m3, 1 - psraw m4, 1 - paddw m0, m3 - paddw m1, m4 - packuswb m0, m1 - - pextrb [r0], m0, 0 - pextrb [r0 + r1], m0, 1 - pextrb [r0 + r1 * 2], m0, 2 - pextrb [r0 + r4], m0, 3 - lea r5, [r0 + r1 * 4] - pextrb [r5], m0, 4 - pextrb [r5 + r1], m0, 5 - pextrb [r5 + r1 * 2], m0, 6 - pextrb [r5 + r4], m0, 7 - lea r5, [r5 + r1 * 4] - pextrb [r5], m0, 8 - pextrb [r5 + r1], m0, 9 - pextrb [r5 + r1 * 2], m0, 10 - pextrb [r5 + r4], m0, 11 - lea r5, [r5 + r1 * 4] - pextrb [r5], m0, 12 - pextrb [r5 + r1], m0, 13 - pextrb [r5 + r1 * 2], m0, 14 - pextrb [r5 + r4], m0, 15 - -.quit: - lea r2, [r2 + 16] - add r0, 16 - dec r6d - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_27, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_9_27 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_28, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_8_28 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_29, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_7_29 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_30, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_6_30 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_31, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_5_31 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - 
RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_32, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_4_32 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - RET - -INIT_XMM sse4 -cglobal intra_pred_ang32_33, 3,7,8 - lea r3, [ang_table + 16 * 16] - mov r4d, 4 - lea r5, [r1 * 3] - mov r6, r0 - mova m7, [pw_1024] -.loop: - MODE_3_33 0 - add r6, 8 - mov r0, r6 - add r2, 8 - dec r4 - jnz .loop - RET - -;----------------------------------------------------------------------------- -; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) -;----------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal all_angs_pred_4x4, 4, 4, 8 - -; mode 2 - -movh m0, [r1 + 10] -movd [r0], m0 - -palignr m1, m0, 1 -movd [r0 + 4], m1 - -palignr m1, m0, 2 -movd [r0 + 8], m1 - -palignr m1, m0, 3 -movd [r0 + 12], m1 - -; mode 3 - -mova m2, [pw_1024] - -pslldq m1, m0, 1 -pinsrb m1, [r1 + 9], 0 -punpcklbw m1, m0 - -lea r3, [ang_table] - -pmaddubsw m6, m1, [r3 + 26 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 16], m6 - -palignr m0, m1, 2 - -mova m7, [r3 + 20 * 16] - -pmaddubsw m3, m0, m7 -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 20], m3 - -; mode 6 [row 3] -movd [r0 + 76], m3 - -palignr m3, m1, 4 - -pmaddubsw m4, m3, [r3 + 14 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 24], m4 - -palignr m4, m1, 6 - -pmaddubsw m4, [r3 + 8 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 28], m4 - -; mode 4 - -pmaddubsw m5, m1, [r3 + 21 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 32], m5 - -pmaddubsw m5, m0, [r3 + 10 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 36], m5 - -pmaddubsw m5, m0, [r3 + 31 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 40], m5 - -pmaddubsw m4, m3, m7 -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 44], m4 - -; mode 5 - -pmaddubsw m5, m1, [r3 + 17 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 48], m5 - 
-pmaddubsw m5, m0, [r3 + 2 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 52], m5 - -pmaddubsw m5, m0, [r3 + 19 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 56], m5 - -pmaddubsw m4, m3, [r3 + 4 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 60], m4 - -; mode 6 - -pmaddubsw m5, m1, [r3 + 13 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 64], m5 - -movd [r0 + 68], m6 - -pmaddubsw m5, m0, [r3 + 7 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 72], m5 - -; mode 7 - -pmaddubsw m5, m1, [r3 + 9 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 80], m5 - -pmaddubsw m5, m1, [r3 + 18 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 84], m5 - -pmaddubsw m5, m1, [r3 + 27 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 88], m5 - -pmaddubsw m5, m0, [r3 + 4 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 92], m5 - -; mode 8 - -pmaddubsw m5, m1, [r3 + 5 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 96], m5 - -pmaddubsw m5, m1, [r3 + 10 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 100], m5 - -pmaddubsw m5, m1, [r3 + 15 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 104], m5 - -pmaddubsw m5, m1, [r3 + 20 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 108], m5 - -; mode 9 - -pmaddubsw m5, m1, [r3 + 2 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 112], m5 - -pmaddubsw m5, m1, [r3 + 4 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 116], m5 - -pmaddubsw m5, m1, [r3 + 6 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 120], m5 - -pmaddubsw m5, m1, [r3 + 8 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 124], m5 - -; mode 10 - -movd m3, [r1 + 9] -pshufd m4, m3, 0 -movu [r0 + 128], m4 - -pxor m5, m5 -movd m7, [r1 + 1] -pshufd m4, m7, 0 -punpcklbw m4, m5 - -pinsrb m7, [r1], 0 -pshufb m6, m7, m5 -punpcklbw m6, m5 - -psubw m4, m6 -psraw m4, 1 - -pshufb m6, m3, m5 -punpcklbw m6, m5 - -paddw m4, m6 -packuswb m4, m5 - -pextrb [r0 + 128], m4, 0 -pextrb [r0 + 132], m4, 1 -pextrb [r0 + 136], m4, 2 -pextrb [r0 + 140], 
m4, 3 - -; mode 11 - -pslldq m1, m1, 2 -pinsrb m1, [r1], 0 -pinsrb m1, [r1 + 9], 1 - -pmaddubsw m3, m1, [r3 + 30 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 144], m3 - -pmaddubsw m3, m1, [r3 + 28 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 148], m3 - -pmaddubsw m3, m1, [r3 + 26 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 152], m3 - -pmaddubsw m3, m1, [r3 + 24 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 156], m3 - -; mode 12 - -pmaddubsw m3, m1, [r3 + 27 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 160], m3 - -pmaddubsw m3, m1, [r3 + 22 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 164], m3 - -pmaddubsw m3, m1, [r3 + 17 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 168], m3 - -pmaddubsw m3, m1, [r3 + 12 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 172], m3 - -; mode 13 - -pmaddubsw m3, m1, [r3 + 23 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 176], m3 - -pmaddubsw m3, m1, [r3 + 14 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 180], m3 - -pmaddubsw m3, m1, [r3 + 5 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 184], m3 - -pslldq m5, m1, 2 -pinsrb m5, [r1 + 0], 1 -pinsrb m5, [r1 + 4], 0 - -pmaddubsw m4, m5, [r3 + 28 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 188], m4 - -; mode 14 - -pmaddubsw m4, m1, [r3 + 19 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 192], m4 - -pmaddubsw m7, m1, [r3 + 6 * 16] -pmulhrsw m7, m2 -packuswb m7, m7 -movd [r0 + 196], m7 - -pinsrb m5, [r1 + 2], 0 - -pmaddubsw m4, m5, [r3 + 25 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 200], m4 - -pmaddubsw m4, m5, [r3 + 12 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 204], m4 - -; mode 15 - -pmaddubsw m4, m1, [r3 + 15 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 208], m4 - -pmaddubsw m4, m5, [r3 + 30 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 212], m4 - -pmaddubsw m4, m5, [r3 + 13 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 216], m4 - -pslldq m4, m5, 2 -pinsrb m4, [r1 + 2], 1 
-pinsrb m4, [r1 + 4], 0 - -pmaddubsw m6, m4, [r3 + 28 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 220], m6 - -; mode 16 - -pmaddubsw m6, m1, [r3 + 11 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 224], m6 - -pmaddubsw m6, m5, [r3 + 22 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 228], m6 - -pmaddubsw m6, m5, [r3 + 1 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 232], m6 - -pinsrb m4, [r1 + 3], 0 - -pmaddubsw m4, [r3 + 12 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 236], m4 - -; mode 17 - -movd [r0 + 240], m7 - -pslldq m1, 2 -pinsrb m1, [r1 + 1], 0 -pinsrb m1, [r1 + 0], 1 - -pmaddubsw m3, m1, [r3 + 12 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 244], m3 - -pslldq m1, 2 -pinsrb m1, [r1 + 1], 1 -pinsrb m1, [r1 + 2], 0 - -pmaddubsw m3, m1, [r3 + 18 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 248], m3 - -pslldq m1, 2 -pinsrb m1, [r1 + 2], 1 -pinsrb m1, [r1 + 4], 0 - -pmaddubsw m1, [r3 + 24 * 16] -pmulhrsw m1, m2 -packuswb m1, m1 -movd [r0 + 252], m1 - -; mode 18 - -movh m1, [r1] -movd [r0 + 256], m1 - -pslldq m3, m1, 1 -pinsrb m3, [r1 + 9], 0 -movd [r0 + 260], m3 - -pslldq m4, m3, 1 -pinsrb m4, [r1 + 10], 0 -movd [r0 + 264], m4 - -pslldq m4, 1 -pinsrb m4, [r1 + 11], 0 -movd [r0 + 268], m4 - -; mode 19 - -palignr m3, m1, 1 -punpcklbw m1, m3 - -pmaddubsw m7, m1, [r3 + 6 * 16] -pmulhrsw m7, m2 -packuswb m7, m7 -movd [r0 + 272], m7 - -pslldq m3, m1, 2 -pinsrb m3, [r1], 1 -pinsrb m3, [r1 + 9], 0 - -pmaddubsw m4, m3, [r3 + 12 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 276], m4 - -pslldq m4, m3, 2 -pinsrb m4, [r1 + 9], 1 -pinsrb m4, [r1 + 10], 0 - -pmaddubsw m5, m4, [r3 + 18 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 280], m5 - -pslldq m4, 2 -pinsrb m4, [r1 + 10], 1 -pinsrb m4, [r1 + 12], 0 - -pmaddubsw m4, [r3 + 24 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 284], m4 - -; mode 20 - -pmaddubsw m4, m1, [r3 + 11 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 288], m4 - -pinsrb m3, [r1 + 10], 0 - 
-pmaddubsw m4, m3, [r3 + 22 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 292], m4 - -pmaddubsw m4, m3, [r3 + 1 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 296], m4 - -pslldq m6, m3, 2 -pinsrb m6, [r1 + 10], 1 -pinsrb m6, [r1 + 11], 0 - -pmaddubsw m5, m6, [r3 + 12 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 300], m5 - -; mode 21 - -pmaddubsw m4, m1, [r3 + 15 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 304], m4 - -pmaddubsw m4, m3, [r3 + 30 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 308], m4 - -pmaddubsw m4, m3, [r3 + 13 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 312], m4 - -pinsrb m6, [r1 + 12], 0 - -pmaddubsw m6, [r3 + 28 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 316], m6 - -; mode 22 - -pmaddubsw m4, m1, [r3 + 19 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 320], m4 - -movd [r0 + 324], m7 - -pmaddubsw m4, m3, [r3 + 25 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 328], m4 - -pmaddubsw m4, m3, [r3 + 12 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 332], m4 - -; mode 23 - -pmaddubsw m4, m1, [r3 + 23 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 336], m4 - -pmaddubsw m4, m1, [r3 + 14 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 340], m4 - -pmaddubsw m4, m1, [r3 + 5 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 344], m4 - -pinsrb m3, [r1 + 12], 0 - -pmaddubsw m3, [r3 + 28 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 348], m3 - -; mode 24 - -pmaddubsw m3, m1, [r3 + 27 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 352], m3 - -pmaddubsw m3, m1, [r3 + 22 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 356], m3 - -pmaddubsw m3, m1, [r3 + 17 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 360], m3 - -pmaddubsw m3, m1, [r3 + 12 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 364], m3 - -; mode 25 - -pmaddubsw m3, m1, [r3 + 30 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 368], m3 - -pmaddubsw m3, m1, [r3 + 28 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 
-movd [r0 + 372], m3 - -pmaddubsw m3, m1, [r3 + 26 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 376], m3 - -pmaddubsw m1, [r3 + 24 * 16] -pmulhrsw m1, m2 -packuswb m1, m1 -movd [r0 + 380], m1 - -; mode 26 - -movh m1, [r1 + 1] -pshufd m3, m1, 0 -movu [r0 + 384], m3 - -pxor m4, m4 -movd m5, [r1 + 9] -pshufd m5, m5, 0 -punpcklbw m5, m4 - -pinsrb m6, [r1], 0 -pshufb m6, m4 -punpcklbw m6, m4 - -psubw m5, m6 -psraw m5, 1 - -pshufb m6, m1, m4 -punpcklbw m6, m4 - -paddw m5, m6 -packuswb m5, m4 - -pextrb [r0 + 384], m5, 0 -pextrb [r0 + 388], m5, 1 -pextrb [r0 + 392], m5, 2 -pextrb [r0 + 396], m5, 3 - -; mode 27 - -palignr m3, m1, 1 -punpcklbw m1, m3 - -pmaddubsw m3, m1, [r3 + 2 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 400], m3 - -pmaddubsw m3, m1, [r3 + 4 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 404], m3 - -pmaddubsw m3, m1, [r3 + 6 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 408], m3 - -pmaddubsw m3, m1, [r3 + 8 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 412], m3 - -; mode 28 - -pmaddubsw m3, m1, [r3 + 5 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 416], m3 - -pmaddubsw m3, m1, [r3 + 10 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 420], m3 - -pmaddubsw m3, m1, [r3 + 15 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 424], m3 - -pmaddubsw m3, m1, [r3 + 20 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 428], m3 - -; mode 29 - -pmaddubsw m3, m1, [r3 + 9 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 432], m3 - -pmaddubsw m3, m1, [r3 + 18 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 436], m3 - -pmaddubsw m3, m1, [r3 + 27 * 16] -pmulhrsw m3, m2 -packuswb m3, m3 -movd [r0 + 440], m3 - -palignr m3, m1, 2 - -pmaddubsw m4, m3, [r3 + 4 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 444], m4 - -; mode 30 - -pmaddubsw m4, m1, [r3 + 13 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 448], m4 - -pmaddubsw m7, m1, [r3 + 26 * 16] -pmulhrsw m7, m2 -packuswb m7, m7 -movd [r0 + 452], m7 - -pmaddubsw m5, 
m3, [r3 + 7 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 456], m5 - -pmaddubsw m6, m3, [r3 + 20 * 16] -pmulhrsw m6, m2 -packuswb m6, m6 -movd [r0 + 460], m6 - -; mode 31 - -pmaddubsw m4, m1, [r3 + 17 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 464], m4 - -pmaddubsw m5, m3, [r3 + 2 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 468], m5 - -pmaddubsw m5, m3, [r3 + 19 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 472], m5 - -palignr m4, m3, 2 - -pmaddubsw m5, m4, [r3 + 4 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 476], m5 - -; mode 32 - -pmaddubsw m5, m1, [r3 + 21 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 480], m5 - -pmaddubsw m5, m3, [r3 + 10 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 484], m5 - -pmaddubsw m5, m3, [r3 + 31 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 488], m5 - -pmaddubsw m5, m4, [r3 + 20 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 492], m5 - -; mode 33 - -movd [r0 + 496], m7 - -movd [r0 + 500], m6 - -pmaddubsw m5, m4, [r3 + 14 * 16] -pmulhrsw m5, m2 -packuswb m5, m5 -movd [r0 + 504], m5 - -psrldq m4, 2 - -pmaddubsw m4, [r3 + 8 * 16] -pmulhrsw m4, m2 -packuswb m4, m4 -movd [r0 + 508], m4 - -; mode 34 - -movh m7, [r1 + 2] -movd [r0 + 512], m7 - -psrldq m7, 1 -movd [r0 + 516], m7 - -psrldq m7, 1 -movd [r0 + 520], m7 - -psrldq m7, 1 -movd [r0 + 524], m7 - -RET - -;------------------------------------------------------------------------------ -; void all_angs_pred_8x8(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) -;------------------------------------------------------------------------------ -INIT_XMM sse4 -cglobal all_angs_pred_8x8, 3,4,8 - ; mode 2 - - movu m0, [r2 + 18] - palignr m1, m0, 1 - punpcklqdq m2, m0, m1 - movu [r0], m2 - - palignr m1, m0, 2 - palignr m2, m0, 3 - punpcklqdq m1, m2 - movu [r0 + 16], m1 - - palignr m1, m0, 4 - palignr m2, m0, 5 - punpcklqdq m1, m2 - movu [r0 + 32], m1 - - palignr m1, m0, 6 - palignr m2, m0, 7 - punpcklqdq m1, m2 - movu [r0 + 48], m1 
- - ; mode 3 [row 0, 1] - - mova m7, [pw_1024] - lea r3, [ang_table] - - movu m0, [r1 + 17] - - palignr m1, m0, 1 - palignr m2, m0, 2 - - punpcklbw m3, m0, m1 - pmaddubsw m4, m3, [r3 + 26 * 16] - pmulhrsw m4, m7 - - punpcklbw m1, m2 - pmaddubsw m5, m1, [r3 + 20 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - - movu [r0 + 64], m4 - - ; mode 6 [row 1] - - movh [r0 + 264], m4 - - ; mode 6 [row 3] - - movhps [r0 + 280], m4 - - ; mode 4 [row 0, 1] - - pmaddubsw m4, m3, [r3 + 21 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 10 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 128], m4 - - ; mode 5 [row 0, 1] - - pmaddubsw m4, m3, [r3 + 17 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 2 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 192], m4 - - ; mode 6 [row 0] - - pmaddubsw m4, m3, [r3 + 13 * 16] - pmulhrsw m4, m7 - - pxor m5, m5 - - packuswb m4, m5 - movh [r0 + 256], m4 - - ; mode 7 [row 0, 1] - - pmaddubsw m4, m3, [r3 + 9 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 18 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 320], m4 - - ; mode 8 [row 0, 1] - - pmaddubsw m4, m3, [r3 + 5 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 10 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 384], m4 - - ; mode 8 [row 2, 3] - - pmaddubsw m4, m3, [r3 + 15 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 20 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 400], m4 - - ; mode 8 [row 4, 5] - - pmaddubsw m4, m3, [r3 + 25 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 30 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 416], m4 - - ; mode 8 [row 6, 7] - - pmaddubsw m4, m1, [r3 + 3 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 8 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 432], m4 - - ; mode 9 [row 0, 1] - - pmaddubsw m4, m3, [r3 + 2 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 4 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 448], m4 - - ; mode 9 [row 2, 3] - - pmaddubsw m4, 
m3, [r3 + 6 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 8 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 464], m4 - - ; mode 9 [row 4, 5] - - pmaddubsw m4, m3, [r3 + 10 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 12 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 480], m4 - - ; mode 9 [row 6, 7] - - pmaddubsw m4, m3, [r3 + 14 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 16 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 496], m4 - - ; mode 7 [row 2, 3] - - pmaddubsw m4, m3, [r3 + 27 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 4 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 336], m4 - - ; mode 7 [row 4, 5] - - pmaddubsw m4, m1, [r3 + 13 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 22 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 352], m4 - - ; mode 6 [row 2] - - pmaddubsw m4, m1, [r3 + 7 * 16] - pmulhrsw m4, m7 - - pxor m5, m5 - - packuswb m4, m5 - movh [r0 + 272], m4 - - ; mode 3 [row 2, 3] - - palignr m1, m0, 3 - palignr m3, m0, 4 - - punpcklbw m2, m1 - pmaddubsw m5, m2, [r3 + 14 * 16] - pmulhrsw m5, m7 - - punpcklbw m1, m3 - pmaddubsw m6, m1, [r3 + 8 * 16] - pmulhrsw m6, m7 - - packuswb m5, m6 - movu [r0 + 80], m5 - - ; mode 6 [row 7] - - movhps [r0 + 312], m5 - - ; mode 6 [row 5] - - movh [r0 + 296], m5 - - ; mode 4 [calculate and store row 4, 5] - - pmaddubsw m4, m1, [r3 + 9 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 30 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 160], m4 - - ; mode 5 [row 4, 5] - - pmaddubsw m4, m2, [r3 + 21 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 6 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 224], m4 - - ; mode 6 [row 4, 5] - - pmaddubsw m5, m2, [r3 + 1 * 16] - pmulhrsw m5, m7 - - pxor m6, m6 - - packuswb m5, m6 - movh [r0 + 288], m5 - - ; mode 6 [row 6, 7] - - pmaddubsw m5, m2, [r3 + 27 * 16] - pmulhrsw m5, m7 - - pxor m6, m6 - - packuswb m5, m6 - movh [r0 + 304], m5 - - ; mode 5 [calculate row 6] - - 
pmaddubsw m6, m1, [r3 + 23 * 16] - pmulhrsw m6, m7 - - ; mode 3 [row 4, 5] - - palignr m1, m0, 5 - - punpcklbw m3, m1 - pmaddubsw m4, m3, [r3 + 2 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 28 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 96], m4 - - ; mode 4 [calculate row 7] - - pmaddubsw m5, m3, [r3 + 19 * 16] - pmulhrsw m5, m7 - - ; mode 5 [calculate row 6] - - pmaddubsw m4, m3, [r3 + 8 * 16] - pmulhrsw m4, m7 - - packuswb m6, m4 - movu [r0 + 240], m6 - - ; mode 3 [row 6, 7] - - palignr m2, m0, 6 - palignr m3, m0, 7 - - punpcklbw m1, m2 - pmaddubsw m4, m1, [r3 + 22 * 16] - pmulhrsw m4, m7 - - punpcklbw m2, m3 - pmaddubsw m2, [r3 + 16 * 16] - pmulhrsw m2, m7 - - packuswb m4, m2 - movu [r0 + 112], m4 - - ; mode 4 [calculate row 7] - - pmaddubsw m2, m1, [r3 + 8 * 16] - pmulhrsw m2, m7 - - ; mode 4 [store row 6 and 7] - - packuswb m5, m2 - movu [r0 + 176], m5 - - ; mode 4 [row 2, 3] - - palignr m1, m0, 1 - palignr m2, m0, 2 - palignr m3, m0, 3 - - punpcklbw m1, m2 - pmaddubsw m4, m1, [r3 + 31 * 16] - pmulhrsw m4, m7 - - punpcklbw m2, m3 - pmaddubsw m5, m2, [r3 + 20 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 144], m4 - - ; mode 5 [row 2, 3] - - pmaddubsw m4, m1, [r3 + 19 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m2, [r3 + 4 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 208], m4 - - ; mode 7 [row 6, 7] - - pmaddubsw m4, m1, [r3 + 31 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m2, [r3 + 8 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 368], m4 - - ; mode 10 - - pshufb m1, m0, [tab_Si] - movu [r0 + 512], m1 - movu [r0 + 528], m1 - movu [r0 + 544], m1 - movu [r0 + 560], m1 - - pxor m0, m0 - - pshufb m1, m1, m0 - punpcklbw m1, m0 - - movu m2, [r1] - - pshufb m3, m2, m0 - punpcklbw m3, m0 - - psrldq m4, m2, 1 - punpcklbw m4, m0 - - movu m2, [r1 + 9] - punpcklbw m2, m0 - - psubw m4, m3 - psubw m2, m3 - - psraw m4, 1 - psraw m2, 1 - - paddw m4, m1 - paddw m2, m1 - - packuswb m4, m2 - - pextrb [r0 + 512], m4, 0 - pextrb [r0 
+ 520], m4, 1 - pextrb [r0 + 528], m4, 2 - pextrb [r0 + 536], m4, 3 - pextrb [r0 + 544], m4, 4 - pextrb [r0 + 552], m4, 5 - pextrb [r0 + 560], m4, 6 - pextrb [r0 + 568], m4, 7 - - ; mode 11 [row 0, 1] - - movu m0, [r1 + 16] - pinsrb m0, [r1], 0 - palignr m1, m0, 1 - punpcklbw m2, m0, m1 - - pmaddubsw m3, m2, [r3 + 30 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 28 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 576], m3 - - ; mode 11 [row 2, 3] - - pmaddubsw m3, m2, [r3 + 26 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 24 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 592], m3 - - ; mode 11 [row 4, 5] - - pmaddubsw m3, m2, [r3 + 22 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 20 * 16] - pmulhrsw m4, m7 - - packuswb m5, m3, m4 - movu [r0 + 608], m5 - - ; mode 12 [row 0, 1] - - pmaddubsw m4, m2, [r3 + 27 * 16] - pmulhrsw m4, m7 - - packuswb m4, m3 - movu [r0 + 640], m4 - - ; mode 11 [row 6, 7] - - pmaddubsw m3, m2, [r3 + 18 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 16 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 624], m3 - - ; mode 12 [row 2, 3] - - pmaddubsw m3, m2, [r3 + 17 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 12 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 656], m3 - - ; mode 12 [row 4, 5] - - pmaddubsw m3, m2, [r3 + 7 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 2 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 672], m3 - - ; mode 12 [row 6, 7] - - pslldq m3, m2, 2 - pinsrb m3, [r1 + 0], 1 - pinsrb m3, [r1 + 6], 0 - - pmaddubsw m4, m3, [r3 + 29 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 24 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 688], m4 - - ; mode 13 [row 0, 1] - - pmaddubsw m4, m2, [r3 + 23 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m2, [r3 + 14 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 704], m4 - - ; mode 13 [row 2, 3] - - pmaddubsw m4, m2, [r3 + 5 * 16] - pmulhrsw m4, m7 - - pinsrb m3, [r1 + 4], 0 - pmaddubsw m5, m3, 
[r3 + 28 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 720], m4 - - ; mode 13 [row 4, 5] - - pmaddubsw m4, m3, [r3 + 19 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 10 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 736], m4 - - ; mode 13 [row 6, 7] - - pmaddubsw m4, m3, [r3 + 1 * 16] - pmulhrsw m4, m7 - - pslldq m5, m3, 2 - pinsrb m5, [r1 + 4], 1 - pinsrb m5, [r1 + 7], 0 - - pmaddubsw m5, [r3 + 24 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 752], m4 - - ; mode 14 [row 0, 1] - - pmaddubsw m4, m2, [r3 + 19 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m2, [r3 + 6 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 768], m4 - - ; mode 14 [row 2, 3] - - pinsrb m3, [r1 + 2], 0 - - pmaddubsw m4, m3, [r3 + 25 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 12 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 784], m4 - - ; mode 14 [row 4, 5] - - pslldq m1, m3, 2 - pinsrb m1, [r1 + 2], 1 - pinsrb m1, [r1 + 5], 0 - - pmaddubsw m4, m1, [r3 + 31 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m1, [r3 + 18 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 800], m4 - - ; mode 14 [row 6, 7] - - pmaddubsw m4, m1, [r3 + 5 * 16] - pmulhrsw m4, m7 - - pslldq m1, 2 - pinsrb m1, [r1 + 5], 1 - pinsrb m1, [r1 + 7], 0 - - pmaddubsw m5, m1, [r3 + 24 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 816], m4 - - ; mode 15 [row 0, 1] - - pmaddubsw m4, m2, [r3 + 15 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 30 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 832], m4 - - ; mode 15 [row 2, 3] - - pmaddubsw m4, m3, [r3 + 13 * 16] - pmulhrsw m4, m7 - - pslldq m1, m3, 2 - pinsrb m1, [r1 + 2], 1 - pinsrb m1, [r1 + 4], 0 - - pmaddubsw m5, m1, [r3 + 28 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 848], m4 - - ; mode 15 [row 4, 5] - - pmaddubsw m4, m1, [r3 + 11 * 16] - pmulhrsw m4, m7 - - pslldq m1, 2 - pinsrb m1, [r1 + 4], 1 - pinsrb m1, [r1 + 6], 0 - - pmaddubsw m5, m1, [r3 + 26 * 16] - pmulhrsw m5, m7 - - 
packuswb m4, m5 - movu [r0 + 864], m4 - - ; mode 15 [row 6, 7] - - pmaddubsw m4, m1, [r3 + 9 * 16] - pmulhrsw m4, m7 - - pslldq m1, 2 - pinsrb m1, [r1 + 6], 1 - pinsrb m1, [r1 + 8], 0 - - pmaddubsw m1, [r3 + 24 * 16] - pmulhrsw m1, m7 - - packuswb m4, m1 - movu [r0 + 880], m4 - - ; mode 16 [row 0, 1] - - pmaddubsw m4, m2, [r3 + 11 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 22 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 896], m4 - - ; mode 16 [row 2, 3] - - pmaddubsw m4, m3, [r3 + 1 * 16] - pmulhrsw m4, m7 - - pslldq m3, 2 - pinsrb m3, [r1 + 2], 1 - pinsrb m3, [r1 + 3], 0 - - pmaddubsw m5, m3, [r3 + 12 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 912], m4 - - ; mode 16 [row 4, 5] - - pslldq m3, 2 - pinsrb m3, [r1 + 3], 1 - pinsrb m3, [r1 + 5], 0 - - pmaddubsw m4, m3, [r3 + 23 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m3, [r3 + 2 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 928], m4 - - ; mode 16 [row 6, 7] - - pslldq m3, 2 - pinsrb m3, [r1 + 5], 1 - pinsrb m3, [r1 + 6], 0 - - pmaddubsw m4, m3, [r3 + 13 * 16] - pmulhrsw m4, m7 - - pslldq m3, 2 - pinsrb m3, [r1 + 6], 1 - pinsrb m3, [r1 + 8], 0 - - pmaddubsw m3, [r3 + 24 * 16] - pmulhrsw m3, m7 - - packuswb m4, m3 - movu [r0 + 944], m4 - - ; mode 17 [row 0, 1] - - pmaddubsw m4, m2, [r3 + 6 * 16] - pmulhrsw m4, m7 - - pslldq m2, 2 - pinsrb m2, [r1 + 0], 1 - pinsrb m2, [r1 + 1], 0 - - pmaddubsw m3, m2, [r3 + 12 * 16] - pmulhrsw m3, m7 - - packuswb m4, m3 - movu [r0 + 960], m4 - - ; mode 17 [row 2, 3] - - pslldq m2, 2 - pinsrb m2, [r1 + 1], 1 - pinsrb m2, [r1 + 2], 0 - - pmaddubsw m4, m2, [r3 + 18 * 16] - pmulhrsw m4, m7 - - pslldq m2, 2 - pinsrb m2, [r1 + 2], 1 - pinsrb m2, [r1 + 4], 0 - - pmaddubsw m3, m2, [r3 + 24 * 16] - pmulhrsw m3, m7 - - packuswb m4, m3 - movu [r0 + 976], m4 - - ; mode 17 [row 4, 5] - - pslldq m2, 2 - pinsrb m2, [r1 + 4], 1 - pinsrb m2, [r1 + 5], 0 - - pmaddubsw m4, m2, [r3 + 30 * 16] - pmulhrsw m4, m7 - - pmaddubsw m3, m2, [r3 + 4 * 16] - pmulhrsw m3, 
m7 - - packuswb m4, m3 - movu [r0 + 992], m4 - - ; mode 17 [row 6, 7] - - pslldq m2, 2 - pinsrb m2, [r1 + 5], 1 - pinsrb m2, [r1 + 6], 0 - - pmaddubsw m4, m2, [r3 + 10 * 16] - pmulhrsw m4, m7 - - pslldq m2, 2 - pinsrb m2, [r1 + 6], 1 - pinsrb m2, [r1 + 7], 0 - - pmaddubsw m3, m2, [r3 + 16 * 16] - pmulhrsw m3, m7 - - packuswb m4, m3 - movu [r0 + 1008], m4 - - ; mode 18 [row 0, 1, 2, 3, 4, 5, 6, 7] - - movh m1, [r2] - - pslldq m2, m1, 1 - pinsrb m2, [r2 + 1 + 16], 0 - punpcklqdq m1, m2 - movu [r0 + 1024], m1 - - pslldq m2, 1 - pinsrb m2, [r2 + 2 + 16], 0 - - pslldq m0, m2, 1 - pinsrb m0, [r2 + 3 + 16], 0 - punpcklqdq m2, m0 - movu [r0 + 1040], m2 - - pslldq m0, 1 - pinsrb m0, [r2 + 4 + 16], 0 - - pslldq m2, m0, 1 - pinsrb m2, [r2 + 5 + 16], 0 - punpcklqdq m0, m2 - movu [r0 + 1056], m0 - - pslldq m2, 1 - pinsrb m2, [r2 + 6 + 16], 0 - - pslldq m0, m2, 1 - pinsrb m0, [r2 + 7 + 16], 0 - punpcklqdq m2, m0 - movu [r0 + 1072], m2 - - ; mode 19 [row 0, 1] - - movu m0, [r1] - palignr m1, m0, 1 - punpcklbw m0, m1 - - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m7 - - pslldq m2, m0, 2 - pinsrb m2, [r1], 1 - pinsrb m2, [r1 + 1 + 16], 0 - - pmaddubsw m3, m2, [r3 + 12 * 16] - pmulhrsw m3, m7 - - packuswb m1, m3 - movu [r0 + 1088], m1 - - ; mode 19 [row 2, 3] - - pslldq m2, 2 - pinsrb m2, [r1 + 1 + 16], 1 - pinsrb m2, [r1 + 2 + 16], 0 - - pmaddubsw m4, m2, [r3 + 18 * 16] - pmulhrsw m4, m7 - - pslldq m2, 2 - pinsrb m2, [r1 + 2 + 16], 1 - pinsrb m2, [r1 + 4 + 16], 0 - - pmaddubsw m5, m2, [r3 + 24 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 1104], m4 - - ; mode 19 [row 4, 5] - - pslldq m2, 2 - pinsrb m2, [r1 + 4 + 16], 1 - pinsrb m2, [r1 + 5 + 16], 0 - - pmaddubsw m4, m2, [r3 + 30 * 16] - pmulhrsw m4, m7 - - pmaddubsw m5, m2, [r3 + 4 * 16] - pmulhrsw m5, m7 - - packuswb m4, m5 - movu [r0 + 1120], m4 - - ; mode 19 [row 6, 7] - - pslldq m2, 2 - pinsrb m2, [r1 + 5 + 16], 1 - pinsrb m2, [r1 + 6 + 16], 0 - - pmaddubsw m4, m2, [r3 + 10 * 16] - pmulhrsw m4, m7 - - pslldq m2, 
2 - pinsrb m2, [r1 + 6 + 16], 1 - pinsrb m2, [r1 + 7 + 16], 0 - - pmaddubsw m2, [r3 + 16 * 16] - pmulhrsw m2, m7 - - packuswb m4, m2 - movu [r0 + 1136], m4 - - ; mode 20 [row 0, 1] - - pmaddubsw m3, m0, [r3 + 11 * 16] - pmulhrsw m3, m7 - - pslldq m1, m0, 2 - pinsrb m1, [r1 + 0], 1 - pinsrb m1, [r1 + 2 + 16], 0 - - pmaddubsw m4, m1, [r3 + 22 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 1152], m3 - - ; mode 20 [row 2, 3] - - pmaddubsw m3, m1, [r3 + 1 * 16] - pmulhrsw m3, m7 - - pslldq m2, m1, 2 - pinsrb m2, [r1 + 2 + 16], 1 - pinsrb m2, [r1 + 3 + 16], 0 - - pmaddubsw m4, m2, [r3 + 12 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 1168], m3 - - ; mode 20 [row 4, 5] - - pslldq m2, 2 - pinsrb m2, [r1 + 3 + 16], 1 - pinsrb m2, [r1 + 5 + 16], 0 - - pmaddubsw m3, m2, [r3 + 23 * 16] - pmulhrsw m3, m7 - - pmaddubsw m4, m2, [r3 + 2 * 16] - pmulhrsw m4, m7 - - packuswb m3, m4 - movu [r0 + 1184], m3 + pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + pmaddubsw m3, m2, [r4 - 16 * 16] ; [00] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro - ; mode 20 [row 6, 7] +%macro MODE_13_23 2 + movu m2, [r2] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] + pmaddubsw m4, m0, [r4 + 7 * 16] ; [23] + pmulhrsw m4, m7 + pmaddubsw m3, m0, [r4 - 2 * 16] ; [14] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m0, [r4 - 11 * 16] ; [05] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 + 12 * 16] ; [28] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 + 3 * 16] ; [19] + pmulhrsw m6, m7 + pmaddubsw m3, m2, [r4 - 6 * 16] ; [10] + pmulhrsw 
m3, m7 + packuswb m6, m3 + pmaddubsw m1, m2, [r4 - 15 * 16] ; [1] + pmulhrsw m1, m7 + movu m2, [r2 - 2] ; [14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1] + palignr m3, m2, 1 ; [x, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + punpckhbw m0, m2, m3 + punpcklbw m2, m3 + palignr m0, m2, 2 + pmaddubsw m3, m0, [r4 + 8 * 16] ; [24] + pmulhrsw m3, m7 + packuswb m1, m3 + mova m3, m0 + TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1 + pmaddubsw m4, m3, [r4 - 16] ; [15] + pmulhrsw m4, m7 + pmaddubsw m5, m3, [r4 - 10 * 16] ; [6] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 + 13 * 16] ; [29] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 + 4 * 16] ; [20] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 5 * 16] ; [11] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 - 14 * 16] ; [2] + pmulhrsw m1, m7 + packuswb m6, m1 + movu m2, [r2 - 4] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] + pmaddubsw m1, m0, [r4 + 9 * 16] ; [25] + pmulhrsw m1, m7 + pmaddubsw m3, m0, [r4] ; [16] + pmulhrsw m3, m7 + packuswb m1, m3 + mova m3, m0 + TRANSPOSE_STORE_8x8 1, %1, m4, m5, m6, m1 + pmaddubsw m4, m3, [r4 - 9 * 16] ; [7] + pmulhrsw m4, m7 + pmaddubsw m3, m2, [r4 + 14 * 16] ; [30] + pmulhrsw m3, m7 + packuswb m4, m3 + pmaddubsw m5, m2, [r4 + 5 * 16] ; [21] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 4 * 16] ; [12] + pmulhrsw m6, m7 + packuswb m5, m6 + pmaddubsw m6, m2, [r4 - 13 * 16] ; [3] + pmulhrsw m6, m7 + movu m2, [r2 - 6] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpckhbw m0, m2, m1 ; [x, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8] + punpcklbw m2, m1 ; 
[8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + palignr m0, m2, 2 ; [9, 8, 8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1] + pmaddubsw m3, m0, [r4 + 10 * 16] ; [26] + pmulhrsw m3, m7 + packuswb m6, m3 + pmaddubsw m1, m0, [r4 + 16] ; [17] + pmulhrsw m1, m7 + pmaddubsw m3, m0, [r4 - 8 * 16] ; [8] + pmulhrsw m3, m7 + packuswb m1, m3 + TRANSPOSE_STORE_8x8 2, %1, m4, m5, m6, m1 + pmaddubsw m4, m2, [r4 + 15 * 16] ; [31] + pmulhrsw m4, m7 + pmaddubsw m5, m2, [r4 + 6 * 16] ; [22] + pmulhrsw m5, m7 + packuswb m4, m5 + pmaddubsw m5, m2, [r4 - 3 * 16] ; [13] + pmulhrsw m5, m7 + pmaddubsw m6, m2, [r4 - 12 * 16] ; [04] + pmulhrsw m6, m7 + packuswb m5, m6 + movu m2, [r2 - 7] ; [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] + %if ((%1 & %2) == 1) + pinsrb m2, [r3], 0 + %endif + palignr m1, m2, 1 ; [x ,15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + punpcklbw m2, m1 ; [8, 7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0] + pmaddubsw m6, m2, [r4 + 11 * 16] ; [27] + pmulhrsw m6, m7 + pmaddubsw m1, m2, [r4 + 2 * 16] ; [18] + pmulhrsw m1, m7 + packuswb m6, m1 + pmaddubsw m1, m2, [r4 - 7 * 16] ; [09] + pmulhrsw m1, m7 + movu m0, [pb_fact0] + pshufb m2, m0 + pmovzxbw m2, m2 + packuswb m1, m2 + TRANSPOSE_STORE_8x8 3, %1, m4, m5, m6, m1 +%endmacro - pslldq m2, 2 - pinsrb m2, [r1 + 5 + 16], 1 - pinsrb m2, [r1 + 6 + 16], 0 +INIT_XMM sse4 +cglobal intra_pred_ang32_13, 3,7,8,0-(1*mmsize) +%define above [rsp + 0 * mmsize] + mov r3, r2 + add r2, 64 + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m7, [pw_1024] - pmaddubsw m3, m2, [r3 + 13 * 16] - pmulhrsw m3, m7 + MODE_13_23_ROW0 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 7 - pslldq m2, 2 - pinsrb m2, [r1 + 6 + 16], 1 - pinsrb m2, [r1 + 8 + 16], 0 + MODE_13_23 1, 1 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + mov r3, 2 +.loop: + MODE_13_23 1, 0 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec r3 + jnz .loop + RET - 
pmaddubsw m4, m2, [r3 + 24 * 16] - pmulhrsw m4, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_14, 3,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - packuswb m3, m4 - movu [r0 + 1200], m3 + ; collect reference pixel + movu m0, [r2] + movu m1, [r2 + 15] + pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] + pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] + pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] + palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] + mova [rsp], m0 + movu m0, [r2 + 1 + 64] + movu m1, [r2 + 1 + 16 + 64] + movu [rsp + 13], m0 + movu [rsp + 13 + 16], m1 + mov [rsp + 63], byte 4 - ; mode 21 [row 0, 1] + ; filter + lea r2, [rsp + 13] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m2, m0, [r3 + 15 * 16] - pmulhrsw m2, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 4] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m4 + PROC32_8x8 0, 1, 19,6,25,12,31,18,5,24 - pmaddubsw m3, m1, [r3 + 30 * 16] - pmulhrsw m3, m7 + ; Row[8 - 15] + movu m7, [r2 - 7] + palignr m0, m7, 3 + palignr m1, m7, 2 + mova m2, m1 + mova m3, m1 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 1, 1, 11,30,17,4,23,10,29,16 - packuswb m2, m3 - movu [r0 + 1216], m2 + ; Row[16 - 23] + movu m7, [r2 - 10] + palignr m0, m7, 3 + palignr m1, m7, 2 + mova m2, m1 + palignr m3, m7, 1 + mova m4, m3 + mova m5, m3 + mova m6, m7 + PROC32_8x8 2, 1, 3,22,9,28,15,2,21,8 - ; mode 21 [row 2, 3] + ; Row[24 - 31] + movu m7, [r2 - 13] + palignr m0, m7, 2 + mova m1, m0 + mova m2, m0 + palignr m3, m7, 1 + mova m4, m3 + mova m5, m7 + mova m6, m7 + PROC32_8x8 3, 
1, 27,14,1,20,7,26,13,0 - pmaddubsw m2, m1, [r3 + 13 * 16] - pmulhrsw m2, m7 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - pslldq m3, m1, 2 - pinsrb m3, [r1 + 2 + 16], 1 - pinsrb m3, [r1 + 4 + 16], 0 +INIT_XMM sse4 +cglobal intra_pred_ang32_15, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - pmaddubsw m4, m3, [r3 + 28 * 16] - pmulhrsw m4, m7 + ; collect reference pixel + movu m0, [r2] + movu m1, [r2 + 15] + pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] + pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 19 21 23 24 26 28 30] + mova [rsp], m1 + movu [rsp + 8], m0 + movu m0, [r2 + 1 + 64] + movu m1, [r2 + 1 + 16 + 64] + movu [rsp + 17], m0 + movu [rsp + 17 + 16], m1 + mov [rsp + 63], byte 4 - packuswb m2, m4 - movu [r0 + 1232], m2 + ; filter + lea r2, [rsp + 17] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - ; mode 21 [row 4, 5] +.loop: + ; Row[0 - 7] + movu m7, [r2 - 5] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 0, 1, 15,30,13,28,11,26,9,24 - pmaddubsw m2, m3, [r3 + 11 * 16] - pmulhrsw m2, m7 + ; Row[8 - 15] + movu m7, [r2 - 9] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 1, 1, 7,22,5,20,3,18,1,16 - pslldq m3, 2 - pinsrb m3, [r1 + 4 + 16], 1 - pinsrb m3, [r1 + 6 + 16], 0 + ; Row[16 - 23] + movu m7, [r2 - 13] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 2, 1, 31,14,29,12,27,10,25,8 - pmaddubsw m4, 
m3, [r3 + 26 * 16] - pmulhrsw m4, m7 + ; Row[24 - 31] + movu m7, [r2 - 17] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 3, 1, 23,6,21,4,19,2,17,0 - packuswb m2, m4 - movu [r0 + 1248], m2 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - ; mode 21 [row 6, 7] +INIT_XMM sse4 +cglobal intra_pred_ang32_16, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - pmaddubsw m2, m3, [r3 + 9 * 16] - pmulhrsw m2, m7 + ; collect reference pixel + movu m0, [r2] + movu m1, [r2 + 15] + pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] + pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] + mova [rsp], m1 + movu [rsp + 10], m0 + movu m0, [r2 + 1 + 64] + movu m1, [r2 + 1 + 16 + 64] + movu [rsp + 21], m0 + movu [rsp + 21 + 16], m1 + mov [rsp + 63], byte 4 - pslldq m3, 2 - pinsrb m3, [r1 + 6 + 16], 1 - pinsrb m3, [r1 + 8 + 16], 0 + ; filter + lea r2, [rsp + 21] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m4, m3, [r3 + 24 * 16] - pmulhrsw m4, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 1, 11,22,1,12,23,2,13,24 - packuswb m2, m4 - movu [r0 + 1264], m2 + ; Row[8 - 15] + movu m7, [r2 - 11] + palignr m0, m7, 5 + palignr m1, m7, 4 + palignr m2, m7, 3 + mova m3, m2 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 1, 1, 3,14,25,4,15,26,5,16 - ; mode 22 [row 0, 1] + ; Row[16 - 23] + movu m7, [r2 - 16] + palignr m0, m7, 4 + 
mova m1, m0 + palignr m2, m7, 3 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m7 + PROC32_8x8 2, 1, 27,6,17,28,7,18,29,8 - pmaddubsw m2, m0, [r3 + 19 * 16] - pmulhrsw m2, m7 + ; Row[24 - 31] + movu m7, [r2 - 21] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + PROC32_8x8 3, 1, 19,30,9,20,31,10,21,0 - pmaddubsw m4, m0, [r3 + 6 * 16] - pmulhrsw m4, m7 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - packuswb m2, m4 - movu [r0 + 1280], m2 +INIT_XMM sse4 +cglobal intra_pred_ang32_17, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - ; mode 22 [row 2, 3] + ; collect reference pixel + movu m0, [r2] + movu m1, [r2 + 16] + pshufb m0, [c_mode32_17_0] + pshufb m1, [c_mode32_17_0] + mova [rsp ], m1 + movu [rsp + 13], m0 + movu m0, [r2 + 1 + 64] + movu m1, [r2 + 1 + 16 + 64] + movu [rsp + 26], m0 + movu [rsp + 26 + 16], m1 + mov [rsp + 63], byte 4 - pmaddubsw m2, m1, [r3 + 25 * 16] - pmulhrsw m2, m7 + ; filter + lea r2, [rsp + 25] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m3, m1, [r3 + 12 * 16] - pmulhrsw m3, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 1, 6,12,18,24,30,4,10,16 - packuswb m2, m3 - movu [r0 + 1296], m2 + ; Row[7 - 15] + movu m7, [r2 - 12] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + PROC32_8x8 1, 1, 22,28,2,8,14,20,26,0 - ; mode 22 [row 4, 
5] + ; Row[16 - 23] + movu m7, [r2 - 19] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 2, 1, 6,12,18,24,30,4,10,16 - pslldq m1, 2 - pinsrb m1, [r1 + 5 + 16], 0 - pinsrb m1, [r1 + 2 + 16], 1 + ; Row[24 - 31] + movu m7, [r2 - 25] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + PROC32_8x8 3, 1, 22,28,2,8,14,20,26,0 - pmaddubsw m2, m1, [r3 + 31 * 16] - pmulhrsw m2, m7 + lea r0, [r6 + r1 * 4] + lea r6, [r6 + r1 * 8] + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] - pmaddubsw m3, m1, [r3 + 18 * 16] - pmulhrsw m3, m7 + RET - packuswb m2, m3 - movu [r0 + 1312], m2 +INIT_XMM sse4 +cglobal intra_pred_ang32_18, 4,5,5 + movu m0, [r2] ; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] + movu m1, [r2 + 16] ; [31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16] + movu m2, [r2 + 1 + 64] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + movu m3, [r2 + 17 + 64] ; [32 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17] - ; mode 22 [row 6, 7] + lea r2, [r1 * 2] + lea r3, [r1 * 3] + lea r4, [r1 * 4] - pmaddubsw m2, m1, [r3 + 5 * 16] - pmulhrsw m2, m7 + movu [r0], m0 + movu [r0 + 16], m1 - pslldq m1, 2 - pinsrb m1, [r1 + 5 + 16], 1 - pinsrb m1, [r1 + 7 + 16], 0 + pshufb m2, [c_mode32_18_0] ; [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16] + pshufb m3, [c_mode32_18_0] ; [17 18 19 20 21 22 23 24 25 26 27 28 19 30 31 32] - pmaddubsw m1, [r3 + 24 * 16] - pmulhrsw m1, m7 + palignr m4, m0, m2, 15 + movu [r0 + r1], m4 + palignr m4, m1, m0, 15 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 14 + movu [r0 + r2], m4 + palignr m4, m1, m0, 14 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 13 + movu [r0 + r3], m4 + palignr m4, m1, m0, 13 + movu [r0 + r3 + 16], m4 - packuswb m2, m1 - movu [r0 + 1328], m2 + lea r0, [r0 + r4] - ; mode 23 [row 0, 1] + palignr m4, m0, m2, 12 + movu [r0], m4 + palignr m4, m1, m0, 12 + movu [r0 + 16], m4 
+ palignr m4, m0, m2, 11 + movu [r0 + r1], m4 + palignr m4, m1, m0, 11 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 10 + movu [r0 + r2], m4 + palignr m4, m1, m0, 10 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 9 + movu [r0 + r3], m4 + palignr m4, m1, m0, 9 + movu [r0 + r3 + 16], m4 - pmaddubsw m2, m0, [r3 + 23 * 16] - pmulhrsw m2, m7 + lea r0, [r0 + r4] - pmaddubsw m3, m0, [r3 + 14 * 16] - pmulhrsw m3, m7 + palignr m4, m0, m2, 8 + movu [r0], m4 + palignr m4, m1, m0, 8 + movu [r0 + 16], m4 + palignr m4, m0, m2, 7 + movu [r0 + r1], m4 + palignr m4, m1, m0, 7 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 6 + movu [r0 + r2], m4 + palignr m4, m1, m0, 6 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 5 + movu [r0 + r3], m4 + palignr m4, m1, m0, 5 + movu [r0 + r3 + 16], m4 - packuswb m2, m3 - movu [r0 + 1344], m2 + lea r0, [r0 + r4] - ; mode 23 [row 2, 3] + palignr m4, m0, m2, 4 + movu [r0], m4 + palignr m4, m1, m0, 4 + movu [r0 + 16], m4 + palignr m4, m0, m2, 3 + movu [r0 + r1], m4 + palignr m4, m1, m0, 3 + movu [r0 + r1 + 16], m4 + palignr m4, m0, m2, 2 + movu [r0 + r2], m4 + palignr m4, m1, m0, 2 + movu [r0 + r2 + 16], m4 + palignr m4, m0, m2, 1 + movu [r0 + r3], m4 + palignr m4, m1, m0, 1 + movu [r0 + r3 + 16], m4 - pmaddubsw m2, m0, [r3 + 5 * 16] - pmulhrsw m2, m7 + lea r0, [r0 + r4] - pslldq m1, m0, 2 - pinsrb m1, [r1], 1 - pinsrb m1, [r1 + 4 + 16], 0 + movu [r0], m2 + movu [r0 + 16], m0 + palignr m4, m2, m3, 15 + movu [r0 + r1], m4 + palignr m4, m0, m2, 15 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 14 + movu [r0 + r2], m4 + palignr m4, m0, m2, 14 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 13 + movu [r0 + r3], m4 + palignr m4, m0, m2, 13 + movu [r0 + r3 + 16], m4 - pmaddubsw m3, m1, [r3 + 28 * 16] - pmulhrsw m3, m7 + lea r0, [r0 + r4] - packuswb m2, m3 - movu [r0 + 1360], m2 + palignr m4, m2, m3, 12 + movu [r0], m4 + palignr m4, m0, m2, 12 + movu [r0 + 16], m4 + palignr m4, m2, m3, 11 + movu [r0 + r1], m4 + palignr m4, m0, m2, 11 + movu [r0 + r1 + 
16], m4 + palignr m4, m2, m3, 10 + movu [r0 + r2], m4 + palignr m4, m0, m2, 10 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 9 + movu [r0 + r3], m4 + palignr m4, m0, m2, 9 + movu [r0 + r3 + 16], m4 - ; mode 23 [row 4, 5] + lea r0, [r0 + r4] - pmaddubsw m2, m1, [r3 + 19 * 16] - pmulhrsw m2, m7 + palignr m4, m2, m3, 8 + movu [r0], m4 + palignr m4, m0, m2, 8 + movu [r0 + 16], m4 + palignr m4, m2, m3, 7 + movu [r0 + r1], m4 + palignr m4, m0, m2, 7 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 6 + movu [r0 + r2], m4 + palignr m4, m0, m2, 6 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 5 + movu [r0 + r3], m4 + palignr m4, m0, m2, 5 + movu [r0 + r3 + 16], m4 - pmaddubsw m3, m1, [r3 + 10 * 16] - pmulhrsw m3, m7 + lea r0, [r0 + r4] - packuswb m2, m3 - movu [r0 + 1376], m2 + palignr m4, m2, m3, 4 + movu [r0], m4 + palignr m4, m0, m2, 4 + movu [r0 + 16], m4 + palignr m4, m2, m3, 3 + movu [r0 + r1], m4 + palignr m4, m0, m2, 3 + movu [r0 + r1 + 16], m4 + palignr m4, m2, m3, 2 + movu [r0 + r2], m4 + palignr m4, m0, m2, 2 + movu [r0 + r2 + 16], m4 + palignr m4, m2, m3, 1 + movu [r0 + r3], m4 + palignr m4, m0, m2, 1 + movu [r0 + r3 + 16], m4 + RET - ; mode 23 [row 6, 7] +INIT_XMM sse4 +cglobal intra_pred_ang32_19, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - pmaddubsw m2, m1, [r3 + 1 * 16] - pmulhrsw m2, m7 + ; collect reference pixel + movu m0, [r2 + 64] + pinsrb m0, [r2], 0 + movu m1, [r2 + 16 + 64] + pshufb m0, [c_mode32_17_0] + pshufb m1, [c_mode32_17_0] + mova [rsp ], m1 + movu [rsp + 13], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 26], m0 + movu [rsp + 26 + 16], m1 + mov [rsp + 63], byte 4 - pslldq m3, m1, 2 - pinsrb m3, [r1 + 4 + 16], 1 - pinsrb m3, [r1 + 7 + 16], 0 + ; filter + lea r2, [rsp + 25] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] 
; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m3, [r3 + 24 * 16] - pmulhrsw m3, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 0, 6,12,18,24,30,4,10,16 - packuswb m2, m3 - movu [r0 + 1392], m2 + ; Row[7 - 15] + movu m7, [r2 - 12] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 22,28,2,8,14,20,26,0 - ; mode 24 [row 0, 1] + ; Row[16 - 23] + movu m7, [r2 - 19] + palignr m0, m7, 6 + palignr m1, m7, 5 + palignr m2, m7, 4 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 6,12,18,24,30,4,10,16 - pmaddubsw m2, m0, [r3 + 27 * 16] - pmulhrsw m2, m7 + ; Row[24 - 31] + movu m7, [r2 - 25] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 22,28,2,8,14,20,26,0 - pmaddubsw m5, m0, [r3 + 22 * 16] - pmulhrsw m5, m7 + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - packuswb m2, m5 - movu [r0 + 1408], m2 +INIT_XMM sse4 +cglobal intra_pred_ang32_20, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - ; mode 24 [row 2, 3] + ; collect reference pixel + movu m0, [r2 + 64] + pinsrb m0, [r2], 0 + movu m1, [r2 + 15 + 64] + pshufb m0, [c_mode32_16_0] ; [x x x x x 0 2 3 5 6 8 9 11 12 14 15] + pshufb m1, [c_mode32_16_0] ; [x x x x x 15 17 18 20 21 23 24 26 27 29 30] + mova [rsp], m1 + movu [rsp + 10], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 21], m0 + movu [rsp + 21 + 16], m1 + mov [rsp + 63], byte 4 - pmaddubsw 
m2, m0, [r3 + 17 * 16] - pmulhrsw m2, m7 + ; filter + lea r2, [rsp + 21] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m3, m0, [r3 + 12 * 16] - pmulhrsw m3, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 6] + palignr m0, m7, 5 + palignr m1, m7, 4 + mova m2, m1 + palignr m3, m7, 3 + palignr m4, m7, 2 + mova m5, m4 + palignr m6, m7, 1 + PROC32_8x8 0, 0, 11,22,1,12,23,2,13,24 - packuswb m2, m3 - movu [r0 + 1424], m2 + ; Row[8 - 15] + movu m7, [r2 - 11] + palignr m0, m7, 5 + palignr m1, m7, 4 + palignr m2, m7, 3 + mova m3, m2 + palignr m4, m7, 2 + palignr m5, m7, 1 + mova m6, m5 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 3,14,25,4,15,26,5,16 - ; mode 24 [row 4, 5] + ; Row[16 - 23] + movu m7, [r2 - 16] + palignr m0, m7, 4 + mova m1, m0 + palignr m2, m7, 3 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 27,6,17,28,7,18,29,8 - pmaddubsw m2, m0, [r3 + 7 * 16] - pmulhrsw m2, m7 + ; Row[24 - 31] + movu m7, [r2 - 21] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 19,30,9,20,31,10,21,0 - pmaddubsw m3, m0, [r3 + 2 * 16] - pmulhrsw m3, m7 + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - packuswb m2, m3 - movu [r0 + 1440], m2 +INIT_XMM sse4 +cglobal intra_pred_ang32_21, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - ; mode 24 [row 6, 7] + ; collect reference pixel + movu m0, [r2 + 64] + pinsrb m0, [r2], 0 + movu m1, [r2 + 15 + 64] + pshufb m0, [c_mode32_15_0] ; [x x x x x x x 0 2 4 6 8 9 11 13 15] + pshufb m1, [c_mode32_15_0] ; [x x x x x x x 15 17 
19 21 23 24 26 28 30] + mova [rsp], m1 + movu [rsp + 8], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 17], m0 + movu [rsp + 17 + 16], m1 + mov [rsp + 63], byte 4 - pinsrb m1, [r1 + 6 + 16], 0 + ; filter + lea r2, [rsp + 17] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m2, m1, [r3 + 29 * 16] - pmulhrsw m2, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 5] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + PROC32_8x8 0, 0, 15,30,13,28,11,26,9,24 - pmaddubsw m1, [r3 + 24 * 16] - pmulhrsw m1, m7 + ; Row[8 - 15] + movu m7, [r2 - 9] + palignr m0, m7, 4 + palignr m1, m7, 3 + mova m2, m1 + palignr m3, m7, 2 + mova m4, m3 + palignr m5, m7, 1 + mova m6, m5 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 7,22,5,20,3,18,1,16 - packuswb m2, m1 - movu [r0 + 1456], m2 + ; Row[16 - 23] + movu m7, [r2 - 13] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 31,14,29,12,27,10,25,8 - ; mode 25 [row 0, 1] + ; Row[24 - 31] + movu m7, [r2 - 17] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 23,6,21,4,19,2,17,0 - pmaddubsw m2, m0, [r3 + 30 * 16] - pmulhrsw m2, m7 + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_22, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - packuswb m2, m1 - movu [r0 + 1472], m2 + ; collect reference pixel + movu m0, [r2 + 
64] + pinsrb m0, [r2], 0 + movu m1, [r2 + 15 + 64] + pshufb m0, [c_mode32_14_0] ; [x x x x x x x x x 0 2 5 7 10 12 15] + pshufb m1, [c_mode32_14_0] ; [x x x x x x x x x 15 17 20 22 25 27 30] + pslldq m1, 10 ; [17 20 22 25 27 30 x x x x x x x x x x x] + palignr m0, m1, 10 ; [x x x 0 2 5 7 10 12 15 17 20 22 25 27 30] + mova [rsp], m0 + movu m0, [r2 + 1] + movu m1, [r2 + 1 + 16] + movu [rsp + 13], m0 + movu [rsp + 13 + 16], m1 + mov [rsp + 63], byte 4 - ; mode 25 [row 2, 3] + ; filter + lea r2, [rsp + 13] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pmaddubsw m2, m0, [r3 + 26 * 16] - pmulhrsw m2, m7 +.loop: + ; Row[0 - 7] + movu m7, [r2 - 4] + palignr m0, m7, 3 + mova m1, m0 + palignr m2, m7, 2 + mova m3, m2 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m4 + PROC32_8x8 0, 0, 19,6,25,12,31,18,5,24 - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m7 + ; Row[8 - 15] + movu m7, [r2 - 7] + palignr m0, m7, 3 + palignr m1, m7, 2 + mova m2, m1 + mova m3, m1 + palignr m4, m7, 1 + mova m5, m4 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 11,30,17,4,23,10,29,16 - packuswb m2, m1 - movu [r0 + 1488], m2 + ; Row[16 - 23] + movu m7, [r2 - 10] + palignr m0, m7, 3 + palignr m1, m7, 2 + mova m2, m1 + palignr m3, m7, 1 + mova m4, m3 + mova m5, m3 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 3,22,9,28,15,2,21,8 - ; mode 25 [row 4, 5] + ; Row[24 - 31] + movu m7, [r2 - 13] + palignr m0, m7, 2 + mova m1, m0 + mova m2, m0 + palignr m3, m7, 1 + mova m4, m3 + mova m5, m7 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 27,14,1,20,7,26,13,0 - pmaddubsw m1, m0, [r3 + 20 * 16] - pmulhrsw m1, m7 + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - packuswb m5, m1 - movu [r0 + 1504], m5 +INIT_XMM sse4 +cglobal intra_pred_ang32_23, 
4,7,8,0-(1*mmsize) +%define above [rsp + 0 * mmsize] + lea r3, [r2 + 64] + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] - ; mode 25 [row 6, 7] + MODE_13_23_ROW0 0 + add r6, 8 + mov r0, r6 + add r2, 7 + mov r3, 3 +.loop: + MODE_13_23 0, 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r3 + jnz .loop + RET - pmaddubsw m2, m0, [r3 + 18 * 16] - pmulhrsw m2, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_24, 4,7,8,0-(1*mmsize) + %define above [rsp + 0 * mmsize] + lea r3, [r2 + 64] + lea r4, [ang_table + 16 * 16] + lea r5, [r1 * 3] ; r5 -> 3 * stride + mov r6, r0 + mova m7, [pw_1024] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m7 + MODE_12_24_ROW0 0 + add r6, 8 + mov r0, r6 + add r2, 7 + mov r3, 3 +.loop: + MODE_12_24 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r3 + jnz .loop + RET - packuswb m2, m1 - movu [r0 + 1520], m2 +INIT_XMM sse4 +cglobal intra_pred_ang32_25, 4,7,8 + ; NOTE: alignment stack to 64 bytes, so all of local data in same cache line + mov r6, rsp + sub rsp, 64+gprsize + and rsp, ~63 + mov [rsp+64], r6 - ; mode 26 + ; collect reference pixel + movu m0, [r2 + 16 + 64] + pxor m1, m1 + pshufb m0, m1 ; [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] + mova [rsp], m0 + movu m0, [r2] + movu m1, [r2 + 16] + movu m2, [r2 + 32] + movu [rsp + 1], m0 + movu [rsp + 1 + 16], m1 + movu [rsp + 1 + 32], m2 + mov [rsp + 63], byte 4 - movu m0, [r1 + 1] + ; filter + lea r2, [rsp + 1] ; r2 -> [0] + lea r3, [c_shuf8_0] ; r3 -> shuffle8 + lea r4, [ang_table] ; r4 -> ang_table + lea r5, [r1 * 3] ; r5 -> 3 * stride + lea r6, [r0] ; r6 -> r0 + mova m5, [pw_1024] ; m5 -> 1024 + mova m6, [c_deinterval8] ; m6 -> c_deinterval8 - pshufb m1, m0, [tab_Si] - movu [r0 + 1536], m1 - movu [r0 + 1552], m1 - movu [r0 + 1568], m1 - movu [r0 + 1584], m1 +.loop: + ; Row[0 - 7] + movu m7, [r2] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + PROC32_8x8 0, 0, 30,28,26,24,22,20,18,16 - pxor m5, m5 + ; 
Row[8 - 15] + movu m7, [r2] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 1, 0, 14,12,10,8,6,4,2,0 - pshufb m1, m1, m5 - punpcklbw m1, m5 + ; Row[16 - 23] + movu m7, [r2 - 1] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 2, 0, 30,28,26,24,22,20,18,16 - movu m2, [r1 + 16] - pinsrb m2, [r1], 0 + ; Row[24 - 31] + movu m7, [r2 - 1] + mova m0, m7 + mova m1, m7 + mova m2, m7 + mova m3, m7 + mova m4, m7 + mova m5, m7 + mova m6, m7 + lea r0, [r0 + r1 * 4] + PROC32_8x8 3, 0, 14,12,10,8,6,4,2,0 - pshufb m3, m2, m5 - punpcklbw m3, m5 + add r6, 8 + mov r0, r6 + add r2, 8 + dec byte [rsp + 63] + jnz .loop + mov rsp, [rsp+64] + RET - psrldq m4, m2, 1 - punpcklbw m4, m5 +INIT_XMM sse4 +cglobal intra_pred_ang32_26, 5,7,7,0-(2*mmsize) +%define m8 [rsp + 0 * mmsize] +%define m9 [rsp + 1 * mmsize] + mov r6, 2 + movu m0, [r2 + 64] + pinsrb m0, [r2], 0 + movu m1, [r2 + 1 + 64] + mova m8, m0 + mova m9, m1 + mov r3d, r4d + lea r4, [r1 * 3] - movu m2, [r1 + 9 + 16] - punpcklbw m2, m5 +.loop: + movu m0, [r2 + 1] - psubw m4, m3 - psubw m2, m3 + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r4], m0 + lea r5, [r0 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r0 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + 
movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 + lea r5, [r5 + r1 * 4] + movu [r5], m0 + movu [r5 + r1], m0 + movu [r5 + r1 * 2], m0 + movu [r5 + r4], m0 - psraw m4, 1 - psraw m2, 1 +; filter + cmp r3d, byte 0 + jz .quit - paddw m4, m1 - paddw m2, m1 + pxor m4, m4 + pshufb m0, m4 + pmovzxbw m0, m0 + mova m1, m0 + movu m2, m8 + movu m3, m9 - packuswb m4, m2 + pshufb m2, m4 + pmovzxbw m2, m2 + movhlps m4, m3 + pmovzxbw m3, m3 + pmovzxbw m4, m4 + psubw m3, m2 + psubw m4, m2 + psraw m3, 1 + psraw m4, 1 + paddw m0, m3 + paddw m1, m4 + packuswb m0, m1 - pextrb [r0 + 1536], m4, 0 - pextrb [r0 + 1544], m4, 1 - pextrb [r0 + 1552], m4, 2 - pextrb [r0 + 1560], m4, 3 - pextrb [r0 + 1568], m4, 4 - pextrb [r0 + 1576], m4, 5 - pextrb [r0 + 1584], m4, 6 - pextrb [r0 + 1592], m4, 7 + pextrb [r0], m0, 0 + pextrb [r0 + r1], m0, 1 + pextrb [r0 + r1 * 2], m0, 2 + pextrb [r0 + r4], m0, 3 + lea r5, [r0 + r1 * 4] + pextrb [r5], m0, 4 + pextrb [r5 + r1], m0, 5 + pextrb [r5 + r1 * 2], m0, 6 + pextrb [r5 + r4], m0, 7 + lea r5, [r5 + r1 * 4] + pextrb [r5], m0, 8 + pextrb [r5 + r1], m0, 9 + pextrb [r5 + r1 * 2], m0, 10 + pextrb [r5 + r4], m0, 11 + lea r5, [r5 + r1 * 4] + pextrb [r5], m0, 12 + pextrb [r5 + r1], m0, 13 + pextrb [r5 + r1 * 2], m0, 14 + pextrb [r5 + r4], m0, 15 - ; mode 27 [row 0, 1] +.quit: + lea r2, [r2 + 16] + add r0, 16 + dec r6d + jnz .loop + RET - palignr m6, m0, 1 - punpcklbw m4, m0, m6 +INIT_XMM sse4 +cglobal intra_pred_ang32_27, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_9_27 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - pmaddubsw m1, m4, [r3 + 2 * 16] - pmulhrsw m1, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_28, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + 
lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_8_28 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - pmaddubsw m2, m4, [r3 + 4 * 16] - pmulhrsw m2, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_29, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_7_29 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - packuswb m1, m2 - movu [r0 + 1600], m1 +INIT_XMM sse4 +cglobal intra_pred_ang32_30, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_6_30 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - ; mode 27 [row 2, 3] +INIT_XMM sse4 +cglobal intra_pred_ang32_31, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_5_31 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - pmaddubsw m1, m4, [r3 + 6 * 16] - pmulhrsw m1, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_32, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_4_32 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - pmaddubsw m2, m4, [r3 + 8 * 16] - pmulhrsw m2, m7 +INIT_XMM sse4 +cglobal intra_pred_ang32_33, 3,7,8 + lea r3, [ang_table + 16 * 16] + mov r4d, 4 + lea r5, [r1 * 3] + mov r6, r0 + mova m7, [pw_1024] +.loop: + MODE_3_33 0 + add r6, 8 + mov r0, r6 + add r2, 8 + dec r4 + jnz .loop + RET - packuswb m1, m2 - movu [r0 + 1616], m1 - ; mode 27 [row 4, 5] +;----------------------------------------------------------------------------------------- +; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter) +;----------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal intra_pred_ang8_3, 3,4,5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + + pshufb m1, m0, 
[c_ang8_src1_9_2_10] + pshufb m2, m0, [c_ang8_src3_11_4_12] + pshufb m4, m0, [c_ang8_src5_13_5_13] + pshufb m0, [c_ang8_src6_14_7_15] + + pmaddubsw m1, [c_ang8_26_20] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_14_8] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_2_28] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_22_16] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_33, 3,4,5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + + pshufb m1, m0, [c_ang8_src1_9_2_10] + pshufb m2, m0, [c_ang8_src3_11_4_12] + pshufb m4, m0, [c_ang8_src5_13_5_13] + pshufb m0, [c_ang8_src6_14_7_15] + + pmaddubsw m1, [c_ang8_26_20] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_14_8] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_2_28] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_22_16] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_4, 3,4,5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + + pshufb m1, m0, [c_ang8_src1_9_2_10] + pshufb m2, m0, [c_ang8_src2_10_3_11] + pshufb m4, m0, [c_ang8_src4_12_4_12] + pshufb m0, [c_ang8_src5_13_6_14] + + pmaddubsw m1, [c_ang8_21_10] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_31_20] + pmulhrsw m2, 
m3 + pmaddubsw m4, [c_ang8_9_30] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_19_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_32, 3,4,5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + + pshufb m1, m0, [c_ang8_src1_9_2_10] + pshufb m2, m0, [c_ang8_src2_10_3_11] + pshufb m4, m0, [c_ang8_src4_12_4_12] + pshufb m0, [c_ang8_src5_13_6_14] + + pmaddubsw m1, [c_ang8_21_10] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_31_20] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_9_30] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_19_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_5, 3, 4, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + + pshufb m1, m0, [c_ang8_src1_9_2_10] + pshufb m2, m0, [c_ang8_src2_10_3_11] + pshufb m4, m0, [c_ang8_src3_11_4_12] + pshufb m0, [c_ang8_src4_12_5_13] + + pmaddubsw m1, [c_ang8_17_2] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_19_4] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_21_6] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_23_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 
+ punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_31, 3, 4, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + + pshufb m1, m0, [c_ang8_src1_9_2_10] + pshufb m2, m0, [c_ang8_src2_10_3_11] + pshufb m4, m0, [c_ang8_src3_11_4_12] + pshufb m0, [c_ang8_src4_12_5_13] + + pmaddubsw m1, [c_ang8_17_2] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_19_4] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_21_6] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_23_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_6, 3, 4, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + + pshufb m1, m0, [intra_pred_shuff_0_8] + pshufb m2, m0, [c_ang8_src2_10_2_10] + pshufb m4, m0, [c_ang8_src3_11_3_11] + pshufb m0, [c_ang8_src3_11_4_12] + + pmaddubsw m1, [c_ang8_13_26] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_7_20] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_1_14] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_27_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 
2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_30, 3, 4, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + + pshufb m1, m0, [intra_pred_shuff_0_8] + pshufb m2, m0, [c_ang8_src2_10_2_10] + pshufb m4, m0, [c_ang8_src3_11_3_11] + pshufb m0, [c_ang8_src3_11_4_12] + + pmaddubsw m1, [c_ang8_13_26] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_7_20] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_1_14] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_27_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_9, 3, 5, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + + pshufb m0, [intra_pred_shuff_0_8] + + lea r4, [c_ang8_mode_27] + pmaddubsw m1, m0, [r4] + pmulhrsw m1, m3 + pmaddubsw m2, m0, [r4 + mmsize] + pmulhrsw m2, m3 + pmaddubsw m4, m0, [r4 + 2 * mmsize] + pmulhrsw m4, m3 + pmaddubsw m0, [r4 + 3 * mmsize] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_27, 3, 5, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + + pshufb m0, 
[intra_pred_shuff_0_8] + + lea r4, [c_ang8_mode_27] + pmaddubsw m1, m0, [r4] + pmulhrsw m1, m3 + pmaddubsw m2, m0, [r4 + mmsize] + pmulhrsw m2, m3 + pmaddubsw m4, m0, [r4 + 2 * mmsize] + pmulhrsw m4, m3 + pmaddubsw m0, [r4 + 3 * mmsize] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_25, 3, 5, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2] + + pshufb m0, [intra_pred_shuff_0_8] + + lea r4, [c_ang8_mode_25] + pmaddubsw m1, m0, [r4] + pmulhrsw m1, m3 + pmaddubsw m2, m0, [r4 + mmsize] + pmulhrsw m2, m3 + pmaddubsw m4, m0, [r4 + 2 * mmsize] + pmulhrsw m4, m3 + pmaddubsw m0, [r4 + 3 * mmsize] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_7, 3, 4, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + + pshufb m1, m0, [intra_pred_shuff_0_8] + pshufb m2, m0, [c_ang8_src1_9_2_10] + pshufb m4, m0, [c_ang8_src2_10_2_10] + pshufb m0, [c_ang8_src2_10_3_11] + + pmaddubsw m1, [c_ang8_9_18] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_27_4] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_13_22] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_31_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * 
r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_29, 3, 4, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + + pshufb m1, m0, [intra_pred_shuff_0_8] + pshufb m2, m0, [c_ang8_src1_9_2_10] + pshufb m4, m0, [c_ang8_src2_10_2_10] + pshufb m0, [c_ang8_src2_10_3_11] + + pmaddubsw m1, [c_ang8_9_18] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_27_4] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_13_22] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_31_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_8, 3, 4, 6 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 17] + mova m5, [intra_pred_shuff_0_8] + + pshufb m1, m0, m5 + pshufb m2, m0, m5 + pshufb m4, m0, m5 + pshufb m0, [c_ang8_src2_10_2_10] + + pmaddubsw m1, [c_ang8_5_10] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_15_20] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_25_30] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_3_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps 
[r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_28, 3, 4, 6 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2 + 1] + mova m5, [intra_pred_shuff_0_8] + + pshufb m1, m0, m5 + pshufb m2, m0, m5 + pshufb m4, m0, m5 + pshufb m0, [c_ang8_src2_10_2_10] + + pmaddubsw m1, [c_ang8_5_10] + pmulhrsw m1, m3 + pmaddubsw m2, [c_ang8_15_20] + pmulhrsw m2, m3 + pmaddubsw m4, [c_ang8_25_30] + pmulhrsw m4, m3 + pmaddubsw m0, [c_ang8_3_8] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_11, 3, 5, 5 + mova m3, [pw_1024] + movu xm1, [r2 + 16] + pinsrb xm1, [r2], 0 + pshufb xm1, [intra_pred_shuff_0_8] + vinserti128 m0, m1, xm1, 1 + + lea r4, [c_ang8_mode_25] + pmaddubsw m1, m0, [r4] + pmulhrsw m1, m3 + pmaddubsw m2, m0, [r4 + mmsize] + pmulhrsw m2, m3 + pmaddubsw m4, m0, [r4 + 2 * mmsize] + pmulhrsw m4, m3 + pmaddubsw m0, [r4 + 3 * mmsize] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + + +INIT_YMM avx2 +cglobal intra_pred_ang8_12, 3, 5, 5 + mova m3, [pw_1024] + movu xm1, [r2 + 16] + pinsrb xm1, [r2], 0 + pshufb xm1, [intra_pred_shuff_0_8] + vinserti128 m0, m1, xm1, 1 + + lea r4, [c_ang8_mode_24] + pmaddubsw m1, m0, [r4] + pmulhrsw m1, m3 + 
pmaddubsw m2, m0, [r4 + mmsize] + pmulhrsw m2, m3 + pmaddubsw m4, m0, [r4 + 2 * mmsize] + pmulhrsw m4, m3 + pslldq xm0, 2 + pinsrb xm0, [r2 + 6], 0 + pinsrb xm0, [r2 + 0], 1 + vinserti128 m0, m0, xm0, 1 + pmaddubsw m0, [r4 + 3 * mmsize] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + vperm2i128 m2, m1, m4, 00100000b + vperm2i128 m1, m1, m4, 00110001b + punpcklbw m4, m2, m1 + punpckhbw m2, m1 + punpcklwd m1, m4, m2 + punpckhwd m4, m2 + mova m0, [trans8_shuf] + vpermd m1, m0, m1 + vpermd m4, m0, m4 + + lea r3, [3 * r1] + movq [r0], xm1 + movhps [r0 + r1], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + movhps [r0 + r1], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + 2 * r1], xm2 + movhps [r0 + r3], xm2 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang8_24, 3, 5, 5 + mova m3, [pw_1024] + vbroadcasti128 m0, [r2] + + pshufb m0, [intra_pred_shuff_0_8] + + lea r4, [c_ang8_mode_24] + pmaddubsw m1, m0, [r4] + pmulhrsw m1, m3 + pmaddubsw m2, m0, [r4 + mmsize] + pmulhrsw m2, m3 + pmaddubsw m4, m0, [r4 + 2 * mmsize] + pmulhrsw m4, m3 + pslldq xm0, 2 + pinsrb xm0, [r2 + 16 + 6], 0 + pinsrb xm0, [r2 + 0], 1 + vinserti128 m0, m0, xm0, 1 + pmaddubsw m0, [r4 + 3 * mmsize] + pmulhrsw m0, m3 + packuswb m1, m2 + packuswb m4, m0 + + lea r3, [3 * r1] + movq [r0], xm1 + vextracti128 xm2, m1, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm1 + movhps [r0 + r3], xm2 + lea r0, [r0 + 4 * r1] + movq [r0], xm4 + vextracti128 xm2, m4, 1 + movq [r0 + r1], xm2 + movhps [r0 + 2 * r1], xm4 + movhps [r0 + r3], xm2 + RET + +%macro INTRA_PRED_ANG16_MC0 3 + pmaddubsw m3, m1, [r4 + %3 * mmsize] + pmulhrsw m3, m0 + pmaddubsw m4, m2, [r4 + %3 * mmsize] + pmulhrsw m4, m0 + packuswb m3, m4 + movu [%1], xm3 + vextracti128 xm4, m3, 1 + movu [%2], xm4 +%endmacro - pmaddubsw m3, m4, [r3 + 10 * 16] - pmulhrsw m3, m7 +%macro INTRA_PRED_ANG16_MC1 1 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, %1 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, (%1 
+ 1) +%endmacro - pmaddubsw m2, m4, [r3 + 12 * 16] - pmulhrsw m2, m7 +%macro INTRA_PRED_ANG16_MC2 1 + vbroadcasti128 m1, [r2 + %1] + pshufb m1, m5 + vbroadcasti128 m2, [r2 + (%1 + 8)] + pshufb m2, m5 +%endmacro - packuswb m1, m3, m2 - movu [r0 + 1632], m1 +%macro INTRA_PRED_ANG16_MC3 2 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + (%2 * mmsize)] + pmulhrsw m3, m0 + packuswb m3, m3 + vpermq m3, m3, 11011000b + movu [%1], xm3 +%endmacro - ; mode 27 [row 6, 7] +%macro INTRA_PRED_ANG16_MC4 3 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m4, m1, [r4 + (%3 * mmsize)] + pmulhrsw m4, m0 + packuswb m3, m4 + vpermq m3, m3, 11011000b + movu [%1], xm3 + vextracti128 xm3, m3, 1 + movu [%2], xm3 +%endmacro - pmaddubsw m1, m4, [r3 + 14 * 16] - pmulhrsw m1, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_25, 3, 5, 5 + mova m0, [pw_1024] - pmaddubsw m2, m4, [r3 + 16 * 16] - pmulhrsw m2, m7 + vbroadcasti128 m1, [r2] + pshufb m1, [intra_pred_shuff_0_8] + vbroadcasti128 m2, [r2 + 8] + pshufb m2, [intra_pred_shuff_0_8] - packuswb m1, m2 - movu [r0 + 1648], m1 + lea r3, [3 * r1] + lea r4, [c_ang16_mode_25] - ; mode 28 [row 0, 1] + INTRA_PRED_ANG16_MC1 0 - pmaddubsw m1, m4, [r3 + 5 * 16] - pmulhrsw m1, m7 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC1 2 - packuswb m1, m3 - movu [r0 + 1664], m1 + add r4, 4 * mmsize - ; mode 28 [row 2, 3] + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC1 0 - pmaddubsw m1, m4, [r3 + 15 * 16] - pmulhrsw m1, m7 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC1 2 + RET - pmaddubsw m2, m4, [r3 + 20 * 16] - pmulhrsw m2, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_28, 3, 5, 6 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_28] - packuswb m1, m2 - movu [r0 + 1680], m1 + INTRA_PRED_ANG16_MC2 1 + INTRA_PRED_ANG16_MC1 0 - ; mode 28 [row 4, 5] + lea r0, [r0 + 4 * r1] - pmaddubsw m1, m4, [r3 + 25 * 16] - pmulhrsw m1, m7 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 - pmaddubsw m2, m4, [r3 + 30 * 16] - pmulhrsw m2, m7 + 
INTRA_PRED_ANG16_MC2 2 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3 - packuswb m1, m2 - movu [r0 + 1696], m1 + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize - ; mode 28 [row 6, 7] + INTRA_PRED_ANG16_MC1 0 + INTRA_PRED_ANG16_MC2 3 - palignr m1, m0, 2 - punpcklbw m5, m6, m1 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC1 2 + RET - pmaddubsw m2, m5, [r3 + 3 * 16] - pmulhrsw m2, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_27, 3, 5, 5 + mova m0, [pw_1024] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_27] - pmaddubsw m3, m5, [r3 + 8 * 16] - pmulhrsw m3, m7 + vbroadcasti128 m1, [r2 + 1] + pshufb m1, [intra_pred_shuff_0_8] + vbroadcasti128 m2, [r2 + 9] + pshufb m2, [intra_pred_shuff_0_8] - packuswb m2, m3 - movu [r0 + 1712], m2 + INTRA_PRED_ANG16_MC1 0 - ; mode 29 [row 0, 1] + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC1 2 - pmaddubsw m2, m4, [r3 + 9 * 16] - pmulhrsw m2, m7 + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize + INTRA_PRED_ANG16_MC1 0 - pmaddubsw m3, m4, [r3 + 18 * 16] - pmulhrsw m3, m7 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 - packuswb m2, m3 - movu [r0 + 1728], m2 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + 3 * mmsize] + pmulhrsw m3, m0 + vbroadcasti128 m2, [r2 + 2] + pshufb m2, [intra_pred_shuff_0_15] + pmaddubsw m2, [r4 + 4 * mmsize] + pmulhrsw m2, m0 + packuswb m3, m2 + vpermq m3, m3, 11011000b + movu [r0 + 2 * r1], xm3 + vextracti128 xm4, m3, 1 + movu [r0 + r3], xm4 + RET - ; mode 29 [row 2, 3] +INIT_YMM avx2 +cglobal intra_pred_ang16_29, 3, 5, 5 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_29] - pmaddubsw m2, m4, [r3 + 27 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 1 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 + INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1 - pmaddubsw m3, m5, [r3 + 4 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 2 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 - packuswb m2, m3 - movu [r0 + 1744], m2 + lea r0, [r0 + r1 * 4] + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * 
r1, 3 - ; mode 29 [row 4, 5] + INTRA_PRED_ANG16_MC2 3 + add r4, 4 * mmsize + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 + lea r0, [r0 + r1 * 4] + INTRA_PRED_ANG16_MC3 r0 + r1, 1 - pmaddubsw m2, m5, [r3 + 13 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 4 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2 + lea r0, [r0 + r1 * 4] + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 - pmaddubsw m3, m5, [r3 + 22 * 16] - pmulhrsw m3, m7 + add r4, 4 * mmsize - packuswb m2, m3 - movu [r0 + 1760], m2 + INTRA_PRED_ANG16_MC2 5 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0 + RET - ; mode 29 [row 6, 7] +INIT_YMM avx2 +cglobal intra_pred_ang16_30, 3, 5, 6 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_30] - pmaddubsw m2, m5, [r3 + 31 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 1 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 - palignr m6, m0, 3 - punpcklbw m1, m6 + INTRA_PRED_ANG16_MC2 2 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1 - pmaddubsw m3, m1, [r3 + 8 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 3 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 + INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3 - packuswb m2, m3 - movu [r0 + 1776], m2 + INTRA_PRED_ANG16_MC2 4 + add r4, 4 * mmsize + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 - ; mode 32 [row 2] + INTRA_PRED_ANG16_MC2 5 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 + INTRA_PRED_ANG16_MC3 r0 + r3 , 2 - movh [r0 + 1936], m2 + INTRA_PRED_ANG16_MC2 6 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 - ; mode 30 [row 0, 1] + INTRA_PRED_ANG16_MC2 7 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4 + RET - pmaddubsw m2, m4, [r3 + 13 * 16] - pmulhrsw m2, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_31, 3, 5, 6 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_31] - pmaddubsw m3, m4, [r3 + 26 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 1 + INTRA_PRED_ANG16_MC3 r0, 0 - packuswb m2, m3 - movu [r0 + 1792], m2 + 
INTRA_PRED_ANG16_MC2 2 + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 - ; mode 30 [row 2, 3] + INTRA_PRED_ANG16_MC2 3 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 - pmaddubsw m2, m5, [r3 + 7 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 4 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 - pmaddubsw m3, m5, [r3 + 20 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 5 + add r4, 4 * mmsize + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 - packuswb m2, m3 - movu [r0 + 1808], m2 + INTRA_PRED_ANG16_MC2 6 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 - ; mode 33 [row 1] + INTRA_PRED_ANG16_MC2 7 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 - movhps [r0 + 1992], m2 + INTRA_PRED_ANG16_MC2 8 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 - ; mode 30 [row 4, 5] + INTRA_PRED_ANG16_MC2 9 + INTRA_PRED_ANG16_MC3 r0 + r3, 4 + RET - pmaddubsw m2, m1, [r3 + 1 * 16] - pmulhrsw m2, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_32, 3, 5, 6 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_32] - pmaddubsw m3, m1, [r3 + 14 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 1 + INTRA_PRED_ANG16_MC3 r0, 0 - packuswb m2, m3 - movu [r0 + 1824], m2 + INTRA_PRED_ANG16_MC2 2 + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 - ; mode 33 [row 2] + INTRA_PRED_ANG16_MC2 3 + INTRA_PRED_ANG16_MC3 r0 + r3, 2 - movhps [r0 + 2000], m2 + INTRA_PRED_ANG16_MC2 4 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 - ; mode 30 [row 6, 7] + INTRA_PRED_ANG16_MC2 5 - pmaddubsw m2, m1, [r3 + 27 * 16] - pmulhrsw m2, m7 + add r4, 4 * mmsize + INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 0 - psrldq m0, 4 - punpcklbw m6, m0 + INTRA_PRED_ANG16_MC2 6 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 1 + INTRA_PRED_ANG16_MC2 7 - pmaddubsw m3, m6, [r3 + 8 * 16] - pmulhrsw m3, m7 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC3 r0 + r1, 2 + INTRA_PRED_ANG16_MC2 8 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3 + 
INTRA_PRED_ANG16_MC2 9 - packuswb m2, m3 - movu [r0 + 1840], m2 + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize - ; mode 33 [row 3] + INTRA_PRED_ANG16_MC3 r0, 0 + INTRA_PRED_ANG16_MC2 10 + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 + INTRA_PRED_ANG16_MC2 11 + INTRA_PRED_ANG16_MC3 r0 + r3, 2 + RET - movhps [r0 + 2008], m2 +INIT_YMM avx2 +cglobal intra_pred_ang16_33, 3, 5, 6 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_33] - ; mode 31 [row 0, 1] + INTRA_PRED_ANG16_MC2 1 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + 0 * mmsize] + pmulhrsw m3, m0 - pmaddubsw m2, m4, [r3 + 17 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 2 + INTRA_PRED_ANG16_MC4 r0, r0 + r1, 1 - pmaddubsw m3, m5, [r3 + 2 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 3 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + 2 * mmsize] + pmulhrsw m3, m0 - packuswb m2, m3 - movu [r0 + 1856], m2 + INTRA_PRED_ANG16_MC2 4 + INTRA_PRED_ANG16_MC4 r0 + 2 * r1, r0 + r3, 3 - ; mode 31 [row 2, 3] + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize - pmaddubsw m2, m5, [r3 + 19 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 5 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 - pmaddubsw m3, m1, [r3 + 4 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 6 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + 1 * mmsize] + pmulhrsw m3, m0 - packuswb m2, m3 - movu [r0 + 1872], m2 + INTRA_PRED_ANG16_MC2 7 + INTRA_PRED_ANG16_MC4 r0 + 2 * r1, r0 + r3, 2 - ; mode 31 [row 4, 5] + INTRA_PRED_ANG16_MC2 8 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC3 r0, 3 - pmaddubsw m2, m1, [r3 + 21 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 9 + add r4, 4 * mmsize + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 0 - pmaddubsw m3, m6, [r3 + 6 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC2 10 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + 1 * mmsize] + pmulhrsw m3, m0 - packuswb m2, m3 - movu [r0 + 1888], m2 + INTRA_PRED_ANG16_MC2 11 + INTRA_PRED_ANG16_MC4 r0 + r3, r0 + 4 
* r1, 2 - ; mode 31 [row 6, 7] + lea r0, [r0 + 4 * r1] - pmaddubsw m2, m6, [r3 + 23 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 12 + vperm2i128 m1, m1, m2, 00100000b + pmaddubsw m3, m1, [r4 + 3 * mmsize] + pmulhrsw m3, m0 - movu m3, [r1 + 6] - punpcklbw m0, m3 + INTRA_PRED_ANG16_MC2 13 + INTRA_PRED_ANG16_MC4 r0 + r1, r0 + 2 * r1, 4 - pmaddubsw m3, m0, [r3 + 8 * 16] - pmulhrsw m3, m7 + add r4, 4 * mmsize - packuswb m2, m3 - movu [r0 + 1904], m2 + INTRA_PRED_ANG16_MC2 14 + INTRA_PRED_ANG16_MC3 r0 + r3, 1 + RET - ; mode 32 [row 0, 1] +INIT_YMM avx2 +cglobal intra_pred_ang16_24, 3, 5, 6 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_24] - pmaddubsw m2, m4, [r3 + 21 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC2 0 + INTRA_PRED_ANG16_MC1 0 - pmaddubsw m3, m5, [r3 + 10 * 16] - pmulhrsw m3, m7 + lea r0, [r0 + 4 * r1] + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 - packuswb m2, m3 - movu [r0 + 1920], m2 + movu xm1, [r2 - 1] + pinsrb xm1, [r2 + 38], 0 + vinserti128 m1, m1, xm1, 1 + pshufb m1, m5 + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m5 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 3 - ; mode 32 [row 3] + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize - pmaddubsw m2, m1, [r3 + 20 * 16] - pmulhrsw m2, m7 + INTRA_PRED_ANG16_MC1 0 - pxor m3, m3 + movu xm1, [r2 - 2] + pinsrb xm1, [r2 + 45], 0 + pinsrb xm1, [r2 + 38], 1 + vinserti128 m1, m1, xm1, 1 + pshufb m1, m5 + vbroadcasti128 m2, [r2 + 6] + pshufb m2, m5 - packuswb m2, m3 - movh [r0 + 1944], m2 + lea r0, [r0 + 4 * r1] - ; mode 32 [row 4, 5] + INTRA_PRED_ANG16_MC1 2 + RET - pmaddubsw m2, m6, [r3 + 9 * 16] - pmulhrsw m2, m7 +%macro INTRA_PRED_ANG16_MC5 2 + pslldq xm6, xm6, 1 + pinsrb xm6, [r2 + %1], 0 + vinserti128 m1, m6, xm6, 1 + pshufb m1, m5 + vbroadcasti128 m2, [r2 + %2] + pshufb m2, m5 +%endmacro - pmaddubsw m3, m6, [r3 + 30 * 16] - pmulhrsw m3, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_23, 3, 5, 7 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + 
lea r4, [c_ang16_mode_23] - packuswb m2, m3 - movu [r0 + 1952], m2 + INTRA_PRED_ANG16_MC2 0 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 + INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 1 - ; mode 33 [row 4, 5] + movu xm6, [r2 - 1] + pinsrb xm6, [r2 + 36], 0 + vinserti128 m1, m6, xm6, 1 + pshufb m1, m5 + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m5 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 2 - pmaddubsw m2, m0, [r3 + 2 * 16] - pmulhrsw m2, m7 + lea r0, [r0 + 4 * r1] - pmaddubsw m3, m0, [r3 + 28 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 3 - packuswb m2, m3 - movu [r0 + 2016], m2 + add r4, 4 * mmsize - ; mode 32 [row 6] + INTRA_PRED_ANG16_MC5 39, 6 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 - pmaddubsw m2, m0, [r3 + 19 * 16] - pmulhrsw m2, m7 + lea r0, [r0 + 4 * r1] - ; mode 32 [row 7] + INTRA_PRED_ANG16_MC3 r0 + r1, 1 + INTRA_PRED_ANG16_MC5 43, 5 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 2 - movu m0, [r1 + 6] - palignr m3, m0, 1 - punpcklbw m0, m3 + lea r0, [r0 + 4 * r1] - pmaddubsw m3, m0, [r3 + 8 * 16] - pmulhrsw m3, m7 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 - packuswb m2, m3 - movu [r0 + 1968], m2 + add r4, 4 * mmsize - ; mode 33 [row 6, 7] + INTRA_PRED_ANG16_MC5 46, 4 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 0 + RET - pmaddubsw m2, m0, [r3 + 22 * 16] - pmulhrsw m2, m7 +INIT_YMM avx2 +cglobal intra_pred_ang16_22, 3, 5, 7 + mova m0, [pw_1024] + mova m5, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang16_mode_22] - movu m0, [r1 + 7] - palignr m3, m0, 1 - punpcklbw m0, m3 + INTRA_PRED_ANG16_MC2 0 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 0 - pmaddubsw m3, m0, [r3 + 16 * 16] - pmulhrsw m3, m7 + movu xm6, [r2 - 1] + pinsrb xm6, [r2 + 34], 0 + vinserti128 m1, m6, xm6, 1 + pshufb m1, m5 + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m5 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 1 - packuswb m2, m3 - movu [r0 + 2032], m2 + lea r0, [r0 + 4 * r1] - ; mode 33 [row 0] + INTRA_PRED_ANG16_MC5 37, 6 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 2 + 
INTRA_PRED_ANG16_MC3 r0 + 2 * r1, 3 - pmaddubsw m2, m4, [r3 + 26 * 16] - pmulhrsw m2, m7 + add r4, 4 * mmsize - pxor m3, m3 + INTRA_PRED_ANG16_MC5 39, 5 + INTRA_PRED_ANG16_MC0 r0 + r3, r0 + 4 * r1, 0 - packuswb m2, m3 - movh [r0 + 1984], m2 + lea r0, [r0 + 4 * r1] - ; mode 34 [row 0, 1, 2, 3, 4, 5, 6, 7] + INTRA_PRED_ANG16_MC5 42, 4 + INTRA_PRED_ANG16_MC0 r0 + r1, r0 + 2 * r1, 1 + INTRA_PRED_ANG16_MC3 r0 + r3, 2 - movu m0, [r2 + 2] - palignr m1, m0, 1 - punpcklqdq m2, m0, m1 - movu [r0 + 2048], m2 + lea r0, [r0 + 4 * r1] - palignr m1, m0, 2 - palignr m2, m0, 3 - punpcklqdq m1, m2 - movu [r0 + 2064], m1 + INTRA_PRED_ANG16_MC5 44, 3 + INTRA_PRED_ANG16_MC0 r0, r0 + r1, 3 + INTRA_PRED_ANG16_MC5 47, 2 + INTRA_PRED_ANG16_MC0 r0 + 2 * r1, r0 + r3, 4 + RET - palignr m1, m0, 4 - palignr m2, m0, 5 - punpcklqdq m1, m2 - movu [r0 + 2080], m1 +%macro INTRA_PRED_ANG32_ALIGNR_STORE 1 + lea r0, [r0 + 4 * r1] + palignr m2, m1, m0, %1 + movu [r0], m2 + palignr m2, m1, m0, (%1 + 1) + movu [r0 + r1], m2 + palignr m2, m1, m0, (%1 + 2) + movu [r0 + 2 * r1], m2 + palignr m2, m1, m0, (%1 + 3) + movu [r0 + r3], m2 +%endmacro - palignr m1, m0, 6 - palignr m2, m0, 7 - punpcklqdq m1, m2 - movu [r0 + 2096], m1 -RET +INIT_YMM avx2 +cglobal intra_pred_ang32_34, 3, 4,3 + lea r3, [3 * r1] + + movu m0, [r2 + 2] + movu m1, [r2 + 18] + movu [r0], m0 + palignr m2, m1, m0, 1 + movu [r0 + r1], m2 + palignr m2, m1, m0, 2 + movu [r0 + 2 * r1], m2 + palignr m2, m1, m0, 3 + movu [r0 + r3], m2 + + INTRA_PRED_ANG32_ALIGNR_STORE 4 + INTRA_PRED_ANG32_ALIGNR_STORE 8 + INTRA_PRED_ANG32_ALIGNR_STORE 12 + + lea r0, [r0 + 4 * r1] + palignr m2, m1, m0, 16 + movu [r0], m2 + movu m0, [r2 + 19] + movu [r0 + r1], m0 + movu m1, [r2 + 35] + palignr m2, m1, m0, 1 + movu [r0 + 2 * r1], m2 + palignr m2, m1, m0, 2 + movu [r0 + r3], m2 + + INTRA_PRED_ANG32_ALIGNR_STORE 3 + INTRA_PRED_ANG32_ALIGNR_STORE 7 + INTRA_PRED_ANG32_ALIGNR_STORE 11 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang32_2, 3, 4,3 + lea r3, [3 * r1] + + movu m0, 
[r2 + 64 + 2] + movu m1, [r2 + 64 + 18] + movu [r0], m0 + palignr m2, m1, m0, 1 + movu [r0 + r1], m2 + palignr m2, m1, m0, 2 + movu [r0 + 2 * r1], m2 + palignr m2, m1, m0, 3 + movu [r0 + r3], m2 + + INTRA_PRED_ANG32_ALIGNR_STORE 4 + INTRA_PRED_ANG32_ALIGNR_STORE 8 + INTRA_PRED_ANG32_ALIGNR_STORE 12 + + lea r0, [r0 + 4 * r1] + palignr m2, m1, m0, 16 + movu [r0], m2 + movu m0, [r2 + 64 + 19] + movu [r0 + r1], m0 + movu m1, [r2 + 64 + 35] + palignr m2, m1, m0, 1 + movu [r0 + 2 * r1], m2 + palignr m2, m1, m0, 2 + movu [r0 + r3], m2 + + INTRA_PRED_ANG32_ALIGNR_STORE 3 + INTRA_PRED_ANG32_ALIGNR_STORE 7 + INTRA_PRED_ANG32_ALIGNR_STORE 11 + RET + +%macro INTRA_PRED_ANG32_STORE 0 + lea r0, [r0 + 4 * r1] + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r3], m0 +%endmacro -;-------------------------------------------------------------------------------- -; void all_angs_pred_16x16(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) -;-------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal all_angs_pred_16x16, 3,4,8 - ; mode 2 +INIT_YMM avx2 +cglobal intra_pred_ang32_26, 3, 4, 1 + lea r3, [3 * r1] + movu m0, [r2 + 1] + movu [r0], m0 + movu [r0 + r1], m0 + movu [r0 + r1 * 2], m0 + movu [r0 + r3], m0 + + INTRA_PRED_ANG32_STORE + INTRA_PRED_ANG32_STORE + INTRA_PRED_ANG32_STORE + INTRA_PRED_ANG32_STORE + INTRA_PRED_ANG32_STORE + INTRA_PRED_ANG32_STORE + INTRA_PRED_ANG32_STORE + RET - movu m0, [r2 + 2 + 32] - movu [r0 + 0 * 16], m0 +%if ARCH_X86_64 == 1 +%macro INTRA_PRED_ANG32_CAL_ROW 0 + pmaddubsw m6, m2, m10 + pmulhrsw m6, m0 + pmaddubsw m7, m3, m10 + pmulhrsw m7, m0 + pmaddubsw m8, m4, m10 + pmulhrsw m8, m0 + pmaddubsw m9, m5, m10 + pmulhrsw m9, m0 + packuswb m6, m7 + packuswb m8, m9 + vperm2i128 m7, m6, m8, 00100000b + vperm2i128 m6, m6, m8, 00110001b +%endmacro - movu m1, m0 - movu m6, [r2 + 18 + 32] - palignr m5, m6, m0, 1 - movu [r0 + 1 * 16], m5 - - movu m4, m5 - - palignr m5, m6, m0, 2 - movu 
[r0 + 2 * 16], m5 - palignr m5, m6, m0, 3 - movu [r0 + 3 * 16], m5 - palignr m5, m6, m0, 4 - movu [r0 + 4 * 16], m5 - palignr m5, m6, m0, 5 - movu [r0 + 5 * 16], m5 - palignr m5, m6, m0, 6 - movu [r0 + 6 * 16], m5 - palignr m5, m6, m0, 7 - movu [r0 + 7 * 16], m5 - - movu m7, m5 - - palignr m5, m6, m0, 8 - movu [r0 + 8 * 16], m5 - - movu m2, m5 - - palignr m5, m6, m0, 9 - movu [r0 + 9 * 16], m5 - - palignr m3, m6, m0, 10 - movu [r0 + 10 * 16], m3 - palignr m3, m6, m0, 11 - movu [r0 + 11 * 16], m3 - palignr m3, m6, m0, 12 - movu [r0 + 12 * 16], m3 - - ; mode 3 [row 15] - movu [r0 + (3-2)*16*16 + 15 * 16], m3 - - palignr m3, m6, m0, 13 - movu [r0 + 13 * 16], m3 - palignr m3, m6, m0, 14 - movu [r0 + 14 * 16], m3 - palignr m3, m6, m0, 15 - movu [r0 + 15 * 16], m3 - - ; mode 3 [row 0] - lea r3, [ang_table] - movu m3, [pw_1024] - movu m0, [r2 + 1 + 32] - punpcklbw m0, m1 - - ; mode 17 [row 8 - second half] - pmaddubsw m1, m0, [r3 + 22 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 248 * 16 + 8], m1 - ; mode 17 [row 8 - second half] end - - pmaddubsw m1, m0, [r3 + 26 * 16] - pmulhrsw m1, m3 - punpcklbw m7, m2 - pmaddubsw m2, m7, [r3 + 26 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 16 * 16], m1 - - ;mode 6 [row 1] - movu [r0 + 65 * 16], m1 - - ; mode 4 [row 0] - pmaddubsw m1, m0, [r3 + 21 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 21 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 32 * 16], m1 - - ; mode 5 [row 0] - pmaddubsw m1, m0, [r3 + 17 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 17 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 48 * 16], m1 - - ; mode 6 [row 0] - pmaddubsw m1, m0, [r3 + 13 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 13 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 64 * 16], m1 - - ; mode 7 [row 0] - pmaddubsw m1, m0, [r3 + 9 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 9 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 80 * 16], m1 - - ; mode 7 [row 1] - pmaddubsw m1, m0, [r3 + 
18 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 18 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 81 * 16], m1 - - ; mode 7 [row 2] - pmaddubsw m1, m0, [r3 + 27 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 27 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 82 * 16], m1 - - ; mode 8 [row 0] - pmaddubsw m1, m0, [r3 + 5 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 5 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 96 * 16], m1 - - ; mode 8 [row 1] - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 10 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 97 * 16], m1 - - ; mode 8 [row 2] - pmaddubsw m1, m0, [r3 + 15 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 15 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 98 * 16], m1 - - ; mode 8 [row 3] - pmaddubsw m1, m0, [r3 + 20 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 20 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 99 * 16], m1 - - ; mode 8 [row 4] - pmaddubsw m1, m0, [r3 + 25 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 25 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 100 * 16], m1 - - ; mode 8 [row 5] - pmaddubsw m1, m0, [r3 + 30 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 30 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 101 * 16], m1 - - ; mode 15 [row 13 - second half] - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 221 * 16 + 8], m1 - ; mode 15 [row 13 - second half] end - - ; mode 15 [row 14 - second half] - pmaddubsw m1, m0, [r3 + 1 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 222 * 16 + 8], m1 - ; mode 15 [row 14 - second half] end - - ; mode 16 [row 10 - second half] - pmaddubsw m1, m0, [r3 + 25 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 234 * 16 + 8], m1 - ; mode 16 [row 10 - second half] end - - ; mode 16 [row 11 - second half] - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 235 * 16 + 8], m1 - ; 
mode 16 [row 11 - second half] end - - ; mode 3 [row 1] - movu m6, [r3 + 20 * 16] - movu m0, [r2 + 2 + 32] - punpcklbw m0, m4 - - ; mode 17 [row 7 - second half] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 247 * 16 + 8], m1 - - ; mode 17 [row 7 - second half] end - pmaddubsw m1, m0, m6 - pmulhrsw m1, m3 - movu m2, [r2 + 10 + 32] - punpcklbw m2, m5 - pmaddubsw m4, m2, m6 - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 17 * 16], m1 - - ;mode 6 [row 3] - movu [r0 + 67 * 16], m1 - - ; mode 4 row [row 1] - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 33 * 16], m1 - - ; mode 4 row [row 2] - pmaddubsw m1, m0, [r3 + 31 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 31 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 34 * 16], m1 - - ; mode 7 [row 6] - movu [r0 + 86 * 16], m1 - - ; mode 5 row [row 1] - pmaddubsw m1, m0, [r3 + 2 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 2 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 49 * 16], m1 - - ; mode 5 row [row 2] - pmaddubsw m1, m0, [r3 + 19 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 19 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 50 * 16], m1 - - ; mode 6 [row 2] - pmaddubsw m1, m0, [r3 + 7 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 7 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 66 * 16], m1 - - ; mode 7 [row 3] - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 83 * 16], m1 - - ; mode 7 [row 4] - pmaddubsw m1, m0, [r3 + 13 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 13 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 84 * 16], m1 - - ; mode 8 [row 8] - movu [r0 + 104 * 16], m1 - - ; mode 7 [row 5] - pmaddubsw m1, m0, [r3 + 22 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 85 * 16], m1 - - ; 
mode 8 [row 6] - pmaddubsw m1, m0, [r3 + 3 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 102 * 16], m1 - - ; mode 8 [row 7] - pmaddubsw m1, m0, [r3 + 8 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 8 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 103 * 16], m1 - - ; mode 8 [row 9] - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 105 * 16], m1 - - ; mode 8 [row 10] - pmaddubsw m1, m0, [r3 + 23 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 23 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 106 * 16], m1 - - ; mode 8 [row 11] - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 107 * 16], m1 - - ; mode 3 [row 2] - movu m0, [r2 + 3 + 32] - movd m1, [r2 + 19 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - - ; mode 17 [row 6 - second half] - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 246 * 16 + 8], m1 - ; mode 17 [row 6 - second half] end - - pmaddubsw m1, m0, [r3 + 14 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 11 + 32] - movd m4, [r2 + 27 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 14 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 18 * 16], m1 - - ; mode 6 [row 5] - movu [r0 + 69 * 16], m1 - - ; mode 4 row [row 3] - pmaddubsw m1, m0, [r3 + 20 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 20 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 35 * 16], m1 - - ; mode 5 row [row 3] - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 51 * 16], m1 - - ; mode 5 row [row 4] - pmaddubsw m1, m0, [r3 + 21 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 21 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 52 * 16], m1 - - ; mode 6 [row 4] - pmaddubsw m1, m0, [r3 + 
1 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 68 * 16], m1 - - ; mode 6 [row 6] - pmaddubsw m1, m0, [r3 + 27 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 27 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 70 * 16], m1 - - ; mode 7 [row 7] - pmaddubsw m1, m0, [r3 + 8 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 8 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 87 * 16], m1 - - ; mode 7 [row 8] - pmaddubsw m1, m0, [r3 + 17 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 17 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 88 * 16], m1 - - ; mode 7 [row 9] - pmaddubsw m1, m0, [r3 + 26 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 89 * 16], m1 - - ; mode 8 [row 12] - pmaddubsw m1, m0, [r3 + 1 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 108 * 16], m1 - - ; mode 8 [row 13] - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 6 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 109 * 16], m1 - - ; mode 8 [row 14] - pmaddubsw m1, m0, [r3 + 11 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 110 * 16], m1 - - ; mode 8 [row 15] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 111 * 16], m1 - - ; mode 3 [row 3] - movu m0, [r2 + 4 + 32] - movd m1, [r2 + 20 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - - ; mode 17 [row 4 - second half] - pmaddubsw m1, m0, [r3 + 30 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 244 * 16 + 8], m1 - ; mode 17 [row 4 - second half] end - - ; mode 17 [row 5 - second half] - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 245 * 16 + 8], m1 - ; mode 17 [row 5 - second half] end - - pmaddubsw m1, m0, [r3 + 8 * 16] - 
pmulhrsw m1, m3 - movu m2, [r2 + 12 + 32] - movd m4, [r2 + 28 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 8 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 19 * 16], m1 - - ; mode 6 [row 7] - movu [r0 + 71 * 16], m1 - - ; mode 4 row [row 4] - pmaddubsw m1, m0, [r3 + 9 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 36 * 16], m1 - - ; mode 4 row [row 5] - pmaddubsw m1, m0, [r3 + 30 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 37 * 16], m1 - - ; mode 7 row [row 13] - movu [r0 + 93 * 16], m1 - - ; mode 5 row [row 5] - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 6 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 53 * 16], m1 - - ; mode 5 row [row 6] - pmaddubsw m1, m0, [r3 + 23 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 23 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 54 * 16], m1 - - ; mode 6 [row 8] - pmaddubsw m1, m0, [r3 + 21 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 21 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 72 * 16], m1 - - ; mode 7 [row 12] - movu [r0 + 92 * 16], m1 - - ; mode 7 [row 10] - pmaddubsw m1, m0, [r3 + 3 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 90 * 16], m1 - - ; mode 7 [row 11] - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 91 * 16], m1 - - ; mode 3 [row 4] - movu m0, [r2 + 5 + 32] - movd m1, [r2 + 20 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - - ; mode 17 [row 3 - second half] - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 243 * 16 + 8], m1 - - ; mode 17 [row 3 - second half] end - pmaddubsw m1, m0, [r3 + 2 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 13 + 32] - movd m4, [r2 + 29 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - 
pmaddubsw m4, m2, [r3 + 2 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 20 * 16], m1 - - ;mode 6 [row 9] - movu [r0 + 73 * 16], m1 - - ; mode 4 row [row 6] - movu m6, [r3 + 19 * 16] - pmaddubsw m1, m0, m6 - pmulhrsw m1, m3 - pmaddubsw m4, m2, m6 - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 38 * 16], m1 - - ; mode 3 [row 5] - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 21 * 16], m1 - - ;mode 6 [row 11] - movu [r0 + 75 * 16], m1 - - ; mode 5 row [row 7] - pmaddubsw m1, m0, [r3 + 8 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 8 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 55 * 16], m1 - - ; mode 5 row [row 8] - pmaddubsw m1, m0, [r3 + 25 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 56 * 16], m1 - - ; mode 6 [row 10] - pmaddubsw m1, m0, [r3 + 15 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 15 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 74 * 16], m1 - - ; mode 7 [row 14] - pmaddubsw m1, m0, [r3 + 7 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 7 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 94 * 16], m1 - - ; mode 7 [row 15] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 95 * 16], m1 - - ; mode 3 [row 6] - movu m0, [r2 + 6 + 32] - movd m1, [r2 + 22 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - - ; mode 17 [row 2 - second half] - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 242 * 16 + 8], m1 - ; mode 17 [row 2 - second half] end - - pmaddubsw m1, m0, [r3 + 22 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 14 + 32] - movd m4, [r2 + 30 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 22 * 16], m1 - - ; mode 6 [row 13] - movu [r0 + 77 * 16], m1 - - ; mode 4 row [row 7] 
- pmaddubsw m1, m0, [r3 + 8 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 8 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 39 * 16], m1 - - ; mode 4 row [row 8] - pmaddubsw m1, m0, [r3 + 29 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 40 * 16], m1 - - ; mode 5 row [row 9] - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 57 * 16], m1 - - ; mode 5 row [row 10] - pmaddubsw m1, m0, [r3 + 27 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 27 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 58 * 16], m1 - - ; mode 6 [row 12] - pmaddubsw m1, m0, [r3 + 9 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 76 * 16], m1 - - ; mode 3 [row 7] - movu m0, [r2 + 7 + 32] - movd m1, [r2 + 27 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - - ; mode 17 [row 1 - second half] - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 241 * 16 + 8], m1 - ; mode 17 [row 1 - second half] end - - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 15 + 32] - movd m4, [r2 + 25 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 23 * 16], m1 - - ; mode 6 [row 15] - movu [r0 + 79 * 16], m1 - - ; mode 4 row [row 9] - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 41 * 16], m1 - - ; mode 5 row [row 11] - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 59 * 16], m1 - - ; mode 5 row [row 12] - pmaddubsw m1, m0, [r3 + 29 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 60 * 16], m1 - - ; mode 6 [row 14] - pmaddubsw m1, m0, [r3 + 3 
* 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 78 * 16], m1 - - ; mode 3 [row 8] - movu m0, [r2 + 8 + 32] - movd m1, [r2 + 24 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 16 + 32] - psrldq m4, m2, 1 - pinsrb m4, [r2 + 32], 15 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 24 * 16], m1 - - ; mode 4 row [row 10] - pmaddubsw m1, m0, [r3 + 7 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 7 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 42 * 16], m1 - - ; mode 4 row [row 11] - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 43 * 16], m1 - - ; mode 5 row [row 13] - pmaddubsw m1, m0, [r3 + 14 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 14 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 61 * 16], m1 - - ; mode 5 row [row 14] - pmaddubsw m1, m0, [r3 + 31 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 31 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 62 * 16], m1 - - ; mode 3 [row 9] - movu m0, [r2 + 9 + 32] - movd m1, [r2 + 16 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 17 + 32] - movd m4, [r2 + 33 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 25 * 16], m1 - - ; mode 4 row [row 12] - pmaddubsw m1, m0, [r3 + 17 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 17 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 44 * 16], m1 - - ; mode 3 [row 10] - pmaddubsw m1, m0, [r3 + 30 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 26 * 16], m1 - - ; mode 5 row [row 15] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 16 * 16] - pmulhrsw 
m4, m3 - packuswb m1, m4 - movu [r0 + 63 * 16], m1 - - ; mode 3 [row 11] - movu m0, [r2 + 10 + 32] - movd m1, [r2 + 26 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 18 + 32] - movd m4, [r2 + 34 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 27 * 16], m1 - - ; mode 4 row [row 13] - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 6 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 45 * 16], m1 - - ; mode 4 row [row 14] - pmaddubsw m1, m0, [r3 + 27 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 27 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 46 * 16], m1 - - ; mode 3 [row 12] - movu m0, [r2 + 11 + 32] - movd m1, [r2 + 27 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 19 + 32] - movd m4, [r2 + 35 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 28 * 16], m1 - - ; mode 4 row [row 15] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m2, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 47 * 16], m1 - - ; mode 3 [row 13] - movu m0, [r2 + 12 + 32] - movd m1, [r2 + 28 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 20 + 32] - movd m4, [r2 + 36 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 29 * 16], m1 - - ; mode 3 [row 14] - movu m0, [r2 + 13 + 32] - movd m1, [r2 + 29 + 32] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - movu m2, [r2 + 21 + 32] - movd m4, [r2 + 37 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m2, [r3 + 6 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 30 * 16], m1 - - ; 
mode 9 - movu m0, [r1 + 1 + 32] - movd m1, [r1 + 17 + 32] - palignr m1, m0, 1 - - ; mode 9 [row 15] - movu [r0 + 127 * 16], m1 - - ; mode 9 [row 0] - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 2 * 16] - pmulhrsw m1, m3 - movu m7, [r1 + 9 + 32] - movd m4, [r2 + 25 + 32] - palignr m2, m7, 1 - punpcklbw m7, m2 - pmaddubsw m2, m7, [r3 + 2 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 112 * 16], m1 - - ; mode 9 [row 1] - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 4 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 113 * 16], m1 - - ; mode 9 [row 2] - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 6 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 114 * 16], m1 - - ; mode 9 [row 3] - pmaddubsw m1, m0, [r3 + 8 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 8 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 115 * 16], m1 - - ; mode 9 [row 4] - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 10 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 116 * 16], m1 - - ; mode 9 [row 5] - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 12 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 117 * 16], m1 - - ; mode 9 [row 6] - pmaddubsw m1, m0, [r3 + 14 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 14 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 118 * 16], m1 - - ; mode 9 [row 7] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 16 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 119 * 16], m1 - - ; mode 9 [row 8] - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 18 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 120 * 16], m1 - - ; mode 9 [row 9] - pmaddubsw m1, m0, [r3 + 20 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 20 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 121 * 16], m1 - - ; mode 9 [row 10] - pmaddubsw m1, m0, [r3 + 22 
* 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 22 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 122 * 16], m1 - - ; mode 9 [row 11] - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 24 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 123 * 16], m1 - - ; mode 9 [row 12] - pmaddubsw m1, m0, [r3 + 26 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 26 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 124 * 16], m1 - - ; mode 9 [row 13] - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 28 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 125 * 16], m1 - - ; mode 9 [row 14] - pmaddubsw m1, m0, [r3 + 30 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 30 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 126 * 16], m1 - - ; mode 10 - movu m1, [r1 + 1 + 32] - movu [r0 + 128 * 16], m1 - movu [r0 + 129 * 16], m1 - movu [r0 + 130 * 16], m1 - movu [r0 + 131 * 16], m1 - movu [r0 + 132 * 16], m1 - movu [r0 + 133 * 16], m1 - movu [r0 + 134 * 16], m1 - movu [r0 + 135 * 16], m1 - movu [r0 + 136 * 16], m1 - movu [r0 + 137 * 16], m1 - movu [r0 + 138 * 16], m1 - movu [r0 + 139 * 16], m1 - movu [r0 + 140 * 16], m1 - movu [r0 + 141 * 16], m1 - movu [r0 + 142 * 16], m1 - movu [r0 + 143 * 16], m1 - - pxor m0, m0 - pshufb m1, m1, m0 - punpcklbw m1, m0 - pinsrb m2, [r1], 0 - pshufb m2, m2, m0 - punpcklbw m2, m0 - movu m4, [r1 + 1] - punpcklbw m5, m4, m0 - punpckhbw m4, m0 - psubw m5, m2 - psubw m4, m2 - psraw m5, 1 - psraw m4, 1 - paddw m5, m1 - paddw m4, m1 - packuswb m5, m4 - - pextrb [r0 + 128 * 16], m5, 0 - pextrb [r0 + 129 * 16], m5, 1 - pextrb [r0 + 130 * 16], m5, 2 - pextrb [r0 + 131 * 16], m5, 3 - pextrb [r0 + 132 * 16], m5, 4 - pextrb [r0 + 133 * 16], m5, 5 - pextrb [r0 + 134 * 16], m5, 6 - pextrb [r0 + 135 * 16], m5, 7 - pextrb [r0 + 136 * 16], m5, 8 - pextrb [r0 + 137 * 16], m5, 9 - pextrb [r0 + 138 * 16], m5, 10 - pextrb [r0 + 139 * 16], m5, 11 - pextrb [r0 + 140 * 16], m5, 12 - pextrb 
[r0 + 141 * 16], m5, 13 - pextrb [r0 + 142 * 16], m5, 14 - pextrb [r0 + 143 * 16], m5, 15 - - ; mode 11 - movu m0, [r1 + 32] - pinsrb m0, [r1], 0 - - ; mode 11 [row 15] - movu [r0 + 159 * 16], m0 - - ; mode 11 [row 0] - movu m1, [r1 + 1 + 32] - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 30 * 16] - pmulhrsw m1, m3 - movu m7, [r1 + 8 + 32] - movu m2, [r1 + 9 + 32] - punpcklbw m7, m2 - pmaddubsw m2, m7, [r3 + 30 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 144 * 16], m1 - - ; mode 11 [row 1] - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 28 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 145 * 16], m1 - - ; mode 11 [row 2] - pmaddubsw m1, m0, [r3 + 26 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 26 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 146 * 16], m1 - - ; mode 11 [row 3] - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 24 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 147 * 16], m1 - - ; mode 11 [row 4] - pmaddubsw m1, m0, [r3 + 22 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 22 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 148 * 16], m1 - - ; mode 11 [row 5] - pmaddubsw m1, m0, [r3 + 20 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 20 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 149 * 16], m1 - - ; mode 11 [row 6] - pmaddubsw m1, m0, [r3 + 18 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 18 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 150 * 16], m1 - - ; mode 11 [row 7] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 16 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 151 * 16], m1 - - ; mode 11 [row 8] - pmaddubsw m1, m0, [r3 + 14 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 14 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 152 * 16], m1 - - ; mode 11 [row 9] - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 12 * 16] - pmulhrsw m2, m3 - 
packuswb m1, m2 - movu [r0 + 153 * 16], m1 - - ; mode 11 [row 10] - pmaddubsw m1, m0, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 10 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 154 * 16], m1 - - ; mode 11 [row 11] - pmaddubsw m1, m0, [r3 + 8 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 8 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 155 * 16], m1 - - ; mode 11 [row 12] - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 6 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 156 * 16], m1 - - ; mode 11 [row 13] - pmaddubsw m1, m0, [r3 + 4 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 4 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 157 * 16], m1 - - ; mode 11 [row 14] - pmaddubsw m1, m0, [r3 + 2 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 2 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 158 * 16], m1 - - ; mode 12 [row 0] - movu m0, [r2 + 32] - pinsrb m0, [r2], 0 - movu m1, [r2 + 1 + 32] - punpcklbw m0, m1 - pmaddubsw m1, m0, [r3 + 27 * 16] - pmulhrsw m1, m3 - movu m7, [r2 + 8 + 32] - movd m2, [r2 + 24 + 32] - palignr m2, m7, 1 - punpcklbw m7, m2 - pmaddubsw m2, m7, [r3 + 27 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 160 * 16], m1 - - ; mode 12 [row 1] - pmaddubsw m1, m0, [r3 + 22 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 22 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 161 * 16], m1 - - ; mode 12 [row 2] - pmaddubsw m1, m0, [r3 + 17 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 17 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 162 * 16], m1 - - ; mode 12 [row 3] - pmaddubsw m1, m0, [r3 + 12 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 12 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 163 * 16], m1 - - ; mode 12 [row 4] - pmaddubsw m1, m0, [r3 + 7 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 7 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 164 * 16], m1 - - ; mode 12 [row 5] - pmaddubsw m1, m0, [r3 + 2 * 16] 
- pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 2 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 165 * 16], m1 - - ; mode 13 [row 0] - pmaddubsw m1, m0, [r3 + 23 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 23 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 176 * 16], m1 - - ; mode 13 [row 1] - pmaddubsw m1, m0, [r3 + 14 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 14 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 177 * 16], m1 - - ; mode 13 [row 2] - pmaddubsw m1, m0, [r3 + 5 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 5 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 178 * 16], m1 - - ; mode 14 [row 0] - pmaddubsw m1, m0, [r3 + 19 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 19 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 192 * 16], m1 - - ; mode 14 [row 1] - pmaddubsw m1, m0, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 6 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 193 * 16], m1 - - ; mode 17 [row 0] - movu [r0 + 240 * 16], m1 - - ; mode 15 [row 0] - pmaddubsw m1, m0, [r3 + 15 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 15 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 208 * 16], m1 - - ; mode 15 [row 15 - second half] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 223 * 16 + 8], m1 - ; mode 15 [row 15 - second half] end - - ; mode 16 [row 0] - pmaddubsw m1, m0, [r3 + 11 * 16] - pmulhrsw m1, m3 - pmaddubsw m2, m7, [r3 + 11 * 16] - pmulhrsw m2, m3 - packuswb m1, m2 - movu [r0 + 224 * 16], m1 - - ; mode 17 [row 9 - second half] - pmaddubsw m1, m0, [r3 + 28 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 249 * 16 + 8], m1 - ; mode 17 [row 9 - second half] end - - ; mode 17 [row 10 - second half] - pmaddubsw m1, m0, [r3 + 2 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 250 * 16 + 8], m1 - ; mode 17 [row 10 - second half] end - - ; mode 17 [row 1 - first half] - pslldq m6, m0, 2 - pinsrb m6, [r2], 1 - pinsrb m6, [r2 + 1], 
0 - pmaddubsw m1, m6, [r3 + 12 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 241 * 16], m1 - - ; mode 17 [row 11 - second half] - pmaddubsw m1, m6, [r3 + 8 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 251 * 16 + 8], m1 - ; mode 17 [row 11 - second half] end - - ; mode 17 [row 2 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 1], 1 - pinsrb m6, [r2 + 2], 0 - pmaddubsw m1, m6, [r3 + 18 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 242 * 16], m1 - - ; mode 17 [row 12 - second half] - pmaddubsw m1, m6, [r3 + 14 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 252 * 16 + 8], m1 - ; mode 17 [row 12 - second half] end - - ; mode 17 [row 3 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 2], 1 - pinsrb m6, [r2 + 4], 0 - pmaddubsw m1, m6, [r3 + 24 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 243 * 16], m1 - - ; mode 17 [row 13 - first half] - pmaddubsw m1, m6, [r3 + 20 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 253 * 16 + 8], m1 - - ; mode 17 [row 4 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 4], 1 - pinsrb m6, [r2 + 5], 0 - pmaddubsw m1, m6, [r3 + 30 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 244 * 16], m1 - - ; mode 17 [row 5 - first half] - pmaddubsw m1, m6, [r3 + 4 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 245 * 16], m1 - - ; mode 17 [row 14 - second half] - pmaddubsw m1, m6, [r3 + 26 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 254 * 16 + 8], m1 - ; mode 17 [row 14 - second half] end - - ; mode 17 [row 6 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 5], 1 - pinsrb m6, [r2 + 6], 0 - pmaddubsw m1, m6, [r3 + 10 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 246 * 16], m1 - - ; mode 17 [row 7 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 6], 1 - pinsrb m6, [r2 + 7], 0 - pmaddubsw m1, m6, [r3 + 16 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 247 * 16], m1 - - ; mode 17 [row 8 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 7], 1 - pinsrb m6, [r2 + 9], 0 - 
pmaddubsw m1, m6, [r3 + 22 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 248 * 16], m1 - - ; mode 17 [row 9 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 9], 1 - pinsrb m6, [r2 + 10], 0 - pmaddubsw m1, m6, [r3 + 28 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 249 * 16], m1 - - ; mode 17 [row 10 - first half] - pmaddubsw m1, m6, [r3 + 2 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 250 * 16], m1 - - ; mode 17 [row 11 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 10], 1 - pinsrb m6, [r2 + 11], 0 - pmaddubsw m1, m6, [r3 + 8 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 251 * 16], m1 - - ; mode 17 [row 12 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 11], 1 - pinsrb m6, [r2 + 12], 0 - pmaddubsw m1, m6, [r3 + 14 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 252 * 16], m1 - - ; mode 17 [row 13 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 12], 1 - pinsrb m6, [r2 + 14], 0 - pmaddubsw m1, m6, [r3 + 20 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 253 * 16], m1 - - ; mode 17 [row 14 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 14], 1 - pinsrb m6, [r2 + 15], 0 - pmaddubsw m1, m6, [r3 + 26 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 254 * 16], m1 - - ; mode 16 [row 12 - second half] - pmaddubsw m1, m0, [r3 + 15 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 236 * 16 + 8], m1 - ; mode 16 [row 12 - second half] - - ; mode 12 [row 6] - pslldq m2, m0, 2 - pinsrb m2, [r2], 1 - pinsrb m2, [r2 + 6], 0 - pmaddubsw m1, m2, [r3 + 29 * 16] - pmulhrsw m1, m3 - movu m0, [r2 + 7 + 32] - psrldq m4, m0, 1 - punpcklbw m0, m4 - pmaddubsw m4, m0, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 166 * 16], m1 - - ; mode 12 [row 7] - pmaddubsw m1, m2, [r3 + 24 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 167 * 16], m1 - - ; mode 12 [row 8] - pmaddubsw m1, m2, [r3 + 19 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 19 * 16] - pmulhrsw 
m4, m3 - packuswb m1, m4 - movu [r0 + 168 * 16], m1 - - ; mode 12 [row 9] - pmaddubsw m1, m2, [r3 + 14 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 14 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 169 * 16], m1 - - ; mode 12 [row 10] - pmaddubsw m1, m2, [r3 + 9 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 170 * 16], m1 - - ; mode 12 [row 11] - pmaddubsw m1, m2, [r3 + 4 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 171 * 16], m1 - - ; mode 13 [row 3] - pinsrb m7, m2, [r2 + 4], 0 - pmaddubsw m1, m7, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 179 * 16], m1 - - ; mode 13 [row 4] - pmaddubsw m1, m7, [r3 + 19 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 19 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 180 * 16], m1 - - ; mode 13 [row 5] - pmaddubsw m1, m7, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 181 * 16], m1 - - ; mode 13 [row 6] - pmaddubsw m1, m7, [r3 + 1 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 182 * 16], m1 - - ; mode 14 [row 2] - pinsrb m5, m7, [r2 + 2], 0 - pmaddubsw m1, m5, [r3 + 25 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 194 * 16], m1 - - ; mode 14 [row 3] - pmaddubsw m1, m5, [r3 + 12 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 195 * 16], m1 - - ; mode 15 [row 1] - pmaddubsw m1, m5, [r3 + 30 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 209 * 16], m1 - - ; mode 15 [row 2] - pmaddubsw m1, m5, [r3 + 13 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 13 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 210 
* 16], m1 - - ; mode 16 [row 1] - pmaddubsw m1, m5, [r3 + 22 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 225 * 16], m1 - - ; mode 16 [row 2] - pmaddubsw m1, m5, [r3 + 1 * 16] - pmulhrsw m1, m3 - pmaddubsw m4, m0, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m1, m4 - movu [r0 + 226 * 16], m1 - - ; mode 16 [row 13 - second half] - pmaddubsw m1, m5, [r3 + 26 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 237 * 16 + 8], m1 - ; mode 16 [row 13 - second half] - - ; mode 16 [row 14 - second half] - pmaddubsw m1, m5, [r3 + 5 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 238 * 16 + 8], m1 - ; mode 16 [row 14 - second half] - - ; mode 16 [row 3] - pslldq m6, m5, 2 - pinsrb m6, [r2 + 2], 1 - pinsrb m6, [r2 + 3], 0 - pmaddubsw m1, m6, [r3 + 12 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 227 * 16], m1 - - ; mode 16 [row 15 - second half] - pmaddubsw m1, m6, [r3 + 16 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 239 * 16 + 8], m1 - ; mode 16 [row 15 - second half] end - - ; mode 16 [row 4- first half] - pslldq m6, 2 - pinsrb m6, [r2 + 3], 1 - pinsrb m6, [r2 + 5], 0 - pmaddubsw m1, m6, [r3 + 23 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 228 * 16], m1 - - ; mode 16 [row 5- first half] - pmaddubsw m1, m6, [r3 + 2 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 229 * 16], m1 - - ; mode 16 [row 6- first half] - pslldq m6, 2 - pinsrb m6, [r2 + 5], 1 - pinsrb m6, [r2 + 6], 0 - pmaddubsw m1, m6, [r3 + 13 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 230 * 16], m1 - - ; mode 16 [row 7- first half] - pslldq m6, 2 - pinsrb m6, [r2 + 6], 1 - pinsrb m6, [r2 + 8], 0 - pmaddubsw m1, m6, [r3 + 24 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 231 * 16], m1 - - ; mode 16 [row 8- first half] - pmaddubsw m1, m6, [r3 + 3 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 232 * 16], m1 - ; mode 19 [row 0 - second half] end - - ; mode 16 [row 9- first half] - 
pslldq m6, 2 - pinsrb m6, [r2 + 8], 1 - pinsrb m6, [r2 + 9], 0 - pmaddubsw m1, m6, [r3 + 14 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 233 * 16], m1 - - ; mode 16 [row 10 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 9], 1 - pinsrb m6, [r2 + 11], 0 - pmaddubsw m1, m6, [r3 + 25 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 234 * 16], m1 - - ; mode 16 [row 11 - first half] - pmaddubsw m1, m6, [r3 + 4 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 235 * 16], m1 - - ; mode 16 [row 12 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 11], 1 - pinsrb m6, [r2 + 12], 0 - pmaddubsw m1, m6, [r3 + 15 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 236 * 16], m1 - - ; mode 16 [row 13 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 12], 1 - pinsrb m6, [r2 + 14], 0 - pmaddubsw m1, m6, [r3 + 26 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 237 * 16], m1 - - ; mode 16 [row 14 - first half] - pmaddubsw m1, m6, [r3 + 5 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 238 * 16], m1 - - ; mode 16 [row 15 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 14], 1 - pinsrb m6, [r2 + 15], 0 - pmaddubsw m1, m6, [r3 + 16 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 239 * 16], m1 - - ; mode 14 [row 4] - pslldq m5, 2 - pinsrb m5, [r2 + 2], 1 - pinsrb m5, [r2 + 5], 0 - movu m4, [r2 + 6 + 32] - psrldq m0, m4, 1 - punpcklbw m4, m0 - - ; mode 16 [row 3 - second half] - pmaddubsw m1, m4, [r3 + 12 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 227 * 16 + 8], m1 - - ; mode 16 [row 3 - second half] end - pmaddubsw m1, m5, [r3 + 31 * 16] - pmulhrsw m1, m3 - pmaddubsw m0, m4, [r3 + 31 * 16] - pmulhrsw m0, m3 - packuswb m1, m0 - movu [r0 + 196 * 16], m1 - - ; mode 14 [row 5] - pmaddubsw m1, m5, [r3 + 18 * 16] - pmulhrsw m1, m3 - pmaddubsw m0, m4, [r3 + 18 * 16] - pmulhrsw m0, m3 - packuswb m1, m0 - movu [r0 + 197 * 16], m1 - - ; mode 14 [row 6] - pmaddubsw m1, m5, [r3 + 5 * 16] - pmulhrsw m1, m3 - pmaddubsw m0, m4, [r3 + 5 * 16] - 
pmulhrsw m0, m3 - packuswb m1, m0 - movu [r0 + 198 * 16], m1 - - ; mode 15 [row 3] - movu m6, m5 - pinsrb m6, [r2 + 4], 0 - pmaddubsw m1, m6, [r3 + 28 * 16] - pmulhrsw m1, m3 - pmaddubsw m0, m4, [r3 + 28 * 16] - pmulhrsw m0, m3 - packuswb m1, m0 - movu [r0 + 211 * 16], m1 - - ; mode 15 [row 4] - pmaddubsw m1, m6, [r3 + 11 * 16] - pmulhrsw m1, m3 - pmaddubsw m0, m4, [r3 + 11 * 16] - pmulhrsw m0, m3 - packuswb m1, m0 - movu [r0 + 212 * 16], m1 - - ; mode 15 [row 5 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 4], 1 - pinsrb m6, [r2 + 6], 0 - pmaddubsw m1, m6, [r3 + 26 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 213 * 16], m1 - - ; mode 15 [row 6 - first half] - pmaddubsw m1, m6, [r3 + 9 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 214 * 16], m1 - - ; mode 15 [row 7 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 6], 1 - pinsrb m6, [r2 + 8], 0 - pmaddubsw m1, m6, [r3 + 24 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 215 * 16], m1 - - ; mode 15 [row 8 - first half] - pmaddubsw m1, m6, [r3 + 7 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 216 * 16], m1 - - ; mode 15 [row 9 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 8], 1 - pinsrb m6, [r2 + 9], 0 - pmaddubsw m1, m6, [r3 + 22 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 217 * 16], m1 - - ; mode 15 [row 10 - first half] - pmaddubsw m1, m6, [r3 + 5 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 218 * 16], m1 - - ; mode 15 [row 11 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 9], 1 - pinsrb m6, [r2 + 11], 0 - pmaddubsw m1, m6, [r3 + 20 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 219 * 16], m1 - - ; mode 15 [row 12 - first half] - pmaddubsw m1, m6, [r3 + 3 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 220 * 16], m1 - - ; mode 15 [row 13 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 11], 1 - pinsrb m6, [r2 + 13], 0 - pmaddubsw m1, m6, [r3 + 18 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 221 * 16], m1 - - ; mode 15 [row 14 - 
first half] - pmaddubsw m1, m6, [r3 + 1 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 222 * 16], m1 - - ; mode 15 [row 15 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 13], 1 - pinsrb m6, [r2 + 15], 0 - pmaddubsw m1, m6, [r3 + 16 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 223 * 16], m1 - - ; mode 14 [row 7] - pslldq m5, 2 - pinsrb m5, [r2 + 5], 1 - pinsrb m5, [r2 + 7], 0 - movu m0, [r2 + 5 + 32] - psrldq m6, m0, 1 - punpcklbw m0, m6 - - ; mode 15 [row 5 - second half] - pmaddubsw m1, m0, [r3 + 26 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 213 * 16 + 8], m1 - ; mode 15 [row 5 - second half] end - - ; mode 15 [row 6 - second half] - pmaddubsw m1, m0, [r3 + 9 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 214 * 16 + 8], m1 - ; mode 15 [row 6 - second half] end - - ; mode 16 [row 4 - second half] - pmaddubsw m1, m0, [r3 + 23 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 228 * 16 + 8], m1 - ; mode 16 [row 4 - second half] end - - ; mode 16 [row 5 - second half] - pmaddubsw m1, m0, [r3 + 2 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 229 * 16 + 8], m1 - - ; mode 16 [row 5 - second half] end - pmaddubsw m1, m5, [r3 + 24 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 24 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 199 * 16], m1 - - ; mode 14 [row 8] - pmaddubsw m1, m5, [r3 + 11 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 11 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 200 * 16], m1 - - ; mode 14 [row 9] - pslldq m5, 2 - pinsrb m5, [r2 + 7], 1 - pinsrb m5, [r2 + 10], 0 - movu m0, [r2 + 4 + 32] - psrldq m6, m0, 1 - punpcklbw m0, m6 - - ; mode 15 [row 7 - second half] - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 215 * 16 + 8], m1 - ; mode 15 [row 7 - second half] end - - ; mode 15 [row 8 - second half] - pmaddubsw m1, m0, [r3 + 7 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 216 * 16 + 8], m1 - ; mode 15 [row 8 - second half] end - - ; 
mode 16 [row 6 - second half] - pmaddubsw m1, m0, [r3 + 13 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 230 * 16 + 8], m1 - ; mode 16 [row 6 - second half] end - - ; mode 15 [row 6 - second half] end - pmaddubsw m1, m5, [r3 + 30 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 30 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 201 * 16], m1 - - ; mode 14 [row 10] - pmaddubsw m1, m5, [r3 + 17 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 17 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 202 * 16], m1 - - ; mode 14 [row 11] - pmaddubsw m1, m5, [r3 + 4 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 4 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 203 * 16], m1 - - ; mode 14 [row 12] - pslldq m5, 2 - pinsrb m5, [r2 + 10], 1 - pinsrb m5, [r2 + 12], 0 - movu m0, [r2 + 3 + 32] - psrldq m6, m0, 1 - punpcklbw m0, m6 - - ; mode 15 [row 9 - second half] - pmaddubsw m1, m0, [r3 + 22 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 217 * 16 + 8], m1 - ; mode 15 [row 9 - second half] end - - ; mode 15 [row 10 - second half] - pmaddubsw m1, m0, [r3 + 5 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 218 * 16 + 8], m1 - ; mode 15 [row 10 - second half] end - - ; mode 16 [row 7 - second half] - pmaddubsw m1, m0, [r3 + 24 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 231 * 16 + 8], m1 - ; mode 16 [row 7 - second half] end - - ; mode 16 [row 8 - second half] - pmaddubsw m1, m0, [r3 + 3 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 232 * 16 + 8], m1 - ; mode 16 [row 8 - second half] end - - pmaddubsw m1, m5, [r3 + 23 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 23 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 204 * 16], m1 - - ; mode 14 [row 13] - pmaddubsw m1, m5, [r3 + 10 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 10 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 205 * 16], m1 - - ; mode 14 [row 14] - pslldq m5, 2 - pinsrb m5, [r2 + 12], 1 - pinsrb m5, [r2 + 15], 0 - movu m0, [r2 + 2 
+ 32] - psrldq m6, m0, 1 - punpcklbw m0, m6 - - ; mode 15 [row 11 - second half] - pmaddubsw m1, m0, [r3 + 20 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 219 * 16 + 8], m1 - ; mode 15 [row 11 - second half] end - - ; mode 15 [row 12 - second half] - pmaddubsw m1, m0, [r3 + 3 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 220 * 16 + 8], m1 - ; mode 15 [row 12 - second half] end - - ; mode 16 [row 9 - second half] - pmaddubsw m1, m0, [r3 + 14 * 16] - pmulhrsw m1, m3 - packuswb m1, m1 - movh [r0 + 233 * 16 + 8], m1 - - ; mode 16 [row 9 - second half] end - pmaddubsw m1, m5, [r3 + 29 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 29 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 206 * 16], m1 - - ; mode 14 [row 15] - pmaddubsw m1, m5, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m6, m0, [r3 + 16 * 16] - pmulhrsw m6, m3 - packuswb m1, m6 - movu [r0 + 207 * 16], m1 - - ; mode 12 [row 12] - pslldq m0, m2, 2 - pinsrb m0, [r2 + 6], 1 - pinsrb m0, [r2 + 13], 0 - pmaddubsw m1, m0, [r3 + 31 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 31 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 172 * 16], m1 - - ; mode 12 [row 13] - pmaddubsw m1, m0, [r3 + 26 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 26 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 173 * 16], m1 - - ; mode 12 [row 14] - pmaddubsw m1, m0, [r3 + 21 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 21 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 174 * 16], m1 - - ; mode 12 [row 15] - pmaddubsw m1, m0, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 175 * 16], m1 - - ; mode 13 [row 7] - pslldq m7, 2 - pinsrb m7, [r2 + 4], 1 - pinsrb m7, [r2 + 7], 0 - pmaddubsw m1, m7, [r3 + 24 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 24 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 183 * 16], m1 - - ; mode 13 [row 8] - pmaddubsw m1, m7, [r3 + 15 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, 
m4, [r3 + 15 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 184 * 16], m1 - - ; mode 13 [row 9] - pmaddubsw m1, m7, [r3 + 6 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 185 * 16], m1 - - ; mode 13 [row 10] - pslldq m7, 2 - pinsrb m7, [r2 + 7], 1 - pinsrb m7, [r2 + 11], 0 - pmaddubsw m1, m7, [r3 + 29 * 16] - pmulhrsw m1, m3 - movu m4, [r2 + 5 + 32] - psrldq m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, [r3 + 29 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 186 * 16], m1 - - ; mode 13 [row 11] - pmaddubsw m1, m7, [r3 + 20 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 20 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 187 * 16], m1 - - ; mode 13 [row 12] - pmaddubsw m1, m7, [r3 + 11 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 11 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 188 * 16], m1 - - ; mode 13 [row 13] - pmaddubsw m1, m7, [r3 + 2 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 2 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 189 * 16], m1 - - ; mode 13 [row 14] - pslldq m7, 2 - pinsrb m7, [r2 + 11], 1 - pinsrb m7, [r2 + 14], 0 - pmaddubsw m1, m7, [r3 + 25 * 16] - pmulhrsw m1, m3 - movu m4, [r2 + 4 + 32] - psrldq m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, [r3 + 25 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 190 * 16], m1 - - ; mode 13 [row 15] - pmaddubsw m1, m7, [r3 + 16 * 16] - pmulhrsw m1, m3 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m1, m5 - movu [r0 + 191 * 16], m1 - - ; mode 17 [row 15] - movu m0, [r2] - pshufb m1, m0, [tab_S1] - movu [r0 + 255 * 16], m1 - movu m2, [r2 + 32] - pinsrb m2, [r2], 0 - movd [r0 + 255 * 16 + 12], m2 - - ; mode 18 [row 0] - movu [r0 + 256 * 16], m0 - - ; mode 18 [row 1] - pslldq m4, m0, 1 - pinsrb m4, [r2 + 1 + 32], 0 - movu [r0 + 257 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 2 + 32], 0 - movu [r0 + 258 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 3 + 32], 0 - movu [r0 + 
259 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 4 + 32], 0 - movu [r0 + 260 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 5 + 32], 0 - movu [r0 + 261 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 6 + 32], 0 - movu [r0 + 262 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 7 + 32], 0 - movu [r0 + 263 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 8 + 32], 0 - movu [r0 + 264 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 9 + 32], 0 - movu [r0 + 265 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 10 + 32], 0 - movu [r0 + 266 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 11 + 32], 0 - movu [r0 + 267 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 12 + 32], 0 - movu [r0 + 268 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 13 + 32], 0 - movu [r0 + 269 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 14 + 32], 0 - movu [r0 + 270 * 16], m4 - pslldq m4, 1 - pinsrb m4, [r2 + 15 + 32], 0 - movu [r0 + 271 * 16], m4 - - ; mode 19 [row 0] - psrldq m2, m0, 1 - punpcklbw m0, m2 - movu m5, [r2 + 8] - psrldq m6, m5, 1 - punpcklbw m5, m6 - pmaddubsw m4, m0, [r3 + 6 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 6 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 272 * 16], m4 - - ; mode 20 [row 0] - pmaddubsw m4, m0, [r3 + 11 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 11 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 288 * 16], m4 - - ; mode 21 [row 0] - pmaddubsw m4, m0, [r3 + 15 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 15 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 304 * 16], m4 - - ; mode 22 [row 0] - pmaddubsw m4, m0, [r3 + 19 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 19 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 320 * 16], m4 - - ; mode 22 [row 1] - pmaddubsw m4, m0, [r3 + 6 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 6 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 321 * 16], m4 - - ; mode 23 [row 0] - pmaddubsw m4, m0, [r3 + 23 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 23 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 
336 * 16], m4 - - ; mode 23 [row 1] - pmaddubsw m4, m0, [r3 + 14 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 14 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 337 * 16], m4 - - ; mode 23 [row 2] - pmaddubsw m4, m0, [r3 + 5 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 5 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 338 * 16], m4 - - ; mode 24 [row 0] - pmaddubsw m4, m0, [r3 + 27 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 27 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 352 * 16], m4 - - ; mode 24 [row 1] - pmaddubsw m4, m0, [r3 + 22 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 22 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 353 * 16], m4 - - ; mode 24 [row 2] - pmaddubsw m4, m0, [r3 + 17 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 17 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 354 * 16], m4 - - ; mode 24 [row 3] - pmaddubsw m4, m0, [r3 + 12 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 12 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 355 * 16], m4 - - ; mode 24 [row 4] - pmaddubsw m4, m0, [r3 + 7 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 7 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 356 * 16], m4 - - ; mode 24 [row 5] - pmaddubsw m4, m0, [r3 + 2 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 2 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 357 * 16], m4 - - ; mode 24 [row 6 - first half] - pslldq m7, m0, 2 - pinsrb m7, [r2 + 0], 1 - pinsrb m7, [r2 + 6 + 32], 0 - pmaddubsw m4, m7, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 358 * 16], m4 - - ; mode 24 [row 7 - first half] - pmaddubsw m4, m7, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 359 * 16], m4 - - ; mode 24 [row 8 - first half] - pmaddubsw m4, m7, [r3 + 19 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 360 * 16], m4 - - ; mode 24 [row 9 - first half] - pmaddubsw m4, m7, [r3 + 14 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 361 * 16], m4 - 
- ; mode 24 [row 10 - first half] - pmaddubsw m4, m7, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 362 * 16], m4 - - ; mode 24 [row 11 - first half] - pmaddubsw m4, m7, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 363 * 16], m4 - - ; mode 24 [row 12 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 6 + 32], 1 - pinsrb m7, [r2 + 13 + 32], 0 - pmaddubsw m4, m7, [r3 + 31 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 364 * 16], m4 - - ; mode 24 [row 13 - first half] - pmaddubsw m4, m7, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 365 * 16], m4 - - ; mode 24 [row 14 - first half] - pmaddubsw m4, m7, [r3 + 21 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 366 * 16], m4 - - ; mode 24 [row 15 - first half] - pmaddubsw m4, m7, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 367 * 16], m4 - - ; mode 23 [row 3 - first half] - pslldq m7, m0, 2 - pinsrb m7, [r2 + 0], 1 - pinsrb m7, [r2 + 4 + 32], 0 - pmaddubsw m4, m7, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 339 * 16], m4 - - ; mode 23 [row 4 - first half] - pmaddubsw m4, m7, [r3 + 19 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 340 * 16], m4 - - ; mode 23 [row 5 - first half] - pmaddubsw m4, m7, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 341 * 16], m4 - - ; mode 23 [row 6 - first half] - pmaddubsw m4, m7, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 342 * 16], m4 - - ; mode 23 [row 7 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 4 + 32], 1 - pinsrb m7, [r2 + 7 + 32], 0 - pmaddubsw m4, m7, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 343 * 16], m4 - - ; mode 23 [row 8 - first half] - pmaddubsw m4, m7, [r3 + 15 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 344 * 16], m4 - - ; mode 23 [row 9 - first half] - pmaddubsw m4, m7, [r3 + 6 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 345 * 16], m4 - - ; mode 23 [row 10 - first half] - pslldq 
m7, 2 - pinsrb m7, [r2 + 7 + 32], 1 - pinsrb m7, [r2 + 11 + 32], 0 - pmaddubsw m4, m7, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 346 * 16], m4 - - ; mode 23 [row 11 - first half] - pmaddubsw m4, m7, [r3 + 20 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 347 * 16], m4 - - ; mode 23 [row 12 - first half] - pmaddubsw m4, m7, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 348 * 16], m4 - - ; mode 23 [row 13 - first half] - pmaddubsw m4, m7, [r3 + 2 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 349 * 16], m4 - - ; mode 23 [row 14 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 11 + 32], 1 - pinsrb m7, [r2 + 14 + 32], 0 - pmaddubsw m4, m7, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 350 * 16], m4 - - ; mode 23 [row 15 - first half] - pmaddubsw m4, m7, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 351 * 16], m4 - - ; mode 21 [row 15 - first half] - pmaddubsw m4, m0, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 319 * 16 + 8], m4 - ; mode 21 [row 15 - second half] end - - ; mode 20 [row 1 - first half] - pslldq m7, m0, 2 - pinsrb m7, [r2 + 0], 1 - pinsrb m7, [r2 + 2 + 32], 0 - pmaddubsw m4, m7, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 289 * 16], m4 - - ; mode 20 [row 2 - first half] - pmaddubsw m4, m7, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 290 * 16], m4 - - ; mode 21 [row 1 - first half] - pmaddubsw m4, m7, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 305 * 16], m4 - - ; mode 21 [row 2 - first half] - pmaddubsw m4, m7, [r3 + 13 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 306 * 16], m4 - - ; mode 22 [row 2 - first half] - pmaddubsw m4, m7, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 322 * 16], m4 - - ; mode 22 [row 3 - first half] - pmaddubsw m4, m7, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 323 * 16], m4 - - ; mode 22 [row 4 - first half] - 
pslldq m1, m7, 2 - pinsrb m1, [r2 + 2 + 32], 1 - pinsrb m1, [r2 + 5 + 32], 0 - pmaddubsw m4, m1, [r3 + 31 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 324 * 16], m4 - - ; mode 22 [row 5 - first half] - pmaddubsw m4, m1, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 325 * 16], m4 - - ; mode 22 [row 6 - first half] - pmaddubsw m4, m1, [r3 + 5 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 326 * 16], m4 - - ; mode 22 [row 7 - first half] - pslldq m1, 2 - pinsrb m1, [r2 + 5 + 32], 1 - pinsrb m1, [r2 + 7 + 32], 0 - pmaddubsw m4, m1, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 327 * 16], m4 - - ; mode 22 [row 8 - first half] - pmaddubsw m4, m1, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 328 * 16], m4 - - ; mode 22 [row 9 - first half] - pslldq m1, 2 - pinsrb m1, [r2 + 7 + 32], 1 - pinsrb m1, [r2 + 10 + 32], 0 - pmaddubsw m4, m1, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 329 * 16], m4 - - ; mode 22 [row 10 - first half] - pmaddubsw m4, m1, [r3 + 17 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 330 * 16], m4 - - ; mode 22 [row 11 - first half] - pmaddubsw m4, m1, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 331 * 16], m4 - - ; mode 22 [row 12 - first half] - pslldq m1, 2 - pinsrb m1, [r2 + 10 + 32], 1 - pinsrb m1, [r2 + 12 + 32], 0 - pmaddubsw m4, m1, [r3 + 23 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 332 * 16], m4 - - ; mode 22 [row 13 - first half] - pmaddubsw m4, m1, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 333 * 16], m4 - - ; mode 22 [row 14 - first half] - pslldq m1, 2 - pinsrb m1, [r2 + 12 + 32], 1 - pinsrb m1, [r2 + 15 + 32], 0 - pmaddubsw m4, m1, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 334 * 16], m4 - - ; mode 22 [row 15 - first half] - pmaddubsw m4, m1, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 335 * 16], m4 - - ; mode 21 [row 3 - first half] - pslldq m6, m7, 
2 - pinsrb m6, [r2 + 2 + 32], 1 - pinsrb m6, [r2 + 4 + 32], 0 - pmaddubsw m4, m6, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 307 * 16], m4 - - ; mode 21 [row 4 - first half] - pmaddubsw m4, m6, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 308 * 16], m4 - - ; mode 21 [row 5 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 4 + 32], 1 - pinsrb m6, [r2 + 6 + 32], 0 - pmaddubsw m4, m6, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 309 * 16], m4 - - ; mode 21 [row 6 - first half] - pmaddubsw m4, m6, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 310 * 16], m4 - - ; mode 21 [row 7 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 6 + 32], 1 - pinsrb m6, [r2 + 8 + 32], 0 - pmaddubsw m4, m6, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 311 * 16], m4 - - ; mode 21 [row 8 - first half] - pmaddubsw m4, m6, [r3 + 7 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 312 * 16], m4 - - ; mode 21 [row 9 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 8 + 32], 1 - pinsrb m6, [r2 + 9 + 32], 0 - pmaddubsw m4, m6, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 313 * 16], m4 - - ; mode 21 [row 10 - first half] - pmaddubsw m4, m6, [r3 + 5 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 314 * 16], m4 - - ; mode 21 [row 11 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 9 + 32], 1 - pinsrb m6, [r2 + 11 + 32], 0 - pmaddubsw m4, m6, [r3 + 20 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 315 * 16], m4 - - ; mode 21 [row 12 - first half] - pmaddubsw m4, m6, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 316 * 16], m4 - - ; mode 21 [row 13 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 11 + 32], 1 - pinsrb m6, [r2 + 13 + 32], 0 - pmaddubsw m4, m6, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 317 * 16], m4 - - ; mode 21 [row 14 - first half] - pmaddubsw m4, m6, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 318 * 16], 
m4 - - ; mode 21 [row 15 - first half] - pslldq m6, 2 - pinsrb m6, [r2 + 32 + 13], 1 - pinsrb m6, [r2 + 32 + 15], 0 - pmaddubsw m4, m6, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 319 * 16], m4 - - ; mode 20 [row 13 - second half] - pmaddubsw m4, m7, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 301 * 16 + 8], m4 - ; mode 20 [row 13 - second half] - - ; mode 20 [row 14 - second half] - pmaddubsw m4, m7, [r3 + 5 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 302 * 16 + 8], m4 - ; mode 20 [row 14 - second half] - - ; mode 20 [row 3 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 2], 1 - pinsrb m7, [r2 + 32 + 3], 0 - pmaddubsw m4, m7, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 291 * 16], m4 - - ; mode 20 [row 15 - second half] - pmaddubsw m4, m7, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 303 * 16 + 8], m4 - ; mode 20 [row 15 - second half] - - ; mode 20 [row 4 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 3], 1 - pinsrb m7, [r2 + 32 + 5], 0 - pmaddubsw m4, m7, [r3 + 23 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 292 * 16], m4 - - ; mode 20 [row 5 - first half] - pmaddubsw m4, m7, [r3 + 2 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 293 * 16], m4 - - ; mode 20 [row 6 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 5], 1 - pinsrb m7, [r2 + 32 + 6], 0 - pmaddubsw m4, m7, [r3 + 13 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 294 * 16], m4 - - ; mode 20 [row 7 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 6], 1 - pinsrb m7, [r2 + 32 + 8], 0 - pmaddubsw m4, m7, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 295 * 16], m4 - - ; mode 20 [row 8 - first half] - pmaddubsw m4, m7, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 296 * 16], m4 - - ; mode 20 [row 9 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 8], 1 - pinsrb m7, [r2 + 32 + 9], 0 - pmaddubsw m4, m7, [r3 + 14 * 16] - pmulhrsw m4, m3 - 
packuswb m4, m4 - movh [r0 + 297 * 16], m4 - - ; mode 20 [row 10 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 9], 1 - pinsrb m7, [r2 + 32 + 11], 0 - pmaddubsw m4, m7, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 298 * 16], m4 - - ; mode 20 [row 11 - first half] - pmaddubsw m4, m7, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 299 * 16], m4 - - ; mode 20 [row 12 - first half] - movu m1, [r3 + 15 * 16] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 11], 1 - pinsrb m7, [r2 + 32 + 12], 0 - pmaddubsw m4, m7, [r3 + 15 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 300 * 16], m4 - - ; mode 20 [row 13 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 12], 1 - pinsrb m7, [r2 + 32 + 14], 0 - pmaddubsw m4, m7, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 301 * 16], m4 - - ; mode 20 [row 14 - first half] - pmaddubsw m4, m7, [r3 + 5 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 302 * 16], m4 - - ; mode 20 [row 15 - first half] - pslldq m7, 2 - pinsrb m7, [r2 + 32 + 14], 1 - pinsrb m7, [r2 + 32 + 15], 0 - pmaddubsw m4, m7, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 303 * 16], m4 - - ; mode 19 [row 1] - pslldq m0, 2 - pinsrb m0, [r2], 1 - pinsrb m0, [r2 + 32 + 1], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 8], 1 - pinsrb m5, [r2 + 7], 0 - - ; mode 20 [row 1 - second half] - pmaddubsw m4, m5, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 289 * 16 + 8], m4 - ; mode 20 [row 1 - second half] end - - ; mode 20 [row 2 - second half] - pmaddubsw m4, m5, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 290 * 16 + 8], m4 - ; mode 20 [row 2 - second half] end - - ; mode 21 [row 2 - second half] - pmaddubsw m4, m5, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 305 * 16 + 8], m4 - ; mode 21 [row 2 - second half] end - - ; mode 21 [row 3 - second half] - pmaddubsw m4, m5, [r3 + 13 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 306 * 16 + 8], m4 - ; 
mode 21 [row 3 - second half] end - - ; mode 21 [row 4 - second half] - pmaddubsw m4, m5, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 307 * 16 + 8], m4 - ; mode 21 [row 4 - second half] end - - ; mode 22 [row 2 - second half] - pmaddubsw m4, m5, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 322 * 16 + 8], m4 - ; mode 22 [row 2 - second half] end - - ; mode 22 [row 3 - second half] - pmaddubsw m4, m5, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 323 * 16 + 8], m4 - ; mode 22 [row 3 - second half] end - - ; mode 23 [row 3 - second half] - pmaddubsw m4, m5, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 339 * 16 + 8], m4 - ; mode 23 [row 3 - second half] end - - ; mode 23 [row 4 - second half] - pmaddubsw m4, m5, [r3 + 19 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 340 * 16 + 8], m4 - ; mode 23 [row 4 - second half] end - - ; mode 23 [row 5 - second half] - pmaddubsw m4, m5, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 341 * 16 + 8], m4 - ; mode 23 [row 5 - second half] end - - ; mode 23 [row 6 - second half] - pmaddubsw m4, m5, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 342 * 16 + 8], m4 - ; mode 23 [row 6 - second half] end - - ; mode 24 [row 6 - second half] - pmaddubsw m4, m5, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 358 * 16 + 8], m4 - ; mode 24 [row 6 - second half] end - - ; mode 24 [row 7 - second half] - pmaddubsw m4, m5, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 359 * 16 + 8], m4 - ; mode 24 [row 7 - second half] end - - ; mode 24 [row 8 - second half] - pmaddubsw m4, m5, [r3 + 19 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 360 * 16 + 8], m4 - ; mode 24 [row 8 - second half] end - - ; mode 24 [row 9 - second half] - pmaddubsw m4, m5, [r3 + 14 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 361 * 16 + 8], m4 - ; mode 24 [row 9 - second half] end - - ; mode 24 [row 10 - second 
half] - pmaddubsw m4, m5, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 362 * 16 + 8], m4 - ; mode 24 [row 10 - second half] end - - ; mode 24 [row 11 - second half] - pmaddubsw m4, m5, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 363 * 16 + 8], m4 - ; mode 24 [row 11 - second half] end - - pmaddubsw m4, m0, [r3 + 12 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 12 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 273 * 16], m4 - - ; mode 19 [row 2] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 1], 1 - pinsrb m0, [r2 + 32 + 2], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 7], 1 - pinsrb m5, [r2 + 6], 0 - - ; mode 20 [row 3 - second half] - pmaddubsw m4, m5, [r3 + 12 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 291 * 16 + 8], m4 - ; mode 20 [row 3 - second half] end - - ; mode 21 [row 3 - second half] - pmaddubsw m4, m5, [r3 + 28 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 307 * 16 + 8], m4 - ; mode 21 [row 3 - second half] end - - ; mode 21 [row 4 - second half] - pmaddubsw m4, m5, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 308 * 16 + 8], m4 - ; mode 21 [row 4 - second half] end - - ; mode 22 [row 4 - second half] - pmaddubsw m4, m5, [r3 + 31 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 324 * 16 + 8], m4 - ; mode 22 [row 4 - second half] end - - ; mode 22 [row 5 - second half] - pmaddubsw m4, m5, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 325 * 16 + 8], m4 - ; mode 22 [row 5 - second half] end - - ; mode 22 [row 6 - second half] - pmaddubsw m4, m5, [r3 + 5 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 326 * 16 + 8], m4 - ; mode 22 [row 6 - second half] end - - ; mode 23 [row 7 - second half] - pmaddubsw m4, m5, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 343 * 16 + 8], m4 - ; mode 23 [row 7 - second half] end - - ; mode 23 [row 8 - second half] - pmaddubsw m4, m5, [r3 + 15 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 344 * 16 + 
8], m4 - ; mode 23 [row 8 - second half] end - - ; mode 23 [row 9 - second half] - pmaddubsw m4, m5, [r3 + 6 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 345 * 16 + 8], m4 - ; mode 23 [row 9 - second half] end - - ; mode 24 [row 12 - second half] - pmaddubsw m4, m5, [r3 + 31 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 364 * 16 + 8], m4 - ; mode 24 [row 12 - second half] end - - ; mode 24 [row 13 - second half] - pmaddubsw m4, m5, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 365 * 16 + 8], m4 - ; mode 24 [row 13 - second half] end - - ; mode 24 [row 14 - second half] - pmaddubsw m4, m5, [r3 + 21 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 366 * 16 + 8], m4 - ; mode 24 [row 14 - second half] end - - ; mode 24 [row 15 - second half] - pmaddubsw m4, m5, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 367 * 16 + 8], m4 - ; mode 24 [row 15 - second half] end - - pmaddubsw m4, m0, [r3 + 18 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 18 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 274 * 16], m4 - - ; mode 19 [row 3] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 2], 1 - pinsrb m0, [r2 + 32 + 4], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 6], 1 - pinsrb m5, [r2 + 5], 0 - - ; mode 20 [row 4 - second half] - pmaddubsw m4, m5, [r3 + 23 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 292 * 16 + 8], m4 - ; mode 20 [row 4 - second half] end - - ; mode 20 [row 5 - second half] - pmaddubsw m4, m5, [r3 + 2 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 293 * 16 + 8], m4 - ; mode 20 [row 5 - second half] end - - ; mode 21 [row 5 - second half] - pmaddubsw m4, m5, [r3 + 26 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 309 * 16 + 8], m4 - ; mode 21 [row 5 - second half] end - - ; mode 21 [row 6 - second half] - pmaddubsw m4, m5, [r3 + 9 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 310 * 16 + 8], m4 - ; mode 21 [row 6 - second half] end - - ; mode 22 [row 7 - second half] - pmaddubsw 
m4, m5, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 327 * 16 + 8], m4 - ; mode 22 [row 7 - second half] end - - ; mode 22 [row 8 - second half] - pmaddubsw m4, m5, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 328 * 16 + 8], m4 - ; mode 22 [row 7 - second half] end - - ; mode 23 [row 10 - second half] - pmaddubsw m4, m5, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 346 * 16 + 8], m4 - ; mode 23 [row 10 - second half] end - - ; mode 23 [row 11 - second half] - pmaddubsw m4, m5, [r3 + 20 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 347 * 16 + 8], m4 - ; mode 23 [row 11 - second half] end - - ; mode 23 [row 12 - second half] - pmaddubsw m4, m5, [r3 + 11 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 348 * 16 + 8], m4 - ; mode 23 [row 12 - second half] end - - ; mode 23 [row 13 - second half] - pmaddubsw m4, m5, [r3 + 2 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 349 * 16 + 8], m4 - ; mode 23 [row 13 - second half] end - - pmaddubsw m4, m0, [r3 + 24 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 24 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 275 * 16], m4 - - ; mode 19 [row 4] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 4], 1 - pinsrb m0, [r2 + 32 + 5], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 5], 1 - pinsrb m5, [r2 + 4], 0 - - ; mode 20 [row 6 - second half] - pmaddubsw m4, m5, [r3 + 13 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 294 * 16 + 8], m4 - ; mode 20 [row 6 - second half] end - - ; mode 21 [row 7 - second half] - pmaddubsw m4, m5, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 311 * 16 + 8], m4 - ; mode 21 [row 7 - second half] end - - ; mode 21 [row 8 - second half] - pmaddubsw m4, m5, [r3 + 7 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 312 * 16 + 8], m4 - ; mode 21 [row 8 - second half] end - - ; mode 22 [row 9 - second half] - pmaddubsw m4, m5, [r3 + 30 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 329 * 16 + 8], m4 - ; 
mode 22 [row 9 - second half] end - - ; mode 22 [row 10 - second half] - pmaddubsw m4, m5, [r3 + 17 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 330 * 16 + 8], m4 - ; mode 22 [row 10 - second half] end - - ; mode 22 [row 11 - second half] - pmaddubsw m4, m5, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 331 * 16 + 8], m4 - ; mode 22 [row 11 - second half] end - - ; mode 23 [row 14 - second half] - pmaddubsw m4, m5, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 350 * 16 + 8], m4 - ; mode 23 [row 14 - second half] end - - ; mode 23 [row 15 - second half] - pmaddubsw m4, m5, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 351 * 16 + 8], m4 - - ; mode 23 [row 15 - second half] end - pmaddubsw m4, m0, [r3 + 30 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 30 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 276 * 16], m4 - - ; mode 19 [row 5] - pmaddubsw m4, m0, [r3 + 4 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 4 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 277 * 16], m4 - - ; mode 19 [row 6] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 5], 1 - pinsrb m0, [r2 + 32 + 6], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 4], 1 - pinsrb m5, [r2 + 3], 0 - - ; mode 20 [row 7 - second half] - pmaddubsw m4, m5, [r3 + 24 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 295 * 16 + 8], m4 - ; mode 20 [row 7 - second half] end - - ; mode 20 [row 8 - second half] - pmaddubsw m4, m5, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 296 * 16 + 8], m4 - ; mode 20 [row 8 - second half] end - - ; mode 21 [row 9 - second half] - pmaddubsw m4, m5, [r3 + 22 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 313 * 16 + 8], m4 - ; mode 21 [row 9 - second half] end - - ; mode 21 [row 10 - second half] - pmaddubsw m4, m5, [r3 + 5 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 314 * 16 + 8], m4 - ; mode 21 [row 10 - second half] end - - ; mode 22 [row 12 - second half] - pmaddubsw m4, m5, [r3 + 
23 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 332 * 16 + 8], m4 - ; mode 22 [row 12 - second half] end - - ; mode 22 [row 12 - second half] - pmaddubsw m4, m5, [r3 + 10 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 333 * 16 + 8], m4 - ; mode 22 [row 12 - second half] end - - pmaddubsw m4, m0, [r3 + 10 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 10 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 278 * 16], m4 - - ; mode 19 [row 7] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 6], 1 - pinsrb m0, [r2 + 32 + 7], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 3], 1 - pinsrb m5, [r2 + 2], 0 - - ; mode 20 [row 9 - second half] - pmaddubsw m4, m5, [r3 + 14 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 297 * 16 + 8], m4 - ; mode 20 [row 9 - second half] - - ; mode 21 [row 11 - second half] - pmaddubsw m4, m5, [r3 + 20 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 315 * 16 + 8], m4 - ; mode 21 [row 11 - second half] end - - ; mode 21 [row 12 - second half] - pmaddubsw m4, m5, [r3 + 3 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 316 * 16 + 8], m4 - ; mode 21 [row 12 - second half] end - - ; mode 22 [row 14 - second half] - pmaddubsw m4, m5, [r3 + 29 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 334 * 16 + 8], m4 - ; mode 22 [row 14 - second half] end - - ; mode 22 [row 15 - second half] - pmaddubsw m4, m5, [r3 + 16 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 335 * 16 + 8], m4 - ; mode 22 [row 15 - second half] end - - pmaddubsw m4, m0, [r3 + 16 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 16 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 279 * 16], m4 - - ; mode 19 [row 8] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 7], 1 - pinsrb m0, [r2 + 32 + 9], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 2], 1 - pinsrb m5, [r2 + 1], 0 - - ; mode 20 [row 10 - second half] - pmaddubsw m4, m5, [r3 + 25 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 298 * 16 + 8], m4 - ; mode 20 [row 10 - second half] end - - ; mode 20 
[row 11 - second half] - pmaddubsw m4, m5, [r3 + 4 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 299 * 16 + 8], m4 - ; mode 20 [row 11 - second half] end - - ; mode 21 [row 13 - second half] - pmaddubsw m4, m5, [r3 + 18 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 317 * 16 + 8], m4 - ; mode 21 [row 13 - second half] end - - ; mode 21 [row 14 - second half] - pmaddubsw m4, m5, [r3 + 1 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 318 * 16 + 8], m4 - ; mode 21 [row 14 - second half] end - - pmaddubsw m4, m0, [r3 + 22 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 22 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 280 * 16], m4 - - ; mode 19 [row 9] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 9], 1 - pinsrb m0, [r2 + 32 + 10], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 1], 1 - pinsrb m5, [r2 + 0], 0 - - ; mode 20 [row 12 - second half] - pmaddubsw m4, m5, [r3 + 15 * 16] - pmulhrsw m4, m3 - packuswb m4, m4 - movh [r0 + 300 * 16 + 8], m4 - - ; mode 20 [row 12 - second half] end - pmaddubsw m4, m0, [r3 + 28 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 28 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 281 * 16], m4 - - ; mode 19 [row 10] - pmaddubsw m4, m0, [r3 + 2 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 2 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 282 * 16], m4 - - ; mode 19 [row 11] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 10], 1 - pinsrb m0, [r2 + 32 + 11], 0 - pmaddubsw m4, m0, [r3 + 8 * 16] - pmulhrsw m4, m3 - pslldq m5, 2 - pinsrb m5, [r2], 1 - pinsrb m5, [r2 + 32 + 1], 0 - pmaddubsw m6, m5, [r3 + 8 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 283 * 16], m4 - - ; mode 19 [row 12] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 11], 1 - pinsrb m0, [r2 + 32 + 12], 0 - pslldq m5, 2 - pinsrb m5, [r2 + 32 + 1], 1 - pinsrb m5, [r2 + 32 + 2], 0 - pmaddubsw m4, m0, [r3 + 14 * 16] - pmulhrsw m4, m3 - pmaddubsw m6, m5, [r3 + 14 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 284 * 16], m4 - - ; mode 19 [row 
13] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 12], 1 - pinsrb m0, [r2 + 32 + 14], 0 - pmaddubsw m4, m0, [r3 + 20 * 16] - pmulhrsw m4, m3 - pslldq m5, 2 - pinsrb m5, [r2 + 32 + 2], 1 - pinsrb m5, [r2 + 32 + 4], 0 - pmaddubsw m6, m5, [r3 + 20 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 285 * 16], m4 - - ; mode 19 [row 14] - pslldq m0, 2 - pinsrb m0, [r2 + 32 + 14], 1 - pinsrb m0, [r2 + 32 + 15], 0 - pmaddubsw m4, m0, [r3 + 26 * 16] - pmulhrsw m4, m3 - pslldq m5, 2 - pinsrb m5, [r2 + 32 + 4], 1 - pinsrb m5, [r2 + 32 + 5], 0 - pmaddubsw m6, m5, [r3 + 26 * 16] - pmulhrsw m6, m3 - packuswb m4, m6 - movu [r0 + 286 * 16], m4 - - ; mode 19 [row 15] - movu m0, [r2 + 32] - pshufb m0, [tab_S1] - movu [r0 + 287 * 16], m0 - movd m1, [r2] - movd [r0 + 287 * 16 + 12], m1 - - ; mode 25 - movu m1, [r1] - - ; mode 26 [all rows] - psrldq m6, m1, 1 - pinsrb m6, [r1 + 16], 15 - movu m7, m6 - movu [r0 + 384 * 16], m6 - movu [r0 + 385 * 16], m6 - movu [r0 + 386 * 16], m6 - movu [r0 + 387 * 16], m6 - movu [r0 + 388 * 16], m6 - movu [r0 + 389 * 16], m6 - movu [r0 + 390 * 16], m6 - movu [r0 + 391 * 16], m6 - movu [r0 + 392 * 16], m6 - movu [r0 + 393 * 16], m6 - movu [r0 + 394 * 16], m6 - movu [r0 + 395 * 16], m6 - movu [r0 + 396 * 16], m6 - movu [r0 + 397 * 16], m6 - movu [r0 + 398 * 16], m6 - movu [r0 + 399 * 16], m6 - - pxor m0, m0 - pshufb m6, m6, m0 - punpcklbw m6, m0 - pinsrb m2, [r1], 0 - pshufb m2, m2, m0 - punpcklbw m2, m0 - movu m4, [r1 + 1 + 32] - punpcklbw m5, m4, m0 - punpckhbw m4, m0 - psubw m5, m2 - psubw m4, m2 - psraw m5, 1 - psraw m4, 1 - paddw m5, m6 - paddw m4, m6 - packuswb m5, m4 - - pextrb [r0 + 384 * 16], m5, 0 - pextrb [r0 + 385 * 16], m5, 1 - pextrb [r0 + 386 * 16], m5, 2 - pextrb [r0 + 387 * 16], m5, 3 - pextrb [r0 + 388 * 16], m5, 4 - pextrb [r0 + 389 * 16], m5, 5 - pextrb [r0 + 390 * 16], m5, 6 - pextrb [r0 + 391 * 16], m5, 7 - pextrb [r0 + 392 * 16], m5, 8 - pextrb [r0 + 393 * 16], m5, 9 - pextrb [r0 + 394 * 16], m5, 10 - pextrb [r0 + 395 * 16], m5, 11 - 
pextrb [r0 + 396 * 16], m5, 12 - pextrb [r0 + 397 * 16], m5, 13 - pextrb [r0 + 398 * 16], m5, 14 - pextrb [r0 + 399 * 16], m5, 15 - - ; mode 25 [row 15] - movu [r0 + 383 * 16], m1 - - ; mode 25 [row 0] - psrldq m2, m1, 1 - punpcklbw m1, m2 - movu m2, [r1 + 8] - psrldq m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m4, m1, [r3 + 30 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 30 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 368 * 16], m4 - - ; mode 25 [row 1] - pmaddubsw m4, m1, [r3 + 28 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 28 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 369 * 16], m4 - - ; mode 25 [row 2] - pmaddubsw m4, m1, [r3 + 26 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 26 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 370 * 16], m4 - - ; mode 25 [row 3] - pmaddubsw m4, m1, [r3 + 24 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 24 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 371 * 16], m4 - - ; mode 25 [row 4] - pmaddubsw m4, m1, [r3 + 22 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 22 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 372 * 16], m4 - - ; mode 25 [row 5] - pmaddubsw m4, m1, [r3 + 20 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 20 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 373 * 16], m4 - - ; mode 25 [row 6] - pmaddubsw m4, m1, [r3 + 18 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 18 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 374 * 16], m4 - - ; mode 25 [row 7] - pmaddubsw m4, m1, [r3 + 16 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 375 * 16], m4 - - ; mode 25 [row 8] - pmaddubsw m4, m1, [r3 + 14 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 14 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 376 * 16], m4 - - ; mode 25 [row 9] - pmaddubsw m4, m1, [r3 + 12 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 12 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 377 * 
16], m4 - - ; mode 25 [row 10] - pmaddubsw m4, m1, [r3 + 10 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 10 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 378 * 16], m4 - - ; mode 25 [row 11] - pmaddubsw m4, m1, [r3 + 8 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 379 * 16], m4 - - ; mode 25 [row 12] - pmaddubsw m4, m1, [r3 + 6 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 380 * 16], m4 - - ; mode 25 [row 13] - pmaddubsw m4, m1, [r3 + 4 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 4 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 381 * 16], m4 - - ; mode 25 [row 14] - pmaddubsw m4, m1, [r3 + 2 * 16] - pmulhrsw m4, m3 - pmaddubsw m5, m2, [r3 + 2 * 16] - pmulhrsw m5, m3 - packuswb m4, m5 - movu [r0 + 382 * 16], m4 - - ; mode 27 [row 15] - psrldq m6, m7, 1 - punpcklbw m7, m6 - pinsrb m6, [r1 + 17], 15 - movu [r0 + 415 * 16], m6 - - ; mode 27 [row 0] - movu m4, [r1 + 9] - psrldq m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m6, m7, [r3 + 2 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 2 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 400 * 16], m6 - - ; mode 27 [row 1] - pmaddubsw m6, m7, [r3 + 4 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 4 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 401 * 16], m6 - - ; mode 27 [row 2] - pmaddubsw m6, m7, [r3 + 6 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 402 * 16], m6 - - ; mode 27 [row 3] - pmaddubsw m6, m7, [r3 + 8 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 403 * 16], m6 - - ; mode 27 [row 4] - pmaddubsw m6, m7, [r3 + 10 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 10 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 404 * 16], m6 - - ; mode 27 [row 5] - pmaddubsw m6, m7, [r3 + 12 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 
+ 12 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 405 * 16], m6 - - ; mode 27 [row 6] - pmaddubsw m6, m7, [r3 + 14 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 14 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 406 * 16], m6 - - ; mode 27 [row 7] - pmaddubsw m6, m7, [r3 + 16 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 407 * 16], m6 - - ; mode 27 [row 8] - pmaddubsw m6, m7, [r3 + 18 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 18 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 408 * 16], m6 - - ; mode 27 [row 9] - pmaddubsw m6, m7, [r3 + 20 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 20 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 409 * 16], m6 - - ; mode 27 [row 10] - pmaddubsw m6, m7, [r3 + 22 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 22 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 410 * 16], m6 - - ; mode 27 [row 11] - pmaddubsw m6, m7, [r3 + 24 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 24 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 411 * 16], m6 - - ; mode 27 [row 12] - pmaddubsw m6, m7, [r3 + 26 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 26 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 412 * 16], m6 - - ; mode 27 [row 13] - pmaddubsw m6, m7, [r3 + 28 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 28 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 413 * 16], m6 - - ; mode 27 [row 14] - pmaddubsw m6, m7, [r3 + 30 * 16] - pmulhrsw m6, m3 - pmaddubsw m5, m4, [r3 + 30 * 16] - pmulhrsw m5, m3 - packuswb m6, m5 - movu [r0 + 414 * 16], m6 - - ; mode 28 [row 0] - movu m1, [r2 + 1] - psrldq m2, m1, 1 - punpcklbw m1, m2 - movu m4, [r2 + 9] - psrldq m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m2, m1, [r3 + 5 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 5 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 416 * 16], m2 - - ; mode 28 [row 0] - pmaddubsw m2, m1, [r3 + 5 * 16] - pmulhrsw m2, m3 - 
pmaddubsw m5, m4, [r3 + 5 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 416 * 16], m2 - - ; mode 28 [row 1] - pmaddubsw m2, m1, [r3 + 10 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 10 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 417 * 16], m2 - - ; mode 28 [row 2] - pmaddubsw m2, m1, [r3 + 15 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 15 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 418 * 16], m2 - - ; mode 28 [row 3] - pmaddubsw m2, m1, [r3 + 20 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 20 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 419 * 16], m2 - - ; mode 28 [row 4] - pmaddubsw m2, m1, [r3 + 25 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 25 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 420 * 16], m2 - - ; mode 28 [row 5] - pmaddubsw m2, m1, [r3 + 30 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 30 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 421 * 16], m2 - - ; mode 29 [row 0] - pmaddubsw m2, m1, [r3 + 9 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 9 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 432 * 16], m2 - - ; mode 29 [row 1] - pmaddubsw m2, m1, [r3 + 18 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 18 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 433 * 16], m2 - - ; mode 29 [row 2] - pmaddubsw m2, m1, [r3 + 27 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 27 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 434 * 16], m2 - - ; mode 30 [row 0] - pmaddubsw m2, m1, [r3 + 13 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 13 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 448 * 16], m2 - - ; mode 30 [row 1] - pmaddubsw m2, m1, [r3 + 26 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 26 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 449 * 16], m2 - - ; mode 33 [row 0] - movu [r0 + 496 * 16], m2 - - ; mode 31 [row 0] - pmaddubsw m2, m1, [r3 + 17 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 17 * 16] - pmulhrsw m5, m3 
- packuswb m2, m5 - movu [r0 + 464 * 16], m2 - - ; mode 32 [row 0] - pmaddubsw m2, m1, [r3 + 21 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 21 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 480 * 16], m2 - - ; mode 28 [row 6] - movd m7, [r2 + 9] - palignr m7, m1, 2 - pmaddubsw m2, m7, [r3 + 3 * 16] - pmulhrsw m2, m3 - movd m6, [r2 + 17] - palignr m6, m4, 2 - pmaddubsw m5, m6, [r3 + 3 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 422 * 16], m2 - - ; mode 28 [row 7] - pmaddubsw m2, m7, [r3 + 8 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 423 * 16], m2 - - ; mode 28 [row 8] - pmaddubsw m2, m7, [r3 + 13 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 13 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 424 * 16], m2 - - ; mode 28 [row 9] - pmaddubsw m2, m7, [r3 + 18 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 18 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 425 * 16], m2 - - ; mode 28 [row 10] - pmaddubsw m2, m7, [r3 + 23 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 23 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 426 * 16], m2 - - ; mode 29 [row 3] - pmaddubsw m2, m7, [r3 + 4 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 4 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 435 * 16], m2 - - ; mode 29 [row 4] - pmaddubsw m2, m7, [r3 + 13 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 13 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 436 * 16], m2 - - ; mode 29 [row 5] - pmaddubsw m2, m7, [r3 + 22 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 22 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 437 * 16], m2 - - ; mode 29 [row 6] - pmaddubsw m2, m7, [r3 + 31 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 31 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 438 * 16], m2 - - ; mode 32 [row 2] - movu [r0 + 482 * 16], m2 - - ; mode 30 [row 2] - pmaddubsw m2, m7, [r3 + 7 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 
7 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 450 * 16], m2 - - ; mode 30 [row 3] - pmaddubsw m2, m7, [r3 + 20 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 20 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 451 * 16], m2 - - ; mode 33 [row 1] - movu [r0 + 497 * 16], m2 - - ; mode 31 [row 1] - pmaddubsw m2, m7, [r3 + 2 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 2 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 465 * 16], m2 - - ; mode 31 [row 2] - pmaddubsw m2, m7, [r3 + 19 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 19 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 466 * 16], m2 - - ; mode 32 [row 1] - pmaddubsw m2, m7, [r3 + 10 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 10 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 481 * 16], m2 - - ; mode 28 [row 11] - pmaddubsw m2, m7, [r3 + 28 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 28 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 427 * 16], m2 - - ; mode 28 [row 12] - movd m1, [r2 + 10] - palignr m1, m7, 2 - pmaddubsw m2, m1, [r3 + 1 * 16] - pmulhrsw m2, m3 - movd m4, [r2 + 18] - palignr m4, m6, 2 - pmaddubsw m5, m4, [r3 + 1 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 428 * 16], m2 - - ; mode 30 [row 4] - movu [r0 + 452 * 16], m2 - - ; mode 28 [row 13] - pmaddubsw m2, m1, [r3 + 6 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 429 * 16], m2 - - ; mode 28 [row 14] - pmaddubsw m2, m1, [r3 + 11 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 11 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 430 * 16], m2 - - ; mode 28 [row 15] - pmaddubsw m2, m1, [r3 + 16 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 431 * 16], m2 - - ; mode 29 [row 7] - pmaddubsw m2, m1, [r3 + 8 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 439 * 16], m2 - - ; mode 29 [row 8] 
- pmaddubsw m2, m1, [r3 + 17 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 17 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 440 * 16], m2 - - ; mode 29 [row 9] - pmaddubsw m2, m1, [r3 + 26 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 26 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 441 * 16], m2 - - ; mode 30 [row 5] - pmaddubsw m2, m1, [r3 + 14 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 14 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 453 * 16], m2 - - ; mode 33 [row 2] - movu [r0 + 498 * 16], m2 - - ; mode 30 [row 6] - pmaddubsw m2, m1, [r3 + 27 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 27 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 454 * 16], m2 - - ; mode 31 [row 3] - pmaddubsw m2, m1, [r3 + 4 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 4 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 467 * 16], m2 - - ; mode 31 [row 4] - pmaddubsw m2, m1, [r3 + 21 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 21 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 468 * 16], m2 - - ; mode 32 [row 3] - pmaddubsw m2, m1, [r3 + 20 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 20 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 483 * 16], m2 - - ; mode 29 [row 10] - movd m7, [r2 + 11] - palignr m7, m1, 2 - pmaddubsw m2, m7, [r3 + 3 * 16] - pmulhrsw m2, m3 - movd m6, [r2 + 19] - palignr m6, m4, 2 - pmaddubsw m5, m6, [r3 + 3 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 442 * 16], m2 - - ; mode 29 [row 11] - pmaddubsw m2, m7, [r3 + 12 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 12 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 443 * 16], m2 - - ; mode 29 [row 12] - pmaddubsw m2, m7, [r3 + 21 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 21 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 444 * 16], m2 - - ; mode 30 [row 8] - movu [r0 + 456 * 16], m2 - - ; mode 29 [row 13] - pmaddubsw m2, m7, [r3 + 30 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 30 * 16] - 
pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 445 * 16], m2 - - ; mode 32 [row 5] - movu [r0 + 485 * 16], m2 - - ; mode 30 [row 7] - pmaddubsw m2, m7, [r3 + 8 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 455 * 16], m2 - - ; mode 33 [row 3] - movu [r0 + 499 * 16], m2 - - ; mode 31 [row 5] - pmaddubsw m2, m7, [r3 + 6 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 469 * 16], m2 - - ; mode 31 [row 6] - pmaddubsw m2, m7, [r3 + 23 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 23 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 470 * 16], m2 - - ; mode 32 [row 4] - pmaddubsw m2, m7, [r3 + 9 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 9 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 484 * 16], m2 - - movu m1, m7 - movu m4, m6 - - ; mode 29 [row 14] - movu m1, [r2 + 12] - palignr m1, m7, 2 - pmaddubsw m2, m1, [r3 + 7 * 16] - pmulhrsw m2, m3 - movd m4, [r2 + 20] - palignr m4, m6, 2 - pmaddubsw m5, m4, [r3 + 7 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 446 * 16], m2 - - ; mode 29 [row 15] - pmaddubsw m2, m1, [r3 + 16 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 447 * 16], m2 - - ; mode 30 [row 9] - pmaddubsw m2, m1, [r3 + 2 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 2 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 457 * 16], m2 - - ; mode 33 [row 4] - movu [r0 + 500 * 16], m2 - - ; mode 30 [row 10] - pmaddubsw m2, m1, [r3 + 15 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 15 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 458 * 16], m2 - - ; mode 30 [row 11] - pmaddubsw m2, m1, [r3 + 28 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 28 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 459 * 16], m2 - - ; mode 33 [row 5] - movu [r0 + 501 * 16], m2 - - ; mode 31 [row 7] - pmaddubsw m2, m1, [r3 + 8 * 16] - pmulhrsw m2, m3 - 
pmaddubsw m5, m4, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 471 * 16], m2 - - ; mode 31 [row 8] - pmaddubsw m2, m1, [r3 + 25 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 25 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 472 * 16], m2 - - ; mode 32 [row 6] - pmaddubsw m2, m1, [r3 + 19 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 19 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 486 * 16], m2 - - ; mode 30 [row 12] - movd m7, [r2 + 13] - palignr m7, m1, 2 - pmaddubsw m2, m7, [r3 + 9 * 16] - pmulhrsw m2, m3 - movd m6, [r2 + 21] - palignr m6, m4, 2 - pmaddubsw m5, m6, [r3 + 9 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 460 * 16], m2 - - ; mode 30 [row 13] - pmaddubsw m2, m7, [r3 + 22 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 22 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 461 * 16], m2 - - ; mode 33 [row 6] - movu [r0 + 502 * 16], m2 - - ; mode 31 [row 9] - pmaddubsw m2, m7, [r3 + 10 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 10 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 473 * 16], m2 - - ; mode 31 [row 10] - pmaddubsw m2, m7, [r3 + 27 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 27 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 474 * 16], m2 - - ; mode 32 [row 7] - pmaddubsw m2, m7, [r3 + 8 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 8 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 487 * 16], m2 - - ; mode 32 [row 8] - pmaddubsw m2, m7, [r3 + 29 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 29 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 488 * 16], m2 - - - movu m1, m7 - movu m4, m6 - - ; mode 30 [row 14] - movd m1, [r2 + 14] - palignr m1, m7, 2 - pmaddubsw m2, m1, [r3 + 3 * 16] - pmulhrsw m2, m3 - movd m4, [r2 + 22] - palignr m4, m6, 2 - pmaddubsw m5, m4, [r3 + 3 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 462 * 16], m2 - - ; mode 30 [row 15] - pmaddubsw m2, m1, [r3 + 16 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 16 
* 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 463 * 16], m2 - - ; mode 33 [row 7] - movu [r0 + 503 * 16], m2 - - ; mode 31 [row 11] - pmaddubsw m2, m1, [r3 + 12 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 12 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 475 * 16], m2 - - ; mode 31 [row 12] - pmaddubsw m2, m1, [r3 + 29 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 29 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 476 * 16], m2 - - ; mode 32 [row 9] - pmaddubsw m2, m1, [r3 + 18 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 18 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 489 * 16], m2 - - ; mode 31 [row 13] - movd m7, [r2 + 15] - palignr m7, m1, 2 - pmaddubsw m2, m7, [r3 + 14 * 16] - pmulhrsw m2, m3 - movd m6, [r2 + 23] - palignr m6, m4, 2 - pmaddubsw m5, m6, [r3 + 14 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 477 * 16], m2 - - ; mode 31 [row 14] - pmaddubsw m2, m7, [r3 + 31 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 31 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 478 * 16], m2 - - ; mode 32 [row 10] - pmaddubsw m2, m7, [r3 + 7 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 7 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 490 * 16], m2 - - ; mode 32 [row 11] - pmaddubsw m2, m7, [r3 + 28 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 28 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 491 * 16], m2 - - ; mode 33 [row 8] - pmaddubsw m2, m7, [r3 + 10 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 10 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 504 * 16], m2 - - ; mode 31 [row 15] - movd m1, [r2 + 16] - palignr m1, m7, 2 - pmaddubsw m2, m1, [r3 + 16 * 16] - pmulhrsw m2, m3 - movd m4, [r2 + 24] - palignr m4, m6, 2 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 479 * 16], m2 - - ; mode 32 [row 12] - pmaddubsw m2, m1, [r3 + 17 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 17 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu 
[r0 + 492 * 16], m2 - - ; mode 33 [row 9] - pmaddubsw m2, m1, [r3 + 4 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 4 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 505 * 16], m2 - - ; mode 33 [row 10] - pmaddubsw m2, m1, [r3 + 30 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 30 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 506 * 16], m2 - - ; mode 33 [row 10] - pmaddubsw m2, m1, [r3 + 4 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 4 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 505 * 16], m2 - - ; mode 32 [row 13] - movd m7, [r2 + 17] - palignr m7, m1, 2 - pmaddubsw m2, m7, [r3 + 6 * 16] - pmulhrsw m2, m3 - - movd m6, [r2 + 25] - palignr m6, m4, 2 - pmaddubsw m5, m6, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 493 * 16], m2 - - ; mode 32 [row 14] - pmaddubsw m2, m7, [r3 + 27 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 27 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 494 * 16], m2 - - ; mode 33 [row 11] - pmaddubsw m2, m7, [r3 + 24 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m6, [r3 + 24 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 507 * 16], m2 - - ; mode 32 [row 15] - movd m1, [r2 + 18] - palignr m1, m7, 2 - pmaddubsw m2, m1, [r3 + 16 * 16] - pmulhrsw m2, m3 - psrldq m4, 2 - pinsrb m4, [r2 + 26], 14 - pinsrb m4, [r2 + 27], 15 - movd m4, [r2 + 26] - palignr m4, m6, 2 - pmaddubsw m5, m4, [r3 + 16 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 495 * 16], m2 - - ; mode 33 [row 12] - pmaddubsw m2, m1, [r3 + 18 * 16] - pmulhrsw m2, m3 - pmaddubsw m5, m4, [r3 + 18 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 508 * 16], m2 - - ; mode 33 [row 13] - movd m7, [r2 + 19] - palignr m7, m1, 2 - pmaddubsw m2, m7, [r3 + 12 * 16] - pmulhrsw m2, m3 - movd m6, [r2 + 27] - palignr m6, m4, 2 - pmaddubsw m5, m6, [r3 + 12 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 509 * 16], m2 - - ; mode 33 [row 14] - movd m1, [r2 + 20] - palignr m1, m7, 2 - pmaddubsw m2, m1, [r3 + 6 * 16] - 
pmulhrsw m2, m3 - movd m4, [r2 + 28] - palignr m4, m6, 2 - pmaddubsw m5, m4, [r3 + 6 * 16] - pmulhrsw m5, m3 - packuswb m2, m5 - movu [r0 + 510 * 16], m2 - - ; mode 34 [row 0] - movu m1, [r2 + 2] - movu [r0 + 512 * 16], m1 - movu m2, [r2 + 18] - palignr m3, m2, m1, 1 - movu [r0 + 513 * 16], m3 - palignr m3, m2, m1, 2 - movu [r0 + 514 * 16], m3 - palignr m3, m2, m1, 3 - movu [r0 + 515 * 16], m3 - palignr m3, m2, m1, 4 - movu [r0 + 516 * 16], m3 - palignr m3, m2, m1, 5 - movu [r0 + 517 * 16], m3 - palignr m3, m2, m1, 6 - movu [r0 + 518 * 16], m3 - palignr m3, m2, m1, 7 - movu [r0 + 519 * 16], m3 - palignr m3, m2, m1, 8 - movu [r0 + 520 * 16], m3 - palignr m3, m2, m1, 9 - movu [r0 + 521 * 16], m3 - palignr m3, m2, m1, 10 - movu [r0 + 522 * 16], m3 - palignr m3, m2, m1, 11 - movu [r0 + 523 * 16], m3 - palignr m3, m2, m1, 12 - movu [r0 + 524 * 16], m3 - - ; mode 33 [row 15] - movu [r0 + 511 * 16], m3 - - ; mode 34 - palignr m3, m2, m1, 13 - movu [r0 + 525 * 16], m3 - palignr m3, m2, m1, 14 - movu [r0 + 526 * 16], m3 - palignr m3, m2, m1, 15 - movu [r0 + 527 * 16], m3 - RET - -;-------------------------------------------------------------------------------- -; void all_angs_pred_32x32(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma) -;-------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal all_angs_pred_32x32, 3,7,8, 0-4 - mov r6d, [r1 + 64] - mov r3d, [r1] - mov [rsp], r6d - mov [r1 + 64], r3b - mov r3d, [r2] - mov r6d, [r2 + 64] - mov [r2 + 64], r3b - - lea r3, [r2] - lea r4, [r2 + 64] - lea r2, [r1 + 64] - - ;mode 2[row 0] - movu m0, [r4 + 2] - movu [r0 + 0 * 16], m0 - movu m1, [r4 + 18] - movu [r0 + 1 * 16], m1 - - ;mode 9 [row 15] - movu [r0 + 478 * 16], m0 - movu [r0 + 479 * 16], m1 - - ;mode 2[row 1] - movu m2, [r4 + 34] - palignr m3, m1, m0, 1 - movu [r0 + 2 * 16], m3 - palignr m4, m2, m1, 1 - movu [r0 + 3 * 16], m4 - - ; mode 9 [row 31] - movu [r0 + 510 * 16], m3 - movu [r0 + 511 * 16], m4 - - ;mode 2[row 
17] - movu [r0 + 34 * 16], m4 - movu m5, [r4 + 35] - movu [r0 + 35 * 16], m5 - - ;mode 2[row 2] - palignr m3, m1, m0, 2 - movu [r0 + 4 * 16], m3 - palignr m4, m2, m1, 2 - movu [r0 + 5 * 16], m4 - - ;mode 2[row 18] - movu [r0 + 36 * 16], m4 - movu m6, [r4 + 51] - palignr m7, m6, m5, 1 - movu [r0 + 37 * 16], m7 - - ;mode 2[row 3] - palignr m3, m1, m0, 3 - movu [r0 + 6 * 16], m3 - palignr m4, m2, m1, 3 - movu [r0 + 7 * 16], m4 - - ;mode 2[row 19] - movu [r0 + 38 * 16], m4 - palignr m7, m6, m5, 2 - movu [r0 + 39 * 16], m7 - - ;mode 2[row 4] - palignr m3, m1, m0, 4 - movu [r0 + 8 * 16], m3 - palignr m4, m2, m1, 4 - movu [r0 + 9 * 16], m4 - - ; mode 8 [row 31] - movu [r0 + 446 * 16], m3 - movu [r0 + 447 * 16], m4 - - ;mode 2[row 20] - movu [r0 + 40 * 16], m4 - palignr m7, m6, m5, 3 - movu [r0 + 41 * 16], m7 - - ; mode 4 [row 31] - movu [r0 + 190 * 16], m4 - movu [r0 + 191 * 16], m7 - - ;mode 2[row 5] - palignr m3, m1, m0, 5 - movu [r0 + 10 * 16], m3 - palignr m4, m2, m1, 5 - movu [r0 + 11 * 16], m4 - - ;mode 2[row 21] - movu [r0 + 42 * 16], m4 - palignr m7, m6, m5, 4 - movu [r0 + 43 * 16], m7 - - ;mode 2[row 6] - palignr m3, m1, m0, 6 - movu [r0 + 12 * 16], m3 - palignr m4, m2, m1, 6 - movu [r0 + 13 * 16], m4 - - ;mode 2[row 22] - movu [r0 + 44 * 16], m4 - palignr m7, m6, m5, 5 - movu [r0 + 45 * 16], m7 - - ;mode 2[row 7] - palignr m3, m1, m0, 7 - movu [r0 + 14 * 16], m3 - palignr m4, m2, m1, 7 - movu [r0 + 15 * 16], m4 - - ;mode 2[row 23] - movu [r0 + 46 * 16], m4 - palignr m7, m6, m5, 6 - movu [r0 + 47 * 16], m7 - - ;mode 2[row 8] - palignr m3, m1, m0, 8 - movu [r0 + 16 * 16], m3 - palignr m4, m2, m1, 8 - movu [r0 + 17 * 16], m4 - - ;mode 7[row 31] - movu [r0 + 382 * 16], m3 - movu [r0 + 383 * 16], m4 - - ;mode 2[row 24] - movu [r0 + 48 * 16], m4 - palignr m7, m6, m5, 7 - movu [r0 + 49 * 16], m7 - - ;mode 2[row 9] - palignr m3, m1, m0, 9 - movu [r0 + 18 * 16], m3 - palignr m4, m2, m1, 9 - movu [r0 + 19 * 16], m4 - - ;mode 2[row 25] - movu [r0 + 50 * 16], m4 - palignr 
m7, m6, m5, 8 - movu [r0 + 51 * 16], m7 - - ; mode 3 [row 31] - movu [r0 + 126 * 16], m4 - movu [r0 + 127 * 16], m7 - - ;mode 2[row 10] - palignr m3, m1, m0, 10 - movu [r0 + 20 * 16], m3 - palignr m4, m2, m1, 10 - movu [r0 + 21 * 16], m4 - - ;mode 2[row 26] - movu [r0 + 52 * 16], m4 - palignr m7, m6, m5, 9 - movu [r0 + 53 * 16], m7 - - ;mode 2[row 11] - palignr m3, m1, m0, 11 - movu [r0 + 22 * 16], m3 - palignr m4, m2, m1, 11 - movu [r0 + 23 * 16], m4 - - ;mode 2[row 27] - movu [r0 + 54 * 16], m4 - palignr m7, m6, m5, 10 - movu [r0 + 55 * 16], m7 - - ;mode 2[row 12] - palignr m3, m1, m0, 12 - movu [r0 + 24 * 16], m3 - palignr m4, m2, m1, 12 - movu [r0 + 25 * 16], m4 - - ; mode 6 [row 31] - movu [r0 + 318 * 16], m3 - movu [r0 + 319 * 16], m4 - - ; mode 3 [row 15] - movu [r0 + 94 * 16], m3 - movu [r0 + 95 * 16], m4 - - ;mode 2[row 28] - movu [r0 + 56 * 16], m4 - palignr m7, m6, m5, 11 - movu [r0 + 57 * 16], m7 - - ;mode 2[row 13] - palignr m3, m1, m0, 13 - movu [r0 + 26 * 16], m3 - palignr m4, m2, m1, 13 - movu [r0 + 27 * 16], m4 - - ;mode 2[row 29] - movu [r0 + 58 * 16], m4 - palignr m7, m6, m5, 12 - movu [r0 + 59 * 16], m7 - - ;mode 2[row 14] - palignr m3, m1, m0, 14 - movu [r0 + 28 * 16], m3 - palignr m4, m2, m1, 14 - movu [r0 + 29 * 16], m4 - - ;mode 2[row 30] - movu [r0 + 60 * 16], m4 - palignr m7, m6, m5, 13 - movu [r0 + 61 * 16], m7 - - ;mode 2[row 15] - palignr m3, m1, m0, 15 - movu [r0 + 30 * 16], m3 - palignr m4, m2, m1, 15 - movu [r0 + 31 * 16], m4 - - ;mode 2[row 31] - movu [r0 + 62 * 16], m4 - palignr m7, m6, m5, 14 - movu [r0 + 63 * 16], m7 - - ;mode 2[row 16] - movu [r0 + 32 * 16], m1 - movu [r0 + 33 * 16], m2 - - ; mode 5[row 31] - movu [r0 + 254 * 16], m1 - movu [r0 + 255 * 16], m2 - - ; mode 3 [row 0] - lea r5, [ang_table] - movu m6, [r5 + 26 * 16] - movu m7, [pw_1024 ] - movu m1, [r4 + 1 ] - punpcklbw m1, m0 - pmaddubsw m0, m1, m6 - pmulhrsw m0, m7 - movu m2, [r4 + 9] - movd m3, [r4 + 10] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, 
m6 - pmulhrsw m3, m7 - packuswb m0, m3 - movu [r0 + 64 * 16], m0 - - ; mode 6 [row 1 - first half] - movu [r0 + 258 * 16], m0 - - ; mode 9 [row 12 - first half] - movu [r0 + 472 * 16], m0 - - movu m0, [r4 + 17] - movd m3, [r4 + 18] - palignr m3, m0, 1 - punpcklbw m0, m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 25] - movd m5, [r4 + 26] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 65 * 16], m3 - - ; mode 6 [row 1 - second half] - movu [r0 + 259 * 16], m3 - - ; mode 9 [row 12 - second half] - movu [r0 + 473 * 16], m3 - - ; mode 4 [row 0] - movu m6, [r5 + 21 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 128 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 129 * 16], m3 - - ; mode 5 [row 0] - movu m6, [r5 + 17 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 192 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 193 * 16], m3 - - ; mode 6 [row 0] - movu m6, [r5 + 13 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 256 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 257 * 16], m3 - - ; mode 7 [row 0] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 320 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 321 * 16], m3 - - ; mode 7 [row 1] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 322 * 16], m3 - - ; mode 9 [row 8 
- first half] - movu [r0 + 464 * 16], m3 - - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 323 * 16], m3 - - ; mode 9 [row 8 - second half] - movu [r0 + 465 * 16], m3 - - ; mode 7 [row 2] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 324 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 325 * 16], m3 - - ; mode 8 [row 0] - movu m6, [r5 + 5 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 384 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 385 * 16], m3 - - ; mode 8 [row 1] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 386 * 16], m3 - - ; mode 9 [row 4 - first half] - movu [r0 + 456 * 16], m3 - - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 387 * 16], m3 - - ; mode 9 [row 4 - second half] - movu [r0 + 457 * 16], m3 - - ; mode 8 [row 2] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 388 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 389 * 16], m3 - - ; mode 8 [row 3] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 390 * 16], m3 - - ; mode 9 [row 9 - first half] - movu [r0 + 466 * 16], m3 - - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 391 * 16], m3 - - ; mode 9 [row 9 - second half] - movu [r0 + 467 * 16], m3 - - ; mode 8 
[row 4] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 392 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 393 * 16], m3 - - ; mode 8 [row 5] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 394 * 16], m3 - - ; mode 9 [row 14 - first half] - movu [r0 + 476 * 16], m3 - - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 395 * 16], m3 - - ; mode 9 [row 14 - second half] - movu [r0 + 477 * 16], m3 - - ; mode 9 [row 0] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 448 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 449 * 16], m3 - - ; mode 9 [row 1] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 450 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 451 * 16], m3 - - ; mode 9 [row 2] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 452 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 453 * 16], m3 - - ; mode 9 [row 3] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 454 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 455 * 16], m3 - - ; mode 9 [row 5] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw 
m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 458 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 459 * 16], m3 - - ; mode 9 [row 6] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 460 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 461 * 16], m3 - - ; mode 9 [row 7] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 462 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 463 * 16], m3 - - ; mode 9 [row 10] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 468 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 469 * 16], m3 - - ; mode 9 [row 11] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 470 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 471 * 16], m3 - - ; mode 9 [row 13] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 474 * 16], m3 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 475 * 16], m3 - - ; mode 3 [row 1] - movu m6, [r5 + 20 * 16] - movu m0, [r4 + 2] - movd m1, [r4 + 3] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 10] - movd m3, [r4 + 11] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 
- pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 66 * 16], m1 - - ; mode 6 [row 3 - first half] - movu [r0 + 262 * 16], m1 - - ; mode 9 [row 25 - first half] - movu [r0 + 498 * 16], m1 - - movu m1, [r4 + 18] - movd m3, [r4 + 19] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 26] - movd m5, [r4 + 27] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 67 * 16], m3 - - ; mode 6 [row 3 - second half] - movu [r0 + 263 * 16], m3 - - ; mode 9 [row 25 - second half] - movu [r0 + 499 * 16], m3 - - ; mode 4 [row 1] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 130 * 16], m3 - - ; mode 9 [row 20 - first half] - movu [r0 + 488 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 131 * 16], m3 - - ; mode 9 [row 20 - second half] - movu [r0 + 489 * 16], m3 - - ; mode 4 [row 2] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 132 * 16], m3 - - ; mode 7 [row 6 - first half] - movu [r0 + 332 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 133 * 16], m3 - - ; mode 7 [row 6 - second half] - movu [r0 + 333 * 16], m3 - - ; mode 5 [row 1] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 194 * 16], m3 - - ; mode 5 [row 1 - first half] - movu [r0 + 480 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 195 * 16], m3 - - ; mode 5 [row 1 - second half] - movu [r0 + 481 * 16], m3 - - ; mode 5 [row 2] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 196 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 197 * 16], m3 - - ; mode 6 [row 2] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 260 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 261 * 16], m3 - - ; mode 7 [row 3] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 326 * 16], m3 - - ; mode 9 [row 17 - first half] - movu [r0 + 482 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 327 * 16], m3 - - ; mode 9 [row 17 - second half] - movu [r0 + 483 * 16], m3 - - ; mode 7 [row 4] - movu m6, [r5 + 13 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 328 * 16], m3 - - ; mode 8 [row 8 - first half] - movu [r0 + 400 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 329 * 16], m3 - - ; mode 8 [row 8 - second half] - movu [r0 + 401 * 16], m3 - - ; mode 7 [row 5] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 330 * 16], m3 - - ; mode 9 [row 26 - first half] - movu [r0 + 500 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 331 * 16], m3 - - ; mode 9 [row 26 - second half] - movu [r0 + 501 * 16], m3 - - ; mode 8 [row 6] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 396 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, 
m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 397 * 16], m3 - - ; mode 9 [row 18] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 484 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 485 * 16], m3 - - ; mode 9 [row 21] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 490 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 491 * 16], m3 - - ; mode 9 [row 22] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 492 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 493 * 16], m3 - - ; mode 9 [row 23] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 494 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 495 * 16], m3 - - ; mode 9 [row 27] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 502 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 503 * 16], m3 - - ; mode 9 [row 28] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 504 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 505 * 16], m3 - - ; mode 9 [row 30] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 508 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 509 * 16], m3 - - ; mode 8 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 398 * 16], m3 - - ; mode 9 [row 19 - first half] - movu [r0 + 486 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 399 * 16], m3 - - ; mode 9 [row 19 - second half] - movu [r0 + 487 * 16], m3 - - ; mode 8 [row 9] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 402 * 16], m3 - - ; mode 9 [row 24 - first half] - movu [r0 + 496 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 403 * 16], m3 - - ; mode 9 [row 24 - second half] - movu [r0 + 497 * 16], m3 - - ; mode 8 [row 10] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 404 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 405 * 16], m3 - - ; mode 8 [row 11] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 406 * 16], m3 - - ; mode 9 [row 29 - first half] - movu [r0 + 506 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 407 * 16], m3 - - ; mode 9 [row 29 - second half] - movu [r0 + 507 * 16], m3 - - ; mode 3 [row 2] - movu m6, [r5 + 14 * 16] - movu m0, [r4 + 3] - movd m1, [r4 + 4] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 11] - movd m3, [r4 + 12] - palignr m3, 
m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 68 * 16], m1 - - ; mode 3 [row 2 - first half] - movu [r0 + 266 * 16], m1 - - movu m1, [r4 + 19] - movd m3, [r4 + 20] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 27] - movd m5, [r4 + 28] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 69 * 16], m3 - - ; mode 3 [row 2 - second half] - movu [r0 + 267 * 16], m3 - - ; mode 4 [row 3] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 134 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 135 * 16], m3 - - ; mode 5 [row 3] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 198 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 199 * 16], m3 - - ; mode 5 [row 4] - movu m6, [r5 + 21 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 200 * 16], m3 - - ; mode 8 [row 16 - first half] - movu [r0 + 416 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 201 * 16], m3 - - ; mode 8 [row 16 - second half] - movu [r0 + 417 * 16], m3 - - ; mode 6 [row 4] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 264 * 16], m3 - - ; mode 6 [row 4 - first half] - movu [r0 + 408 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 265 * 16], m3 - - ; mode 6 [row 4 - second half] - movu [r0 + 409 * 16], m3 - - ; mode 6 [row 6] - 
movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 268 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 269 * 16], m3 - - ; mode 7 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 334 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 335 * 16], m3 - - ; mode 7 [row 8] - movu m6, [r5 + 17 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 336 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 337 * 16], m3 - - ; mode 7 [row 9] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 338 * 16], m3 - - ; mode 8 [row 17 - first half] - movu [r0 + 418 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 339 * 16], m3 - - ; mode 8 [row 17 - second half] - movu [r0 + 419 * 16], m3 - - ; mode 8 [row 13] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 410 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 411 * 16], m3 - - ; mode 8 [row 14] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 412 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 413 * 16], m3 - - ; mode 8 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 
- pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 414 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 415 * 16], m3 - - ; mode 8 [row 18] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 420 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 421 * 16], m3 - - ; mode 3 [row 3] - movu m6, [r5 + 8 * 16] - movu m0, [r4 + 4] - movd m1, [r4 + 5] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 12] - movd m3, [r4 + 13] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 70 * 16], m1 - - ; mode 6 [row 7 - first half] - movu [r0 + 270 * 16], m1 - - movu m1, [r4 + 20] - movd m3, [r4 + 21] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 28] - movd m5, [r4 + 29] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 71 * 16], m3 - - ; mode 6 [row 7 - second half] - movu [r0 + 271 * 16], m3 - - ; mode 4 [row 4] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 136 * 16], m3 - - ; mode 4 [row 4 - first half] - movu [r0 + 424 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 137 * 16], m3 - - ; mode 4 [row 4 - second half] - movu [r0 + 425 * 16], m3 - - ; mode 4 [row 5] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 138 * 16], m3 - - ; mode 7 [row 13 - first half] - movu [r0 + 346 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw 
m5, m7 - packuswb m3, m5 - movu [r0 + 139 * 16], m3 - - ; mode 7 [row 13 - second half] - movu [r0 + 347 * 16], m3 - - ; mode 5 [row 5] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 202 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 203 * 16], m3 - - ; mode 5 [row 6] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 204 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 205 * 16], m3 - - ; mode 6 [row 8] - movu m6, [r5 + 21 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 272 * 16], m3 - - ; mode 7 [row 12 - first half] - movu [r0 + 344 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 273 * 16], m3 - - ; mode 7 [row 12 - second half] - movu [r0 + 345 * 16], m3 - - ; mode 7 [row 10] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 340 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 341 * 16], m3 - - ; mode 7 [row 11] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 342 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 343 * 16], m3 - - ; mode 8 [row 19] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 422 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, 
m7 - packuswb m3, m5 - movu [r0 + 423 * 16], m3 - - ; mode 8 [row 21] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 426 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 427 * 16], m3 - - ; mode 8 [row 22] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 428 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 429 * 16], m3 - - ; mode 8 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 430 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 431 * 16], m3 - - ; mode 8 [row 24] - movu m6, [r5 + 29 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 432 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 433 * 16], m3 - - ; mode 3 [row 4] - movu m6, [r5 + 2 * 16] - movu m0, [r4 + 5] - movd m1, [r4 + 6] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 13] - movd m3, [r4 + 14] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 72 * 16], m1 - - ; mode 3 [row 4 - first half] - movu [r0 + 274 * 16], m1 - - ; mode 8 [row 25 - first half] - movu [r0 + 434 * 16], m1 - - movu m1, [r4 + 21] - movd m3, [r4 + 22] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 29] - movd m5, [r4 + 30] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 73 * 16], 
m3 - - ; mode 3 [row 4 - second half] - movu [r0 + 275 * 16], m3 - - ; mode 8 [row 25 - second half] - movu [r0 + 435 * 16], m3 - - ; mode 3 [row 5] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 74 * 16], m3 - - ; mode 3 [row 5 - first half] - movu [r0 + 278 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 75 * 16], m3 - - ; mode 3 [row 5 - second half] - movu [r0 + 279 * 16], m3 - - ; mode 4 [row 6] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 140 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 141 * 16], m3 - - ; mode 5 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 206 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 207 * 16], m3 - - ; mode 5 [row 8] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 208 * 16], m3 - - ; mode 7 [row 16 - first half] - movu [r0 + 352 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 209 * 16], m3 - - ; mode 7 [row 16 - second half] - movu [r0 + 353 * 16], m3 - - ; mode 6 [row 10] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 276 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 277 * 16], m3 - - ; mode 7 [row 14] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 348 * 16], m3 - - ; mode 8 [row 26 - first half] - movu [r0 + 436 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 349 * 16], m3 - - ; mode 8 [row 26 - second half] - movu [r0 + 437 * 16], m3 - - ; mode 7 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 350 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 351 * 16], m3 - - ; mode 8 [row 27] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 438 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 439 * 16], m3 - - ; mode 8 [row 28] - movu m6, [r5 + 17 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 440 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 441 * 16], m3 - - ; mode 8 [row 29] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 442 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 443 * 16], m3 - - ; mode 8 [row 30] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 444 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 445 * 16], m3 - - ; mode 3 [row 6] - movu m6, [r5 + 22 * 16] - movu m0, [r4 + 6] - movd m1, [r4 + 7] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu 
m2, [r4 + 14] - movd m3, [r4 + 15] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 76 * 16], m1 - - ; mode 6 [row 13 - first half] - movu [r0 + 282 * 16], m1 - - movu m1, [r4 + 22] - movd m3, [r4 + 23] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 30] - movd m5, [r4 + 31] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 77 * 16], m3 - - ; mode 6 [row 13 - second half] - movu [r0 + 283 * 16], m3 - - ; mode 4 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 142 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 143 * 16], m3 - - ; mode 4 [row 8] - movu m6, [r5 + 29 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 144 * 16], m3 - - ; mode 4 [row 8 - first half] - movu [r0 + 360 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 145 * 16], m3 - - ; mode 4 [row 8 - second half] - movu [r0 + 361 * 16], m3 - - ; mode 5 [row 9] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 210 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 211 * 16], m3 - - ; mode 5 [row 10] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 212 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 213 * 16], m3 - - ; mode 7 [row 17] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - 
pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 354 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 355 * 16], m3 - - ; mode 7 [row 18] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 356 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 357 * 16], m3 - - ; mode 7 [row 19] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 358 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 359 * 16], m3 - - ; mode 6 [row 12] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 280 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 281 * 16], m3 - - ; mode 3 [row 7] - movu m6, [r5 + 16 * 16] - movu m0, [r4 + 7] - movd m1, [r4 + 8] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 15] - movd m3, [r4 + 16] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 78 * 16], m1 - - ; mode 6 [row 15 - first half] - movu [r0 + 286 * 16], m1 - - movu m1, [r4 + 23] - movd m3, [r4 + 24] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 31] - movd m5, [r4 + 32] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 79 * 16], m3 - - ; mode 6 [row 15 - second half] - movu [r0 + 287 * 16], m3 - - ; mode 4 [row 9] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - 
packuswb m3, m5 - movu [r0 + 146 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 147 * 16], m3 - - ; mode 5 [row 11] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 214 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 215 * 16], m3 - - ; mode 5 [row 12] - movu m6, [r5 + 29 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 216 * 16], m3 - - ; mode 6 [row 16 - first half] - movu [r0 + 288 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 217 * 16], m3 - - ; mode 6 [row 16 - second half] - movu [r0 + 289 * 16], m3 - - ; mode 6 [row 14] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 284 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 285 * 16], m3 - - ; mode 7 [row 21] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 362 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 363 * 16], m3 - - ; mode 7 [row 22] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 364 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 365 * 16], m3 - - ; mode 7 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 366 * 16], m3 - pmaddubsw m3, m1, 
m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 367 * 16], m3 - - ; mode 3 [row 8] - movu m6, [r5 + 10 * 16] - movu m0, [r4 + 8] - movd m1, [r4 + 9] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 16] - movd m3, [r4 + 17] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 80 * 16], m1 - - ; mode 7 [row 25 - first half] - movu [r0 + 290 * 16], m1 - - ; mode 6 [row 17 - first half] - movu [r0 + 370 * 16], m1 - - movu m1, [r4 + 24] - movd m3, [r4 + 25] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 32] - movd m5, [r4 + 33] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 81 * 16], m3 - - ; mode 7 [row 25 - second half] - movu [r0 + 291 * 16], m3 - - ; mode 6 [row 17 - second half] - movu [r0 + 371 * 16], m3 - - ; mode 4 [row 10] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 148 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 149 * 16], m3 - - ; mode 4 [row 11] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 150 * 16], m3 - - ; mode 7 [row 27 - first half] - movu [r0 + 374 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 151 * 16], m3 - - ; mode 7 [row 27 - second half] - movu [r0 + 375 * 16], m3 - - ; mode 5 [row 13] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 218 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 
219 * 16], m3 - - ; mode 5 [row 14] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 220 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 221 * 16], m3 - - ; mode 6 [row 18] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 292 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 293 * 16], m3 - - ; mode 7 [row 24] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 368 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 369 * 16], m3 - - ; mode 7 [row 26] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 372 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 373 * 16], m3 - - ; mode 3 [row 9] - movu m6, [r5 + 4 * 16] - movu m0, [r4 + 9] - movd m1, [r4 + 10] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 17] - movd m3, [r4 + 18] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 82 * 16], m1 - - ; mode 6 [row 19 - first half] - movu [r0 + 294 * 16], m1 - - movu m1, [r4 + 25] - movd m3, [r4 + 26] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 33] - movd m5, [r4 + 34] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 83 * 16], m3 - - ; mode 6 [row 19 - second half] - movu [r0 + 295 * 16], m3 - - ; mode 4 [row 12] - movu 
m6, [r5 + 17 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 152 * 16], m3 - - ; mode 4 [row 12 - first half] - movu [r0 + 296 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 153 * 16], m3 - - ; mode 4 [row 12 - second half] - movu [r0 + 297 * 16], m3 - - ; mode 3 [row 10] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 84 * 16], m3 - - ; mode 6 [row 21 - first half] - movu [r0 + 298 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 85 * 16], m3 - - ; mode 6 [row 21 - second half] - movu [r0 + 299 * 16], m3 - - ; mode 5 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 222 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 223 * 16], m3 - - ; mode 7 [row 28] - movu m6, [r5 + 5 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 376 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 377 * 16], m3 - - ; mode 7 [row 29] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 378 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 379 * 16], m3 - - ; mode 7 [row 30] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 380 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb 
m3, m5 - movu [r0 + 381 * 16], m3 - - ; mode 3 [row 11] - movu m6, [r5 + 24 * 16] - movu m0, [r4 + 10] - movd m1, [r4 + 11] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 18] - movd m3, [r4 + 19] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 86 * 16], m1 - - ; mode 6 [row 23 - first half] - movu [r0 + 302 * 16], m1 - - movu m1, [r4 + 26] - movd m3, [r4 + 27] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 34] - movd m5, [r4 + 35] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 87 * 16], m3 - - ; mode 6 [row 23 - second half] - movu [r0 + 303 * 16], m3 - - ; mode 4 [row 13] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 154 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 155 * 16], m3 - - ; mode 4 [row 14] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 156 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 157 * 16], m3 - - ; mode 5 [row 16] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 224 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 225 * 16], m3 - - ; mode 5 [row 17] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 226 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 227 * 16], m3 - - ; 
mode 6 [row 22] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 300 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 301 * 16], m3 - - ; mode 3 [row 12] - movu m6, [r5 + 18 * 16] - movu m0, [r4 + 11] - movd m1, [r4 + 12] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 19] - movd m3, [r4 + 20] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 88 * 16], m1 - - ; mode 6 [row 25 - first half] - movu [r0 + 306 * 16], m1 - - movu m1, [r4 + 27] - movd m3, [r4 + 28] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 35] - movd m5, [r4 + 36] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 89 * 16], m3 - - ; mode 6 [row 25 - second half] - movu [r0 + 307 * 16], m3 - - ; mode 4 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 158 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 159 * 16], m3 - - ; mode 5 [row 18] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 228 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 229 * 16], m3 - - ; mode 5 [row 19] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 230 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 231 * 16], m3 - - ; mode 6 [row 24] - movu m6, [r5 + 5 * 16] 
- pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 304 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 305 * 16], m3 - - ; mode 6 [row 26] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 308 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 309 * 16], m3 - - ; mode 3 [row 13] - movu m6, [r5 + 12 * 16] - movu m0, [r4 + 12] - movd m1, [r4 + 13] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 20] - movd m3, [r4 + 21] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 90 * 16], m1 - - movu m1, [r4 + 28] - movd m3, [r4 + 29] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 36] - movd m5, [r4 + 37] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 91 * 16], m3 - - ; mode 4 [row 16] - movu m6, [r5 + 5 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 160 * 16], m3 - - ; mode 5 [row 20 - first half] - movu [r0 + 232 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 161 * 16], m3 - - ; mode 5 [row 20 - second half] - movu [r0 + 233 * 16], m3 - - ; mode 4 [row 17] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 162 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 163 * 16], m3 - - ; mode 5 [row 21] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, 
m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 234 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 235 * 16], m3 - - ; mode 6 [row 27] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 310 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 311 * 16], m3 - - ; mode 6 [row 28] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 312 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 313 * 16], m3 - - ; mode 3 [row 14] - movu m6, [r5 + 6 * 16] - movu m0, [r4 + 13] - movd m1, [r4 + 14] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 21] - movd m3, [r4 + 22] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 92 * 16], m1 - - ; mode 6 [row 29 - first half] - movu [r0 + 314 * 16], m1 - - movu m1, [r4 + 29] - movd m3, [r4 + 30] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 37] - movd m5, [r4 + 38] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 93 * 16], m3 - - ; mode 6 [row 29 - second half] - movu [r0 + 315 * 16], m3 - - ; mode 4 [row 18] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 164 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 165 * 16], m3 - - ; mode 5 [row 22] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, 
m7 - packuswb m3, m5 - movu [r0 + 236 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 237 * 16], m3 - - ; mode 5 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 238 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 239 * 16], m3 - - ; mode 6 [row 30] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 316 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 317 * 16], m3 - - ; mode 3 [row 16] - movu m6, [r5 + 26 * 16] - movu m0, [r4 + 14] - movd m1, [r4 + 15] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 22] - movd m3, [r4 + 23] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 96 * 16], m1 - - ; mode 5 [row 25 - first half] - movu [r0 + 242 * 16], m1 - - movu m1, [r4 + 30] - movd m3, [r4 + 31] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 38] - movd m5, [r4 + 39] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 97 * 16], m3 - - ; mode 5 [row 25 - second half] - movu [r0 + 243 * 16], m3 - - ; mode 4 [row 19] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 166 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 167 * 16], m3 - - ; mode 4 [row 20] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 168 * 
16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 169 * 16], m3 - - ; mode 5 [row 24] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 240 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 241 * 16], m3 - - ; mode 3 [row 17] - movu m6, [r5 + 20 * 16] - movu m0, [r4 + 15] - movd m1, [r4 + 16] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 23] - movd m3, [r4 + 24] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 98 * 16], m1 - - movu m1, [r4 + 31] - movd m3, [r4 + 32] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 39] - movd m5, [r4 + 40] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 99 * 16], m3 - - ; mode 4 [row 21] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 170 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 171 * 16], m3 - - ; mode 5 [row 26] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 244 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 245 * 16], m3 - - ; mode 5 [row 27] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 246 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 247 * 16], m3 - - ; mode 3 [row 18] - movu 
m6, [r5 + 14 * 16] - movu m0, [r4 + 16] - movd m1, [r4 + 17] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 24] - movd m3, [r4 + 25] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 100 * 16], m1 - - movu m1, [r4 + 32] - movd m3, [r4 + 33] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 40] - movd m5, [r4 + 41] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 101 * 16], m3 - - ; mode 4 [row 22] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 172 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 173 * 16], m3 - - ; mode 4 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 174 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 175 * 16], m3 - - ; mode 5 [row 28] - movu m6, [r5 + 13 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 248 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 249 * 16], m3 - - ; mode 5 [row 29] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 250 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 251 * 16], m3 - - ; mode 3 [row 19] - movu m6, [r5 + 8 * 16] - movu m0, [r4 + 17] - movd m1, [r4 + 18] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 25] - 
movd m3, [r4 + 26] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 102 * 16], m1 - - movu m1, [r4 + 33] - movd m3, [r4 + 34] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 41] - movd m5, [r4 + 42] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 103 * 16], m3 - - ; mode 4 [row 24] - movu m6, [r5 + 13 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 176 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 177 * 16], m3 - - ; mode 5 [row 30] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 252 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 253 * 16], m3 - - ; mode 3 [row 20] - movu m6, [r5 + 2 * 16] - movu m0, [r4 + 18] - movd m1, [r4 + 19] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 26] - movd m3, [r4 + 27] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 104 * 16], m1 - - movu m1, [r4 + 34] - movd m3, [r4 + 35] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 42] - movd m5, [r4 + 43] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 105 * 16], m3 - - ; mode 4 [row 25] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 178 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 179 * 16], m3 - - ; mode 4 [row 26] - movu m6, [r5 + 23 * 16] - 
pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 180 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 181 * 16], m3 - - ; mode 3 [row 21] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 106 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 107 * 16], m3 - - ; mode 3 [row 22] - movu m6, [r5 + 22 * 16] - movu m0, [r4 + 19] - movd m1, [r4 + 20] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 27] - movd m3, [r4 + 28] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 108 * 16], m1 - - movu m1, [r4 + 35] - movd m3, [r4 + 36] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 43] - movd m5, [r4 + 44] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 109 * 16], m3 - - ; mode 4 [row 27] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 182 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 183 * 16], m3 - - ; mode 3 [row 23] - movu m6, [r5 + 16 * 16] - movu m0, [r4 + 20] - movd m1, [r4 + 21] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 28] - movd m3, [r4 + 29] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 110 * 16], m1 - - movu m1, [r4 + 36] - movd m3, [r4 + 37] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 44] - movd m5, [r4 + 45] - palignr 
m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 111 * 16], m3 - - ; mode 4 [row 28] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 184 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 185 * 16], m3 - - ; mode 4 [row 29] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 186 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 187 * 16], m3 - - ; mode 3 [row 24] - movu m6, [r5 + 10 * 16] - movu m0, [r4 + 21] - movd m1, [r4 + 22] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 29] - movd m3, [r4 + 30] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 112 * 16], m1 - - movu m1, [r4 + 37] - movd m3, [r4 + 38] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 45] - movd m5, [r4 + 46] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 113 * 16], m3 - - ; mode 4 [row 30] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 188 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 189 * 16], m3 - - ; mode 3 [row 25] - movu m6, [r5 + 4 * 16] - movu m0, [r4 + 22] - movd m1, [r4 + 23] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 30] - movd m3, [r4 + 31] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 114 * 16], m1 - - movu m1, 
[r4 + 38] - movd m3, [r4 + 39] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 46] - movd m5, [r4 + 47] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 115 * 16], m3 - - ; mode 3 [row 26] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 116 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 117 * 16], m3 - - ; mode 3 [row 27] - movu m6, [r5 + 24 * 16] - movu m0, [r4 + 23] - movd m1, [r4 + 24] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 31] - movd m3, [r4 + 32] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 118 * 16], m1 - - movu m1, [r4 + 39] - movd m3, [r4 + 40] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 47] - movd m5, [r4 + 48] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 119 * 16], m3 - - ; mode 3 [row 28] - movu m6, [r5 + 18 * 16] - movu m0, [r4 + 24] - movd m1, [r4 + 25] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 32] - movd m3, [r4 + 33] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 120 * 16], m1 - - movu m1, [r4 + 40] - movd m3, [r4 + 41] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 48] - movd m5, [r4 + 49] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 121 * 16], m3 - - ; mode 3 [row 29] - movu m6, [r5 + 12 * 16] - movu m0, [r4 + 25] - movd m1, [r4 + 26] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw 
m1, m7 - movu m2, [r4 + 33] - movd m3, [r4 + 34] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 122 * 16], m1 - - movu m1, [r4 + 41] - movd m3, [r4 + 42] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 49] - movd m5, [r4 + 50] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 123 * 16], m3 - - ; mode 3 [row 30] - movu m6, [r5 + 6 * 16] - movu m0, [r4 + 26] - movd m1, [r4 + 27] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r4 + 34] - movd m3, [r4 + 35] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 124 * 16], m1 - - movu m1, [r4 + 42] - movd m3, [r4 + 43] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r4 + 50] - movd m5, [r4 + 51] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 125 * 16], m3 - - ; mode 10 - movu m1, [r2 + 1] - movu m2, [r2 + 17] - movu [r0 + 512 * 16], m1 - movu [r0 + 513 * 16], m2 - movu [r0 + 514 * 16], m1 - movu [r0 + 515 * 16], m2 - movu [r0 + 516 * 16], m1 - movu [r0 + 517 * 16], m2 - movu [r0 + 518 * 16], m1 - movu [r0 + 519 * 16], m2 - movu [r0 + 520 * 16], m1 - movu [r0 + 521 * 16], m2 - movu [r0 + 522 * 16], m1 - movu [r0 + 523 * 16], m2 - movu [r0 + 524 * 16], m1 - movu [r0 + 525 * 16], m2 - movu [r0 + 526 * 16], m1 - movu [r0 + 527 * 16], m2 - - movu [r0 + 528 * 16], m1 - movu [r0 + 529 * 16], m2 - movu [r0 + 530 * 16], m1 - movu [r0 + 531 * 16], m2 - movu [r0 + 532 * 16], m1 - movu [r0 + 533 * 16], m2 - movu [r0 + 534 * 16], m1 - movu [r0 + 535 * 16], m2 - movu [r0 + 536 * 16], m1 - movu [r0 + 537 * 16], m2 - movu [r0 + 538 * 16], m1 - movu [r0 + 539 * 16], m2 - movu [r0 + 540 * 16], m1 - movu [r0 + 541 * 16], m2 - movu [r0 + 542 * 16], m1 - movu [r0 + 
543 * 16], m2 - - movu [r0 + 544 * 16], m1 - movu [r0 + 545 * 16], m2 - movu [r0 + 546 * 16], m1 - movu [r0 + 547 * 16], m2 - movu [r0 + 548 * 16], m1 - movu [r0 + 549 * 16], m2 - movu [r0 + 550 * 16], m1 - movu [r0 + 551 * 16], m2 - movu [r0 + 552 * 16], m1 - movu [r0 + 553 * 16], m2 - movu [r0 + 554 * 16], m1 - movu [r0 + 555 * 16], m2 - movu [r0 + 556 * 16], m1 - movu [r0 + 557 * 16], m2 - movu [r0 + 558 * 16], m1 - movu [r0 + 559 * 16], m2 - - movu [r0 + 560 * 16], m1 - movu [r0 + 561 * 16], m2 - movu [r0 + 562 * 16], m1 - movu [r0 + 563 * 16], m2 - movu [r0 + 564 * 16], m1 - movu [r0 + 565 * 16], m2 - movu [r0 + 566 * 16], m1 - movu [r0 + 567 * 16], m2 - movu [r0 + 568 * 16], m1 - movu [r0 + 569 * 16], m2 - movu [r0 + 570 * 16], m1 - movu [r0 + 571 * 16], m2 - movu [r0 + 572 * 16], m1 - movu [r0 + 573 * 16], m2 - movu [r0 + 574 * 16], m1 - movu [r0 + 575 * 16], m2 - - ; mode 11 [row 0] - movu m0, [r4] - - ; mode 11 [row 15 - first half] - movu [r0 + 606 * 16], m0 - - movu [r0 + 606 * 16], m0 - - ; mode 12 [row 31] - pslldq m6, m0, 4 - pinsrb m6, [r3 + 26], 0 - pinsrb m6, [r3 + 19], 1 - pinsrb m6, [r3 + 13], 2 - pinsrb m6, [r3 + 6], 3 - movu [r0 + 702 * 16], m6 - movu m6, [r4 + 12] - movu [r0 + 703 * 16], m6 - - ; mode 11 [row 31] - pslldq m6, m0, 1 - pinsrb m6, [r3 + 16], 0 - movu [r0 + 638 * 16], m6 - movu m6, [r4 + 15] - movu [r0 + 639 * 16], m6 - - movd m1, [r4 + 1] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m1, m0, [r5 + 30 * 16] - pmulhrsw m1, m7 - movu m2, [r4 + 8] - movd m3, [r4 + 9] - palignr m3, m2, 1 - punpcklbw m2, m3 - pmaddubsw m3, m2, [r5 + 30 * 16] - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 576 * 16], m1 - - movu m1, [r4 + 16] - - ; mode 11 [row 15 - second half] - movu [r0 + 607 * 16], m1 - - movd m3, [r4 + 17] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, [r5 + 30 * 16] - pmulhrsw m3, m7 - movu m4, [r4 + 24] - movd m5, [r4 + 25] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, [r5 + 30 * 16] - pmulhrsw m5, 
m7 - packuswb m3, m5 - movu [r0 + 577 * 16], m3 - - ; mode 11 [row 1] - pmaddubsw m3, m0, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 578 * 16], m3 - pmaddubsw m3, m1, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 579 * 16], m3 - - ; mode 11 [row 2] - pmaddubsw m3, m0, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 580 * 16], m3 - pmaddubsw m3, m1, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 581 * 16], m3 - - ; mode 11 [row 3] - pmaddubsw m3, m0, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 582 * 16], m3 - pmaddubsw m3, m1, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 583 * 16], m3 - - ; mode 11 [row 4] - pmaddubsw m3, m0, [r5 + 22 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 22 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 584 * 16], m3 - - ; mode 12 [row 1 - first half] - movu [r0 + 642 * 16], m3 - - pmaddubsw m3, m1, [r5 + 22 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 22 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 585 * 16], m3 - - ; mode 12 [row 1 - second half] - movu [r0 + 643 * 16], m3 - - ; mode 11 [row 5] - pmaddubsw m3, m0, [r5 + 20 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 20 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 586 * 16], m3 - pmaddubsw m3, m1, [r5 + 20 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 20 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 587 * 16], m3 - - ; mode 11 [row 6] - pmaddubsw m3, m0, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 588 * 16], m3 - pmaddubsw m3, m1, [r5 + 18 * 
16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 589 * 16], m3 - - ; mode 11 [row 7] - pmaddubsw m3, m0, [r5 + 16 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 16 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 590 * 16], m3 - pmaddubsw m3, m1, [r5 + 16 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 16 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 591 * 16], m3 - - ; mode 11 [row 8] - pmaddubsw m3, m0, [r5 + 14 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 14 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 592 * 16], m3 - - ; mode 13 [row 1 - first half] - movu [r0 + 706 * 16], m3 - - pmaddubsw m3, m1, [r5 + 14 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 14 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 593 * 16], m3 - - ; mode 13 [row 1 - second half] - movu [r0 + 707 * 16], m3 - - ; mode 11 [row 9] - pmaddubsw m3, m0, [r5 + 12 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 12 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 594 * 16], m3 - - ; mode 12 [row 3 - first half] - movu [r0 + 646 * 16], m3 - - pmaddubsw m3, m1, [r5 + 12 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 12 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 595 * 16], m3 - - ; mode 12 [row 3 - second half] - movu [r0 + 647 * 16], m3 - - ; mode 11 [row 10] - pmaddubsw m3, m0, [r5 + 10 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 10 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 596 * 16], m3 - pmaddubsw m3, m1, [r5 + 10 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 10 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 597 * 16], m3 - - ; mode 11 [row 11] - pmaddubsw m3, m0, [r5 + 8 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 8 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 598 * 16], m3 - pmaddubsw m3, m1, [r5 + 8 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 8 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 599 * 16], m3 - - ; 
mode 11 [row 12] - pmaddubsw m3, m0, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 600 * 16], m3 - - ; mode 14 [row 1 - first half] - movu [r0 + 770 * 16], m3 - - pmaddubsw m3, m1, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 601 * 16], m3 - - ; mode 14 [row 1 - second half] - movu [r0 + 771 * 16], m3 - - ; mode 11 [row 13] - pmaddubsw m3, m0, [r5 + 4 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 4 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 602 * 16], m3 - pmaddubsw m3, m1, [r5 + 4 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 4 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 603 * 16], m3 - - ; mode 11 [row 14] - pmaddubsw m3, m0, [r5 + 2 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 2 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 604 * 16], m3 - - ; mode 13 [row 5 - first half] - movu [r0 + 650 * 16], m3 - - pmaddubsw m3, m1, [r5 + 2 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 2 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 605 * 16], m3 - - ; mode 13 [row 5 - second half] - movu [r0 + 651 * 16], m3 - - ; mode 12 [row 0] - pmaddubsw m3, m0, [r5 + 27 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 27 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 640 * 16], m3 - pmaddubsw m3, m1, [r5 + 27 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 27 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 641 * 16], m3 - - ; mode 12 [row 2] - pmaddubsw m3, m0, [r5 + 17 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 17 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 644 * 16], m3 - pmaddubsw m3, m1, [r5 + 17 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 17 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 645 * 16], m3 - - ; mode 12 [row 4] - pmaddubsw m3, m0, [r5 + 7 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 7 * 16] - pmulhrsw m5, m7 - packuswb m3, 
m5 - movu [r0 + 648 * 16], m3 - pmaddubsw m3, m1, [r5 + 7 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 7 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 649 * 16], m3 - - ; mode 13 [row 0] - pmaddubsw m3, m0, [r5 + 23 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 23 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 704 * 16], m3 - pmaddubsw m3, m1, [r5 + 23 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 23 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 705 * 16], m3 - - ; mode 13 [row 2] - pmaddubsw m3, m0, [r5 + 5 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 5 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 708 * 16], m3 - pmaddubsw m3, m1, [r5 + 5 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 5 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 709 * 16], m3 - - ; mode 14 [row 0] - pmaddubsw m3, m0, [r5 + 19 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 19 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 768 * 16], m3 - pmaddubsw m3, m1, [r5 + 19 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 19 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 769 * 16], m3 - - ; mode 15 [row 0] - pmaddubsw m3, m0, [r5 + 15 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 15 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 832 * 16], m3 - pmaddubsw m3, m1, [r5 + 15 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 15 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 833 * 16], m3 - - ; mode 11 [row 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r3 + 16], 0 - pmaddubsw m3, m0, [r5 + 30 * 16] - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 8], 1 - pinsrb m2, [r4 + 7], 0 - pmaddubsw m5, m2, [r5 + 30 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 608 * 16], m3 - pslldq m1, 2 - pinsrb m1, [r4 + 16], 1 - pinsrb m1, [r4 + 15], 0 - pmaddubsw m3, m1, [r5 + 30 * 16] - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrb m4, [r4 + 24], 1 - pinsrb m4, [r4 + 23], 0 - pmaddubsw m5, m4, [r5 + 30 * 16] - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 609 * 16], m3 - - ; mode 11 [row 17] - pmaddubsw m3, m0, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 610 * 16], m3 - pmaddubsw m3, m1, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 611 * 16], m3 - - ; mode 11 [row 18] - pmaddubsw m3, m0, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 612 * 16], m3 - pmaddubsw m3, m1, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 613 * 16], m3 - - ; mode 11 [row 19] - pmaddubsw m3, m0, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 614 * 16], m3 - pmaddubsw m3, m1, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 615 * 16], m3 - - ; mode 11 [row 20] - pmaddubsw m3, m0, [r5 + 22 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 22 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 616 * 16], m3 - pmaddubsw m3, m1, [r5 + 22 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 22 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 617 * 16], m3 - - ; mode 11 [row 21] - pmaddubsw m3, m0, [r5 + 20 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 20 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 618 * 16], m3 - pmaddubsw m3, m1, [r5 + 20 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 20 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 619 * 16], m3 - - ; mode 11 [row 22] - pmaddubsw m3, m0, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 620 * 16], m3 - pmaddubsw m3, m1, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 
621 * 16], m3 - - ; mode 11 [row 23] - pmaddubsw m3, m0, [r5 + 16 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 16 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 622 * 16], m3 - pmaddubsw m3, m1, [r5 + 16 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 16 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 623 * 16], m3 - - ; mode 11 [row 24] - pmaddubsw m3, m0, [r5 + 14 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 14 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 624 * 16], m3 - pmaddubsw m3, m1, [r5 + 14 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 14 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 625 * 16], m3 - - ; mode 11 [row 25] - pmaddubsw m3, m0, [r5 + 12 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 12 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 626 * 16], m3 - pmaddubsw m3, m1, [r5 + 12 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 12 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 627 * 16], m3 - - ; mode 11 [row 26] - pmaddubsw m3, m0, [r5 + 10 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 10 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 628 * 16], m3 - pmaddubsw m3, m1, [r5 + 10 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 10 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 629 * 16], m3 - - ; mode 11 [row 27] - pmaddubsw m3, m0, [r5 + 8 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 8 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 630 * 16], m3 - pmaddubsw m3, m1, [r5 + 8 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 8 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 631 * 16], m3 - - ; mode 11 [row 28] - pmaddubsw m3, m0, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 632 * 16], m3 - pmaddubsw m3, m1, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 633 * 16], m3 - - ; mode 11 [row 29] - pmaddubsw m3, m0, 
[r5 + 4 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 4 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 634 * 16], m3 - pmaddubsw m3, m1, [r5 + 4 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 4 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 635 * 16], m3 - - ; mode 11 [row 30] - pmaddubsw m3, m0, [r5 + 2 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 2 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 636 * 16], m3 - pmaddubsw m3, m1, [r5 + 2 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 2 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 637 * 16], m3 - - ; mode 12 [row 6] - pinsrb m0, [r3 + 6], 0 - pmaddubsw m3, m0, [r5 + 29 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 29 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 652 * 16], m3 - pmaddubsw m3, m1, [r5 + 29 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 29 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 653 * 16], m3 - - ; mode 12 [row 7] - pmaddubsw m3, m0, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 654 * 16], m3 - pmaddubsw m3, m1, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 655 * 16], m3 - - ; mode 12 [row 8] - pmaddubsw m3, m0, [r5 + 19 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 19 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 656 * 16], m3 - pmaddubsw m3, m1, [r5 + 19 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 19 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 657 * 16], m3 - - ; mode 12 [row 9] - pmaddubsw m3, m0, [r5 + 14 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 14 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 658 * 16], m3 - pmaddubsw m3, m1, [r5 + 14 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 14 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 659 * 16], m3 - - ; mode 12 [row 10] - pmaddubsw m3, m0, [r5 + 9 * 16] - pmulhrsw m3, m7 - 
pmaddubsw m5, m2, [r5 + 9 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 660 * 16], m3 - pmaddubsw m3, m1, [r5 + 9 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 9 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 661 * 16], m3 - - ; mode 12 [row 11] - pmaddubsw m3, m0, [r5 + 4 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 4 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 662 * 16], m3 - pmaddubsw m3, m1, [r5 + 4 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 4 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 663 * 16], m3 - - ; mode 13 [row 3] - movu m6, m0 - pinsrb m6, [r3 + 4], 0 - pmaddubsw m3, m6, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 710 * 16], m3 - pmaddubsw m3, m1, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 711 * 16], m3 - - ; mode 13 [row 4] - pmaddubsw m3, m6, [r5 + 19 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 19 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 712 * 16], m3 - pmaddubsw m3, m1, [r5 + 19 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 19 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 713 * 16], m3 - - ; mode 13 [row 5] - pmaddubsw m3, m6, [r5 + 10 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 10 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 714 * 16], m3 - pmaddubsw m3, m1, [r5 + 10 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 10 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 715 * 16], m3 - - ; mode 13 [row 6] - pmaddubsw m3, m6, [r5 + 1 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 1 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 716 * 16], m3 - pmaddubsw m3, m1, [r5 + 1 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 1 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 717 * 16], m3 - - ; mode 14 [row 2] - movu m6, m0 - pinsrb m6, [r4 + 0], 1 - pinsrb m6, [r3 + 2], 0 - pmaddubsw m3, 
m6, [r5 + 25 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 25 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 772 * 16], m3 - pmaddubsw m3, m1, [r5 + 25 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 25 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 773 * 16], m3 - - ; mode 14 [row 3] - pmaddubsw m3, m6, [r5 + 12 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 12 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 774 * 16], m3 - pmaddubsw m3, m1, [r5 + 12 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 12 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 775 * 16], m3 - - ; mode 15 [row 1] - pmaddubsw m3, m6, [r5 + 30 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 30 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 834 * 16], m3 - pmaddubsw m3, m1, [r5 + 30 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 30 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 835 * 16], m3 - - ; mode 15 [row 2] - pmaddubsw m3, m6, [r5 + 13 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 13 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 836 * 16], m3 - pmaddubsw m3, m1, [r5 + 13 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 13 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 837 * 16], m3 - - ; mode 15 [row 3] - pslldq m6, 2 - pinsrb m6, [r3 + 2], 1 - pinsrb m6, [r3 + 4], 0 - pmaddubsw m3, m6, [r5 + 28 * 16] - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 7], 1 - pinsrb m2, [r4 + 6], 0 - pmaddubsw m5, m2, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 838 * 16], m3 - pslldq m1, 2 - pinsrb m1, [r4 + 15], 1 - pinsrb m1, [r4 + 14], 0 - pmaddubsw m3, m1, [r5 + 28 * 16] - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrb m4, [r4 + 23], 1 - pinsrb m4, [r4 + 22], 0 - pmaddubsw m5, m4, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 839 * 16], m3 - - ; mode 15 [row 4] - pmaddubsw m3, m6, [r5 + 11 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 11 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu 
[r0 + 840 * 16], m3 - pmaddubsw m3, m1, [r5 + 11 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 11 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 841 * 16], m3 - - ; mode 15 [row 5, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 4], 1 - pinsrb m6, [r3 + 6], 0 - pmaddubsw m3, m6, [r5 + 26 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 842 * 16], m3 - - ; mode 15 [row 6, 0-7] - pmaddubsw m3, m6, [r5 + 9 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 844 * 16], m3 - - ; mode 15 [row 7, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 6], 1 - pinsrb m6, [r3 + 8], 0 - pmaddubsw m3, m6, [r5 + 24 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 846 * 16], m3 - - ; mode 15 [row 8, 0-7] - pmaddubsw m3, m6, [r5 + 7 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 848 * 16], m3 - - ; mode 15 [row 9, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 8], 1 - pinsrb m6, [r3 + 9], 0 - pmaddubsw m3, m6, [r5 + 22 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 850 * 16], m3 - - ; mode 15 [row 10, 0-7] - pmaddubsw m3, m6, [r5 + 5 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 852 * 16], m3 - - ; mode 15 [row 11, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 9], 1 - pinsrb m6, [r3 + 11], 0 - pmaddubsw m3, m6, [r5 + 20 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 854 * 16], m3 - - ; mode 15 [row 12, 0-7] - pmaddubsw m3, m6, [r5 + 3 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 856 * 16], m3 - - ; mode 15 [row 13, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 11], 1 - pinsrb m6, [r3 + 13], 0 - pmaddubsw m3, m6, [r5 + 18 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 858 * 16], m3 - - ; mode 15 [row 14, 0-7] - pmaddubsw m3, m6, [r5 + 1 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 860 * 16], m3 - - ; mode 15 [row 15, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 13], 1 - pinsrb m6, [r3 + 15], 0 - pmaddubsw m3, m6, [r5 + 16 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 862 * 16], m3 - - ; mode 15 [row 16, 0-7] - pslldq m6, 2 - pinsrb m6, 
[r3 + 15], 1 - pinsrb m6, [r3 + 17], 0 - pmaddubsw m3, m6, [r5 + 31 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 864 * 16], m3 - - ; mode 15 [row 17, 0-7] - pmaddubsw m3, m6, [r5 + 14 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 866 * 16], m3 - - ; mode 15 [row 18, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 17], 1 - pinsrb m6, [r3 + 19], 0 - pmaddubsw m3, m6, [r5 + 29 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 868 * 16], m3 - - ; mode 15 [row 19, 0-7] - pmaddubsw m3, m6, [r5 + 12 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 870 * 16], m3 - - ; mode 15 [row 20, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 19], 1 - pinsrb m6, [r3 + 21], 0 - pmaddubsw m3, m6, [r5 + 27 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 872 * 16], m3 - - ; mode 15 [row 21, 0-7] - pmaddubsw m3, m6, [r5 + 10 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 874 * 16], m3 - - ; mode 15 [row 22, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 21], 1 - pinsrb m6, [r3 + 23], 0 - pmaddubsw m3, m6, [r5 + 25 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 876 * 16], m3 - - ; mode 15 [row 23, 0-7] - pmaddubsw m3, m6, [r5 + 8 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 878 * 16], m3 - - ; mode 15 [row 24, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 23], 1 - pinsrb m6, [r3 + 24], 0 - pmaddubsw m3, m6, [r5 + 23 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 880 * 16], m3 - - ; mode 15 [row 25, 0-7] - pmaddubsw m3, m6, [r5 + 6 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 882 * 16], m3 - - ; mode 15 [row 26, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 24], 1 - pinsrb m6, [r3 + 26], 0 - pmaddubsw m3, m6, [r5 + 21 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 884 * 16], m3 - - ; mode 15 [row 27, 0-7] - pmaddubsw m3, m6, [r5 + 4 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 886 * 16], m3 - - ; mode 15 [row 28, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 26], 1 - pinsrb m6, [r3 + 28], 0 - pmaddubsw m3, m6, [r5 + 19 * 16] - pmulhrsw m3, 
m7 - packuswb m3, m3 - movh [r0 + 888 * 16], m3 - - ; mode 15 [row 29, 0-7] - pmaddubsw m3, m6, [r5 + 2 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 890 * 16], m3 - - ; mode 15 [row 30, 0-7] - pslldq m6, 2 - pinsrb m6, [r3 + 28], 1 - pinsrb m6, [r3 + 30], 0 - pmaddubsw m3, m6, [r5 + 17 * 16] - pmulhrsw m3, m7 - packuswb m3, m3 - movh [r0 + 892 * 16], m3 - - ; mode 15 [row 31, 0-7] - pshufb m3, m6, [tab_S2] - movh [r0 + 894 * 16], m3 - - ; mode 12 [row 12] - pslldq m0, 2 - pinsrb m0, [r3 + 6], 1 - pinsrb m0, [r3 + 13], 0 - pmaddubsw m3, m0, [r5 + 31 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 31 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 664 * 16], m3 - pmaddubsw m3, m1, [r5 + 31 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 31 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 665 * 16], m3 - - ; mode 12 [row 13] - pmaddubsw m3, m0, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 666 * 16], m3 - pmaddubsw m3, m1, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 667 * 16], m3 - - ; mode 12 [row 14] - pmaddubsw m3, m0, [r5 + 21 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 21 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 668 * 16], m3 - pmaddubsw m3, m1, [r5 + 21 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 21 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 669 * 16], m3 - - ; mode 12 [row 15] - pmaddubsw m3, m0, [r5 + 16 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 16 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 670 * 16], m3 - pmaddubsw m3, m1, [r5 + 16 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 16 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 671 * 16], m3 - - ; mode 12 [row 16] - pmaddubsw m3, m0, [r5 + 11 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 11 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 672 * 16], m3 - pmaddubsw 
m3, m1, [r5 + 11 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 11 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 673 * 16], m3 - - ; mode 12 [row 17] - pmaddubsw m3, m0, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 674 * 16], m3 - pmaddubsw m3, m1, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 675 * 16], m3 - - ; mode 12 [row 18] - pmaddubsw m3, m0, [r5 + 1 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 1 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 676 * 16], m3 - pmaddubsw m3, m1, [r5 + 1 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 1 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 677 * 16], m3 - - ; mode 13 [row 7] - movu m6, m0 - pinsrb m6, [r3 + 4], 2 - pinsrb m6, [r3 + 4], 1 - pinsrb m6, [r3 + 7], 0 - pmaddubsw m3, m6, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 718 * 16], m3 - pmaddubsw m3, m1, [r5 + 24 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 719 * 16], m3 - - ; mode 13 [row 8] - pmaddubsw m3, m6, [r5 + 15 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 15 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 720 * 16], m3 - pmaddubsw m3, m1, [r5 + 15 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 15 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 721 * 16], m3 - - ; mode 13 [row 9] - pmaddubsw m3, m6, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 722 * 16], m3 - pmaddubsw m3, m1, [r5 + 6 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 6 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 723 * 16], m3 - - ; mode 14 [row 4] - pinsrb m6, [r3 + 2], 2 - pinsrb m6, [r3 + 2], 1 - pinsrb m6, [r3 + 5], 0 - pmaddubsw m3, m6, [r5 + 31 * 16] - pmulhrsw m3, m7 - pmaddubsw 
m5, m2, [r5 + 31 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 776 * 16], m3 - pmaddubsw m3, m1, [r5 + 31 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 31 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 777 * 16], m3 - - ; mode 14 [row 5] - pmaddubsw m3, m6, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 778 * 16], m3 - pmaddubsw m3, m1, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 779 * 16], m3 - - ; mode 14 [row 6] - pmaddubsw m3, m6, [r5 + 5 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 5 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 780 * 16], m3 - pmaddubsw m3, m1, [r5 + 5 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 5 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 781 * 16], m3 - - ; mode 14 [row 7] - pslldq m6, 2 - pinsrb m6, [r3 + 5], 1 - pinsrb m6, [r3 + 7], 0 - pmaddubsw m3, m6, [r5 + 24 * 16] - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 5], 0 - pmaddubsw m5, m2, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 782 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 13], 0 - pmaddubsw m3, m1, [r5 + 24 * 16] - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 21], 0 - pmaddubsw m5, m4, [r5 + 24 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 783 * 16], m3 - - ; mode 14 [row 8] - pmaddubsw m3, m6, [r5 + 11 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 11 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 784 * 16], m3 - pmaddubsw m3, m1, [r5 + 11 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 11 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 785 * 16], m3 - - ; mode 15 [row 5, 8-31] - pmaddubsw m5, m2, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m5, m5 - movh [r0 + 842 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 26 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 26 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 843 * 16], m3 - 
- ; mode 15 [row 6, 8-31] - pmaddubsw m5, m2, [r5 + 9 * 16] - pmulhrsw m5, m7 - packuswb m5, m5 - movh [r0 + 844 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 9 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 9 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 845 * 16], m3 - - ; mode 12 [row 19] - pslldq m0, 2 - pinsrb m0, [r3 + 13], 1 - pinsrb m0, [r3 + 19], 0 - pmaddubsw m3, m0, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 678 * 16], m3 - pmaddubsw m3, m1, [r5 + 28 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 28 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 679 * 16], m3 - - ; mode 12 [row 20] - pmaddubsw m3, m0, [r5 + 23 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 23 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 680 * 16], m3 - pmaddubsw m3, m1, [r5 + 23 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 23 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 681 * 16], m3 - - ; mode 12 [row 21] - pmaddubsw m3, m0, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 682 * 16], m3 - pmaddubsw m3, m1, [r5 + 18 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 18 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 683 * 16], m3 - - ; mode 12 [row 22] - pmaddubsw m3, m0, [r5 + 13 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 13 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 684 * 16], m3 - pmaddubsw m3, m1, [r5 + 13 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 13 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 685 * 16], m3 - - ; mode 12 [row 23] - pmaddubsw m3, m0, [r5 + 8 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 8 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 686 * 16], m3 - pmaddubsw m3, m1, [r5 + 8 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 8 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 687 * 16], m3 - - ; mode 12 [row 24] - pmaddubsw 
m3, m0, [r5 + 3 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m2, [r5 + 3 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 688 * 16], m3 - pmaddubsw m3, m1, [r5 + 3 * 16] - pmulhrsw m3, m7 - pmaddubsw m5, m4, [r5 + 3 * 16] - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 689 * 16], m3 - - ; mode 13 [row 10] - movu m7, m6 - movu m6, m0 - pinsrb m6, [r3 + 4], 4 - pinsrb m6, [r3 + 4], 3 - pinsrb m6, [r3 + 7], 2 - pinsrb m6, [r3 + 7], 1 - pinsrb m6, [r3 + 11], 0 - pmaddubsw m3, m6, [r5 + 29 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 29 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 724 * 16], m3 - pmaddubsw m3, m1, [r5 + 29 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 29 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 725 * 16], m3 - - ; mode 13 [row 11] - pmaddubsw m3, m6, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 726 * 16], m3 - pmaddubsw m3, m1, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 727 * 16], m3 - - ; mode 13 [row 12] - pmaddubsw m3, m6, [r5 + 11 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 11 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 728 * 16], m3 - pmaddubsw m3, m1, [r5 + 11 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 11 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 729 * 16], m3 - - ; mode 13 [row 13] - pmaddubsw m3, m6, [r5 + 2 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 2 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 730 * 16], m3 - pmaddubsw m3, m1, [r5 + 2 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 2 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 731 * 16], m3 - - ; mode 14 [row 9] - pslldq m7, 2 - pinsrb m7, [r3 + 7], 1 - pinsrb m7, [r3 + 10], 0 - pmaddubsw m3, m7, [r5 + 30 * 16] - pmulhrsw m3, [pw_1024] - 
pslldq m2, 2 - pinsrw m2, [r4 + 4], 0 - pmaddubsw m5, m2, [r5 + 30 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 786 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 12], 0 - pmaddubsw m3, m1, [r5 + 30 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrb m4, [r4 + 21], 1 - pinsrb m4, [r4 + 20], 0 - pmaddubsw m5, m4, [r5 + 30 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 787 * 16], m3 - - ; mode 14 [row 10] - pmaddubsw m3, m7, [r5 + 17 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 17 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 788 * 16], m3 - pmaddubsw m3, m1, [r5 + 17 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 17 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 789 * 16], m3 - - ; mode 14 [row 11] - pmaddubsw m3, m7, [r5 + 4 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 4 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 790 * 16], m3 - pmaddubsw m3, m1, [r5 + 4 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 4 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 791 * 16], m3 - - movu m6, [pw_1024] - - ; mode 15 [row 7, 8-31] - pmaddubsw m5, m2, [r5 + 24 * 16] - pmulhrsw m5, m6 - packuswb m5, m5 - movh [r0 + 846 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 24 * 16] - pmulhrsw m3, m6 - pmaddubsw m5, m4, [r5 + 24 * 16] - pmulhrsw m5, m6 - packuswb m3, m5 - movu [r0 + 847 * 16], m3 - - ; mode 15 [row 8, 8-31] - pmaddubsw m5, m2, [r5 + 7 * 16] - pmulhrsw m5, m6 - packuswb m5, m5 - movh [r0 + 848 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 7 * 16] - pmulhrsw m3, m6 - pmaddubsw m5, m4, [r5 + 7 * 16] - pmulhrsw m5, m6 - packuswb m3, m5 - movu [r0 + 849 * 16], m3 - - ; mode 12 [row 25] - pslldq m0, 2 - pinsrb m0, [r3 + 19], 1 - pinsrb m0, [r3 + 26], 0 - pmaddubsw m3, m0, [r5 + 30 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 30 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 690 * 16], m3 - pmaddubsw m3, m1, [r5 + 30 * 16] - 
pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 30 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 691 * 16], m3 - - ; mode 12 [row 26] - pmaddubsw m3, m0, [r5 + 25 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 25 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 692 * 16], m3 - pmaddubsw m3, m1, [r5 + 25 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 25 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 693 * 16], m3 - - ; mode 12 [row 27] - pmaddubsw m3, m0, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 694 * 16], m3 - pmaddubsw m3, m1, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 695 * 16], m3 - - ; mode 12 [row 28] - pmaddubsw m3, m0, [r5 + 15 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 15 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 696 * 16], m3 - pmaddubsw m3, m1, [r5 + 15 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 15 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 697 * 16], m3 - - ; mode 12 [row 29] - pmaddubsw m3, m0, [r5 + 10 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 10 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 698 * 16], m3 - pmaddubsw m3, m1, [r5 + 10 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 10 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 699 * 16], m3 - - ; mode 12 [row 30] - pmaddubsw m3, m0, [r5 + 5 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 5 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 700 * 16], m3 - pmaddubsw m3, m1, [r5 + 5 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 5 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 701 * 16], m3 - - ; mode 13 [row 14] - movu m6, m0 - pinsrb m6, [r3 + 4], 6 - pinsrb m6, [r3 + 4], 5 - pinsrb m6, [r3 + 7], 
4 - pinsrb m6, [r3 + 7], 3 - pinsrb m6, [r3 + 11], 2 - pinsrb m6, [r3 + 11], 1 - pinsrb m6, [r3 + 14], 0 - pmaddubsw m3, m6, [r5 + 25 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 25 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 732 * 16], m3 - pmaddubsw m3, m1, [r5 + 25 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 25 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 733 * 16], m3 - - ; mode 13 [row 15] - pmaddubsw m3, m6, [r5 + 16 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 16 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 734 * 16], m3 - pmaddubsw m3, m1, [r5 + 16 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 16 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 735 * 16], m3 - - ; mode 13 [row 16] - pmaddubsw m3, m6, [r5 + 7 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 7 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 736 * 16], m3 - pmaddubsw m3, m1, [r5 + 7 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 7 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 737 * 16], m3 - - ; mode 13 [row 17] - pslldq m6, 2 - pinsrb m6, [r3 + 14], 1 - pinsrb m6, [r3 + 18], 0 - pmaddubsw m3, m6, [r5 + 30 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrw m2, [r4 + 3], 0 - pmaddubsw m5, m2, [r5 + 30 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 738 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 11], 0 - pmaddubsw m3, m1, [r5 + 30 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 19], 0 - pmaddubsw m5, m4, [r5 + 30 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 739 * 16], m3 - - ; mode 13 [row 18] - pmaddubsw m3, m6, [r5 + 21 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 21 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 740 * 16], m3 - pmaddubsw m3, m1, [r5 + 21 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 21 * 16] - pmulhrsw m5, [pw_1024] - packuswb 
m3, m5 - movu [r0 + 741 * 16], m3 - - ; mode 13 [row 19] - pmaddubsw m3, m6, [r5 + 12 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 12 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 742 * 16], m3 - pmaddubsw m3, m1, [r5 + 12 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 12 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 743 * 16], m3 - - ; mode 13 [row 20] - pmaddubsw m3, m6, [r5 + 3 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 3 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 744 * 16], m3 - pmaddubsw m3, m1, [r5 + 3 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 3 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 745 * 16], m3 - - ; mode 14 [row 12] - pslldq m7, 2 - pinsrb m7, [r3 + 10], 1 - pinsrb m7, [r3 + 12], 0 - pmaddubsw m3, m7, [r5 + 23 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 23 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 792 * 16], m3 - pmaddubsw m3, m1, [r5 + 23 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 23 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 793 * 16], m3 - - ; mode 14 [row 13] - pmaddubsw m3, m7, [r5 + 10 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 10 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 794 * 16], m3 - pmaddubsw m3, m1, [r5 + 10 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 10 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 795 * 16], m3 - - ; mode 15 [row 9] - pmaddubsw m5, m2, [r5 + 22 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movu [r0 + 850 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 22 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 22 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 851 * 16], m3 - - ; mode 15 [row 10] - pmaddubsw m5, m2, [r5 + 5 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movu [r0 + 852 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 5 * 16] - pmulhrsw m3, [pw_1024] - 
pmaddubsw m5, m4, [r5 + 5 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 853 * 16], m3 - - ; mode 13 [row 21] - pslldq m6, 2 - pinsrb m6, [r3 + 18], 1 - pinsrb m6, [r3 + 21], 0 - pmaddubsw m3, m6, [r5 + 26 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrw m2, [r4 + 2], 0 - pmaddubsw m5, m2, [r5 + 26 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 746 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 10], 0 - pmaddubsw m3, m1, [r5 + 26 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 18], 0 - pmaddubsw m5, m4, [r5 + 26 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 747 * 16], m3 - - ; mode 13 [row 22] - pmaddubsw m3, m6, [r5 + 17 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 17 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 748 * 16], m3 - pmaddubsw m3, m1, [r5 + 17 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 17 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 749 * 16], m3 - - ; mode 13 [row 23] - pmaddubsw m3, m6, [r5 + 8 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 8 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 750 * 16], m3 - pmaddubsw m3, m1, [r5 + 8 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 8 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 751 * 16], m3 - - ; mode 14 [row 14] - pslldq m7, 2 - pinsrb m7, [r3 + 12], 1 - pinsrb m7, [r3 + 15], 0 - pmaddubsw m3, m7, [r5 + 29 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 29 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 796 * 16], m3 - pmaddubsw m3, m1, [r5 + 29 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 29 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 797 * 16], m3 - - ; mode 14 [row 15] - pmaddubsw m3, m7, [r5 + 16 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 16 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 798 * 16], m3 - pmaddubsw m3, m1, [r5 + 16 * 16] - pmulhrsw 
m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 16 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 799 * 16], m3 - - ; mode 14 [row 16] - pmaddubsw m3, m7, [r5 + 3 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 3 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 800 * 16], m3 - pmaddubsw m3, m1, [r5 + 3 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 3 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 801 * 16], m3 - - ; mode 15 [row 11] - pmaddubsw m5, m2, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 854 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 855 * 16], m3 - - ; mode 15 [row 12] - pmaddubsw m5, m2, [r5 + 3 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 856 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 3 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 3 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 857 * 16], m3 - - ; mode 13 [row 24] - pslldq m6, 2 - pinsrb m6, [r3 + 21], 1 - pinsrb m6, [r3 + 25], 0 - pmaddubsw m3, m6, [r5 + 31 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrw m2, [r4 + 1], 0 - pmaddubsw m5, m2, [r5 + 31 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 752 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 9], 0 - pmaddubsw m3, m1, [r5 + 31 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 17], 0 - pmaddubsw m5, m4, [r5 + 31 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 753 * 16], m3 - - ; mode 13 [row 25] - pmaddubsw m3, m6, [r5 + 22 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 22 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 754 * 16], m3 - pmaddubsw m3, m1, [r5 + 22 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 22 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 755 * 16], m3 - - ; mode 13 [row 26] - pmaddubsw m3, m6, [r5 
+ 13 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 13 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 756 * 16], m3 - pmaddubsw m3, m1, [r5 + 13 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 13 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 757 * 16], m3 - - ; mode 13 [row 27] - pmaddubsw m3, m6, [r5 + 4 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 4 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 758 * 16], m3 - pmaddubsw m3, m1, [r5 + 4 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 4 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 759 * 16], m3 - - ; mode 14 [row 17] - pslldq m7, 2 - pinsrb m7, [r3 + 15], 1 - pinsrb m7, [r3 + 17], 0 - pmaddubsw m3, m7, [r5 + 22 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 22 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 802 * 16], m3 - pmaddubsw m3, m1, [r5 + 22 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 22 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 803 * 16], m3 - - ; mode 14 [row 18] - pmaddubsw m3, m7, [r5 + 9 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 9 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 804 * 16], m3 - pmaddubsw m3, m1, [r5 + 9 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 9 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 805 * 16], m3 - - ; mode 15 [row 13] - pmaddubsw m5, m2, [r5 + 18 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 858 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 18 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 18 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 859 * 16], m3 - - ; mode 15 [row 14] - pmaddubsw m5, m2, [r5 + 1 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 860 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 1 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 1 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 
+ 861 * 16], m3 - - ; mode 13 [row 28] - pslldq m6, 2 - pinsrb m6, [r3 + 25], 1 - pinsrb m6, [r3 + 28], 0 - pmaddubsw m3, m6, [r5 + 27 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrw m2, [r4 + 0], 0 - pmaddubsw m5, m2, [r5 + 27 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 760 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 8], 0 - pmaddubsw m3, m1, [r5 + 27 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 16], 0 - pmaddubsw m5, m4, [r5 + 27 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 761 * 16], m3 - - ; mode 13 [row 29] - pmaddubsw m3, m6, [r5 + 18 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 18 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 762 * 16], m3 - pmaddubsw m3, m1, [r5 + 18 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 18 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 763 * 16], m3 - - ; mode 13 [row 30] - pmaddubsw m3, m6, [r5 + 9 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 9 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 764 * 16], m3 - pmaddubsw m3, m1, [r5 + 9 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 9 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 765 * 16], m3 - - ; mode 14 [row 19] - pslldq m7, 2 - pinsrb m7, [r3 + 17], 1 - pinsrb m7, [r3 + 20], 0 - pmaddubsw m3, m7, [r5 + 28 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 28 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 806 * 16], m3 - pmaddubsw m3, m1, [r5 + 28 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 28 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 807 * 16], m3 - - ; mode 14 [row 20] - pmaddubsw m3, m7, [r5 + 15 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 15 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 808 * 16], m3 - pmaddubsw m3, m1, [r5 + 15 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 15 * 16] - pmulhrsw m5, [pw_1024] - packuswb 
m3, m5 - movu [r0 + 809 * 16], m3 - - ; mode 14 [row 21] - pmaddubsw m3, m7, [r5 + 2 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 2 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 810 * 16], m3 - pmaddubsw m3, m1, [r5 + 2 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 2 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 811 * 16], m3 - - ; mode 15 [row 15] - pmaddubsw m5, m2, [r5 + 16 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 862 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 16 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 16 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 863 * 16], m3 - - ; mode 14 [row 22] - pslldq m7, 2 - pinsrb m7, [r3 + 20], 1 - pinsrb m7, [r3 + 22], 0 - pmaddubsw m3, m7, [r5 + 21 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrb m2, [r4 + 0], 1 - pinsrb m2, [r3 + 2], 0 - pmaddubsw m5, m2, [r5 + 21 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 812 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 7], 0 - pmaddubsw m3, m1, [r5 + 21 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 15], 0 - pmaddubsw m5, m4, [r5 + 21 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 813 * 16], m3 - - ; mode 14 [row 23] - pmaddubsw m3, m7, [r5 + 8 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 8 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 814 * 16], m3 - pmaddubsw m3, m1, [r5 + 8 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 8 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 815 * 16], m3 - - ; mode 15 [row 16] - pmaddubsw m5, m2, [r5 + 31 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 864 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 31 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 31 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 865 * 16], m3 - - ; mode 15 [row 17] - pmaddubsw m5, m2, [r5 + 14 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh 
[r0 + 866 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 14 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 14 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 867 * 16], m3 - - ; mode 14 [row 24] - pslldq m7, 2 - pinsrb m7, [r3 + 22], 1 - pinsrb m7, [r3 + 25], 0 - pmaddubsw m3, m7, [r5 + 27 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrb m2, [r3 + 2], 1 - pinsrb m2, [r3 + 5], 0 - pmaddubsw m5, m2, [r5 + 27 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 816 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 6], 0 - pmaddubsw m3, m1, [r5 + 27 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 14], 0 - pmaddubsw m5, m4, [r5 + 27 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 817 * 16], m3 - - ; mode 14 [row 25] - pmaddubsw m3, m7, [r5 + 14 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 14 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 818 * 16], m3 - pmaddubsw m3, m1, [r5 + 14 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 14 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 819 * 16], m3 - - ; mode 14 [row 26] - pmaddubsw m3, m7, [r5 + 1 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 1 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 820 * 16], m3 - pmaddubsw m3, m1, [r5 + 1 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 1 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 821 * 16], m3 - - ; mode 15 [row 18] - pinsrb m2, [r3 + 4], 0 - pmaddubsw m5, m2, [r5 + 29 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 868 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 29 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 29 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 869 * 16], m3 - - ; mode 15 [row 19] - pmaddubsw m5, m2, [r5 + 12 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 870 * 16 + 8], m5 - pmaddubsw m3, m1, [r5 + 12 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 12 
* 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 871 * 16], m3 - - ; mode 15 [row 20 - 8 to 15] - pslldq m3, m2, 2 - pinsrb m3, [r3 + 4], 1 - pinsrb m3, [r3 + 6], 0 - pmaddubsw m5, m3, [r5 + 27 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 872 * 16 + 8], m5 - - ; mode 15 [row 21 - 8 to 15] - pmaddubsw m5, m3, [r5 + 10 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 874 * 16 + 8], m5 - - ; mode 15 [row 22 - 8 to 15] - pslldq m3, 2 - pinsrb m3, [r3 + 6], 1 - pinsrb m3, [r3 + 8], 0 - pmaddubsw m5, m3, [r5 + 25 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 876 * 16 + 8], m5 - - ; mode 15 [row 23 - 8 to 15] - pmaddubsw m5, m3, [r5 + 8 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 878 * 16 + 8], m5 - - ; mode 15 [row 24 - 8 to 15] - pslldq m3, 2 - pinsrb m3, [r3 + 8], 1 - pinsrb m3, [r3 + 9], 0 - pmaddubsw m5, m3, [r5 + 23 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 880 * 16 + 8], m5 - - ; mode 15 [row 25 - 8 to 15] - pmaddubsw m5, m3, [r5 + 6 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 882 * 16 + 8], m5 - - ; mode 15 [row 26 - 8 to 15] - pslldq m3, 2 - pinsrb m3, [r3 + 9], 1 - pinsrb m3, [r3 + 11], 0 - pmaddubsw m5, m3, [r5 + 21 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 884 * 16 + 8], m5 - - ; mode 15 [row 27 - 8 to 15] - pmaddubsw m5, m3, [r5 + 4 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 886 * 16 + 8], m5 - - ; mode 15 [row 28 - 8 to 15] - pslldq m3, 2 - pinsrb m3, [r3 + 11], 1 - pinsrb m3, [r3 + 13], 0 - pmaddubsw m5, m3, [r5 + 19 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 888 * 16 + 8], m5 - - ; mode 15 [row 29 - 8 to 15] - pmaddubsw m5, m3, [r5 + 2 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, m5 - movh [r0 + 890 * 16 + 8], m5 - - ; mode 15 [row 30 - 8 to 15] - pslldq m3, 2 - pinsrb m3, [r3 + 13], 1 - pinsrb m3, [r3 + 15], 0 - pmaddubsw m5, m3, [r5 + 17 * 16] - pmulhrsw m5, [pw_1024] - packuswb m5, 
m5 - movh [r0 + 892 * 16 + 8], m5 - - ; mode 15 [row 31, 8 to 15] - pshufb m5, m3, [tab_S2] - movh [r0 + 894 * 16 + 8], m5 - - ; mode 14 [row 27] - pinsrb m2, [r3 + 5], 0 - pslldq m7, 2 - pinsrb m7, [r3 + 25], 1 - pinsrb m7, [r3 + 27], 0 - pmaddubsw m3, m7, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrb m2, [r3 + 5], 1 - pinsrb m2, [r3 + 7], 0 - pmaddubsw m5, m2, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 822 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 5], 0 - pmaddubsw m3, m1, [r5 + 20 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 13], 0 - pmaddubsw m5, m4, [r5 + 20 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 823 * 16], m3 - - ; mode 15 [row 20 - 16 to 31] - pmaddubsw m3, m1, [r5 + 27 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 27 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 873 * 16], m3 - - ; mode 15 [row 21 - 16 to 31] - pmaddubsw m3, m1, [r5 + 10 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 10 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 875 * 16], m3 - - ; mode 14 [row 28] - pmaddubsw m3, m7, [r5 + 7 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 7 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 824 * 16], m3 - pmaddubsw m3, m1, [r5 + 7 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 7 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 825 * 16], m3 - - ; mode 14 [row 29] - pslldq m7, 2 - pinsrb m7, [r3 + 27], 1 - pinsrb m7, [r3 + 30], 0 - pmaddubsw m3, m7, [r5 + 26 * 16] - pmulhrsw m3, [pw_1024] - pslldq m2, 2 - pinsrb m2, [r3 + 7], 1 - pinsrb m2, [r3 + 10], 0 - pmaddubsw m5, m2, [r5 + 26 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 826 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 4], 0 - pmaddubsw m3, m1, [r5 + 26 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 12], 0 - pmaddubsw m5, m4, [r5 + 26 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, 
m5 - movu [r0 + 827 * 16], m3 - - ; mode 14 [row 30] - pmaddubsw m3, m7, [r5 + 13 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m2, [r5 + 13 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 828 * 16], m3 - pmaddubsw m3, m1, [r5 + 13 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 13 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 829 * 16], m3 - - ; mode 15 [row 22] - pmaddubsw m3, m1, [r5 + 25 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 25 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 877 * 16], m3 - - ; mode 15 [row 23] - pmaddubsw m3, m1, [r5 + 8 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 8 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 879 * 16], m3 - - ; mode 14 [row 31] - pshufb m3, m7, [tab_S2] - movh [r0 + 830 * 16], m3 - pshufb m3, m2, [tab_S2] - movh [r0 + 830 * 16 + 8], m3 - pshufb m3, m1, [tab_S2] - movh [r0 + 831 * 16], m3 - pshufb m3, m4, [tab_S2] - movh [r0 + 831 * 16 + 8], m3 - - ; mode 13 [row 31] - pshufb m0, m6, [tab_S2] - movh [r0 + 766 * 16], m0 - movh m0, [r4] - movh [r0 + 766 * 16 + 8], m0 - movu m0, [r4 + 8] - movu [r0 + 767 * 16], m0 - - ; mode 15 [row 24] - pslldq m1, 2 - pinsrw m1, [r4 + 3], 0 - pmaddubsw m3, m1, [r5 + 23 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 11], 0 - pmaddubsw m5, m4, [r5 + 23 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 881 * 16], m3 - - ; mode 15 [row 25] - pmaddubsw m3, m1, [r5 + 6 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 6 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 883 * 16], m3 - - ; mode 15 [row 26] - pslldq m1, 2 - pinsrw m1, [r4 + 2], 0 - pmaddubsw m3, m1, [r5 + 21 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 10], 0 - pmaddubsw m5, m4, [r5 + 21 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 885 * 16], m3 - - ; mode 15 [row 27] - pmaddubsw m3, m1, [r5 + 4 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 
+ 4 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 887 * 16], m3 - - ; mode 15 [row 28] - pslldq m1, 2 - pinsrw m1, [r4 + 1], 0 - pmaddubsw m3, m1, [r5 + 19 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 9], 0 - pmaddubsw m5, m4, [r5 + 19 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 889 * 16], m3 - - ; mode 15 [row 29] - pmaddubsw m3, m1, [r5 + 2 * 16] - pmulhrsw m3, [pw_1024] - pmaddubsw m5, m4, [r5 + 2 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 891 * 16], m3 - - ; mode 15 [row 30] - pslldq m1, 2 - pinsrw m1, [r4 + 0], 0 - pmaddubsw m3, m1, [r5 + 17 * 16] - pmulhrsw m3, [pw_1024] - pslldq m4, 2 - pinsrw m4, [r4 + 8], 0 - pmaddubsw m5, m4, [r5 + 17 * 16] - pmulhrsw m5, [pw_1024] - packuswb m3, m5 - movu [r0 + 893 * 16], m3 - - ; mode 15 [row 31] - pshufb m5, m1, [tab_S2] - movh [r0 + 895 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 895 * 16 + 8], m5 - - ; mode 16 [row 0] - movu m6, [r5 + 11 * 16] - movu m7, [pw_1024] - movh m0, [r4 ] - movh m1, [r4 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movh m2, [r4 + 8] - movh m3, [r4 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 896 * 16], m1 - - movh m1, [r4 + 16] - movh m3, [r4 + 17] - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movh m4, [r4 + 24] - movh m5, [r4 + 25] - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 897 * 16], m3 - - ; mode16 [row 1] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r4], 1 - pinsrb m0, [r3 + 2], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 898 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 15], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 23], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 899 * 16], m3 - - ; mode16 
[row 2] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 900 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 901 * 16], m3 - - ; mode16 [row 3] - movu m6, [r5 + 12 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 2], 1 - pinsrb m0, [r3 + 3], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 902 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 14], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 22], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 903 * 16], m3 - - ; mode16 [row 4] - movu m6, [r5 + 23 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 3], 1 - pinsrb m0, [r3 + 5], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 904 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 13], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 21], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 905 * 16], m3 - - ; mode16 [row 5] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 906 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 907 * 16], m3 - - ; mode16 [row 6] - movu m6, [r5 + 13 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 5], 1 - pinsrb m0, [r3 + 6], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 5], 1 - pinsrb m2, [r4 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 908 * 16], m3 - pslldq m1, 2 - pinsrw m1, [r4 + 12], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 20], 0 - 
pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 909 * 16], m3 - - ; mode16 [row 7] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 6], 1 - pinsrb m0, [r3 + 8], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 910 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 11], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 19], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 911 * 16], m3 - - ; mode16 [row 8] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 912 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 913 * 16], m3 - - ; mode16 [row 9] - movu m6, [r5 + 14 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 8], 1 - pinsrb m0, [r3 + 9], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 914 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 10], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 18], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 915 * 16], m3 - - ; mode16 [row 10] - movu m6, [r5 + 25 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 9], 1 - pinsrb m0, [r3 + 11], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 916 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 9], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrb m4, [r4 + 18], 1 - pinsrb m4, [r4 + 17], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 917 * 16], m3 - - ; mode16 [row 11] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 918 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 919 * 16], m3 - - ; mode16 [row 12] - movu m6, [r5 + 15 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 11], 1 - pinsrb m0, [r3 + 12], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 920 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 8], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 16], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 921 * 16], m3 - - ; mode16 [row 13] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 12], 1 - pinsrb m0, [r3 + 14], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 0], 1 - pinsrb m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 922 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 7], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 15], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 923 * 16], m3 - - ; mode16 [row 14] - movu m6, [r5 + 5 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 924 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 925 * 16], m3 - - ; mode16 [row 15] - movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 14], 1 - pinsrb m0, [r3 + 15], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 2], 1 - pinsrb m2, [r3 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 926 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 6], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 14], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu 
[r0 + 927 * 16], m3 - - ; mode16 [row 16] - movu m6, [r5 + 27 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 15], 1 - pinsrb m0, [r3 + 17], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 3], 1 - pinsrb m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 928 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 5], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 13], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 929 * 16], m3 - - ; mode16 [row 17] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 930 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 931 * 16], m3 - - ; mode16 [row 18] - movu m6, [r5 + 17 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 17], 1 - pinsrb m0, [r3 + 18], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 5], 1 - pinsrb m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 932 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 4], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 12], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 933 * 16], m3 - - ; mode16 [row 19] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 18], 1 - pinsrb m0, [r3 + 20], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 6], 1 - pinsrb m2, [r3 + 8], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 934 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 3], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 11], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 935 * 16], m3 - - ; mode16 [row 20] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, 
m7 - packuswb m3, m5 - movu [r0 + 936 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 937 * 16], m3 - - ; mode16 [row 21] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 20], 1 - pinsrb m0, [r3 + 21], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 8], 1 - pinsrb m2, [r3 + 9], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 938 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 2], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 10], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 939 * 16], m3 - - ; mode16 [row 22] - movu m6, [r5 + 29 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 21], 1 - pinsrb m0, [r3 + 23], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 9], 1 - pinsrb m2, [r3 + 11], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 940 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 1], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 9], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 941 * 16], m3 - - ; mode16 [row 23] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 942 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 943 * 16], m3 - - ; mode16 [row 24] - movu m6, [r5 + 19 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 23], 1 - pinsrb m0, [r3 + 24], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 11], 1 - pinsrb m2, [r3 + 12], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 944 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 0], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 8], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb 
m3, m5 - movu [r0 + 945 * 16], m3 - - ; mode16 [row 25] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 24], 1 - pinsrb m0, [r3 + 26], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 12], 1 - pinsrb m2, [r3 + 14], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 946 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r4 + 0], 1 - pinsrb m1, [r3 + 2], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 7], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 947 * 16], m3 - - ; mode16 [row 26] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 948 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 949 * 16], m3 - - ; mode16 [row 27] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 26], 1 - pinsrb m0, [r3 + 27], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 14], 1 - pinsrb m2, [r3 + 15], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 950 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 2], 1 - pinsrb m1, [r3 + 3], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 6], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 951 * 16], m3 - - ; mode16 [row 28] - movu m6, [r5 + 31 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 27], 1 - pinsrb m0, [r3 + 29], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 15], 1 - pinsrb m2, [r3 + 17], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 952 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 3], 1 - pinsrb m1, [r3 + 5], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 5], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 953 * 16], m3 - - ; mode16 [row 29] - movu m6, 
[r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 954 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 955 * 16], m3 - - ; mode16 [row 30] - movu m6, [r5 + 21 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 29], 1 - pinsrb m0, [r3 + 30], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 17], 1 - pinsrb m2, [r3 + 18], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 956 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 5], 1 - pinsrb m1, [r3 + 6], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 4], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 957 * 16], m3 - - ; mode16 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 958 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 958 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 959 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 959 * 16 + 8], m5 - - ; mode 17 [row 0] - movu m6, [r5 + 6 * 16] - movu m7, [pw_1024] - movh m0, [r4 ] - movh m1, [r4 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movh m2, [r4 + 8] - movh m3, [r4 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 960 * 16], m1 - - movh m1, [r4 + 16] - movh m3, [r4 + 17] - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movh m4, [r4 + 24] - movh m5, [r4 + 25] - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 961 * 16], m3 - - ; mode17 [row 1] - movu m6, [r5 + 12 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 0], 1 - pinsrb m0, [r3 + 1], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 962 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 15], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - 
pinsrw m4, [r4 + 23], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 963 * 16], m3 - - ; mode17 [row 2] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 1], 1 - pinsrb m0, [r3 + 2], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 964 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 14], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 22], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 965 * 16], m3 - - ; mode17 [row 3] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 2], 1 - pinsrb m0, [r3 + 4], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 966 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 13], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 21], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 967 * 16], m3 - - ; mode17 [row 4] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 4], 1 - pinsrb m0, [r3 + 5], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 968 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 12], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 20], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 969 * 16], m3 - - ; mode17 [row 5] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 970 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 971 * 16], m3 - - ; mode17 [row 6] - movu m6, [r5 + 10 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 5], 1 - pinsrb m0, [r3 + 6], 0 - 
pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 972 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 11], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 19], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 973 * 16], m3 - - ; mode17 [row 7] - movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 6], 1 - pinsrb m0, [r3 + 7], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 974 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 10], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 18], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 975 * 16], m3 - - ; mode17 [row 8] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 7], 1 - pinsrb m0, [r3 + 9], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 976 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 9], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 17], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 977 * 16], m3 - - ; mode17 [row 9] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 9], 1 - pinsrb m0, [r3 + 10], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrw m2, [r4 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 978 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 8], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 16], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 979 * 16], m3 - - ; mode17 [row 10] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu 
[r0 + 980 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 981 * 16], m3 - - ; mode17 [row 11] - movu m6, [r5 + 8 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 10], 1 - pinsrb m0, [r3 + 11], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 0], 1 - pinsrb m2, [r3 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 982 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 7], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 15], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 983 * 16], m3 - - ; mode17 [row 12] - movu m6, [r5 + 14 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 11], 1 - pinsrb m0, [r3 + 12], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 1], 1 - pinsrb m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 984 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 6], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 14], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 985 * 16], m3 - - ; mode17 [row 13] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 12], 1 - pinsrb m0, [r3 + 14], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 2], 1 - pinsrb m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 986 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 5], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 13], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 987 * 16], m3 - - ; mode17 [row 14] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 14], 1 - pinsrb m0, [r3 + 15], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 4], 1 - pinsrb m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 988 * 
16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 4], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 12], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 989 * 16], m3 - - ; mode17 [row 15] - pshufb m5, m0, [tab_S2] - movh [r0 + 990 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 990 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 991 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 991 * 16 + 8], m5 - - ; mode17 [row 16] - movu m6, [r5 + 6 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 15], 1 - pinsrb m0, [r3 + 16], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 5], 1 - pinsrb m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 992 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 3], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 11], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 993 * 16], m3 - - ; mode17 [row 17] - movu m6, [r5 + 12 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 16], 1 - pinsrb m0, [r3 + 17], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 6], 1 - pinsrb m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 994 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 2], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 10], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 995 * 16], m3 - - ; mode17 [row 18] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 17], 1 - pinsrb m0, [r3 + 18], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 7], 1 - pinsrb m2, [r3 + 9], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 996 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 1], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 9], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 997 
* 16], m3 - - ; mode17 [row 19] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 18], 1 - pinsrb m0, [r3 + 20], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 9], 1 - pinsrb m2, [r3 + 10], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 998 * 16], m3 - - pslldq m1, 2 - pinsrw m1, [r4 + 0], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 8], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 999 * 16], m3 - - ; mode17 [row 20] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 20], 1 - pinsrb m0, [r3 + 21], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 10], 1 - pinsrb m2, [r3 + 11], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1000 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r4 + 0], 1 - pinsrb m1, [r3 + 1], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - ;pinsrb m4, [r4 + 8], 1 - ;pinsrb m4, [r4 + 7], 0 - pinsrw m4, [r4 + 7], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1001 * 16], m3 - - ; mode17 [row 21] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1002 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1003 * 16], m3 - - ; mode17 [row 22] - movu m6, [r5 + 10 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 21], 1 - pinsrb m0, [r3 + 22], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 11], 1 - pinsrb m2, [r3 + 12], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1004 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 1], 1 - pinsrb m1, [r3 + 2], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 6], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1005 * 16], m3 - - ; mode17 [row 23] - 
movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 22], 1 - pinsrb m0, [r3 + 23], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 12], 1 - pinsrb m2, [r3 + 14], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1006 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 2], 1 - pinsrb m1, [r3 + 4], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 5], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1007 * 16], m3 - - ; mode17 [row 24] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 23], 1 - pinsrb m0, [r3 + 25], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 14], 1 - pinsrb m2, [r3 + 15], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1008 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 4], 1 - pinsrb m1, [r3 + 5], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 4], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1009 * 16], m3 - - ; mode17 [row 25] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 25], 1 - pinsrb m0, [r3 + 26], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 15], 1 - pinsrb m2, [r3 + 16], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1010 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 5], 1 - pinsrb m1, [r3 + 6], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 3], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1011 * 16], m3 - - ; mode17 [row 26] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1012 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1013 * 16], m3 - - ; mode17 [row 27] - movu m6, [r5 + 8 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 
26], 1 - pinsrb m0, [r3 + 27], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 16], 1 - pinsrb m2, [r3 + 17], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1014 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 6], 1 - pinsrb m1, [r3 + 7], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 2], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1015 * 16], m3 - - ; mode17 [row 28] - movu m6, [r5 + 14 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 27], 1 - pinsrb m0, [r3 + 28], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 17], 1 - pinsrb m2, [r3 + 18], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1016 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 7], 1 - pinsrb m1, [r3 + 9], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 1], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1017 * 16], m3 - - ; mode17 [row 29] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 28], 1 - pinsrb m0, [r3 + 30], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 18], 1 - pinsrb m2, [r3 + 20], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1018 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 9], 1 - pinsrb m1, [r3 + 10], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrw m4, [r4 + 0], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1019 * 16], m3 - - ; mode17 [row 30] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r3 + 30], 1 - pinsrb m0, [r3 + 31], 0 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 20], 1 - pinsrb m2, [r3 + 21], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1020 * 16], m3 - - pslldq m1, 2 - pinsrb m1, [r3 + 10], 1 - pinsrb m1, [r3 + 11], 0 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pslldq m4, 2 - pinsrb 
m4, [r4 + 0], 1 - pinsrb m4, [r3 + 1], 0 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1021 * 16], m3 - - ; mode17 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1022 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1022 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1023 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1023 * 16 + 8], m5 - - ;mode 18[row 0] - movu m0, [r3] - movu [r0 + 1024 * 16], m0 - movu m1, [r3 + 16] - movu [r0 + 1025 * 16], m1 - - ;mode 18[row 1] - pslldq m0, 1 - pinsrb m0, [r4 + 1], 0 - movu [r0 + 1026 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 15], 0 - movu [r0 + 1027 * 16], m1 - - ;mode 18[row 2] - pslldq m0, 1 - pinsrb m0, [r4 + 2], 0 - movu [r0 + 1028 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 14], 0 - movu [r0 + 1029 * 16], m1 - - ;mode 18[row 3] - pslldq m0, 1 - pinsrb m0, [r4 + 3], 0 - movu [r0 + 1030 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 13], 0 - movu [r0 + 1031 * 16], m1 - - ;mode 18[row 4] - pslldq m0, 1 - pinsrb m0, [r4 + 4], 0 - movu [r0 + 1032 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 12], 0 - movu [r0 + 1033 * 16], m1 - - ;mode 18[row 5] - pslldq m0, 1 - pinsrb m0, [r4 + 5], 0 - movu [r0 + 1034 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 11], 0 - movu [r0 + 1035 * 16], m1 - - ;mode 18[row 6] - pslldq m0, 1 - pinsrb m0, [r4 + 6], 0 - movu [r0 + 1036 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 10], 0 - movu [r0 + 1037 * 16], m1 - - ;mode 18[row 7] - pslldq m0, 1 - pinsrb m0, [r4 + 7], 0 - movu [r0 + 1038 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 9], 0 - movu [r0 + 1039 * 16], m1 - - ;mode 18[row 8] - pslldq m0, 1 - pinsrb m0, [r4 + 8], 0 - movu [r0 + 1040 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 8], 0 - movu [r0 + 1041 * 16], m1 - - ;mode 18[row 9] - pslldq m0, 1 - pinsrb m0, [r4 + 9], 0 - movu [r0 + 1042 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 7], 0 - movu [r0 + 1043 * 16], m1 - - ;mode 18[row 10] - pslldq m0, 1 - pinsrb m0, [r4 + 10], 0 - movu [r0 + 1044 * 16], m0 - pslldq m1, 
1 - pinsrb m1, [r3 + 6], 0 - movu [r0 + 1045 * 16], m1 - - ;mode 18[row 11] - pslldq m0, 1 - pinsrb m0, [r4 + 11], 0 - movu [r0 + 1046 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 5], 0 - movu [r0 + 1047 * 16], m1 - - ;mode 18[row 12] - pslldq m0, 1 - pinsrb m0, [r4 + 12], 0 - movu [r0 + 1048 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 4], 0 - movu [r0 + 1049 * 16], m1 - - ;mode 18[row 13] - pslldq m0, 1 - pinsrb m0, [r4 + 13], 0 - movu [r0 + 1050 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 3], 0 - movu [r0 + 1051 * 16], m1 - - ;mode 18[row 14] - pslldq m0, 1 - pinsrb m0, [r4 + 14], 0 - movu [r0 + 1052 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 2], 0 - movu [r0 + 1053 * 16], m1 - - ;mode 18[row 15] - pslldq m0, 1 - pinsrb m0, [r4 + 15], 0 - movu [r0 + 1054 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 1], 0 - movu [r0 + 1055 * 16], m1 - - ;mode 18[row 16] - pslldq m0, 1 - pinsrb m0, [r4 + 16], 0 - movu [r0 + 1056 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r3 + 0], 0 - movu [r0 + 1057 * 16], m1 - - ;mode 18[row 17] - pslldq m0, 1 - pinsrb m0, [r4 + 17], 0 - movu [r0 + 1058 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 1], 0 - movu [r0 + 1059 * 16], m1 - - ;mode 18[row 18] - pslldq m0, 1 - pinsrb m0, [r4 + 18], 0 - movu [r0 + 1060 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 2], 0 - movu [r0 + 1061 * 16], m1 - - ;mode 18[row 19] - pslldq m0, 1 - pinsrb m0, [r4 + 19], 0 - movu [r0 + 1062 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 3], 0 - movu [r0 + 1063 * 16], m1 - - ;mode 18[row 20] - pslldq m0, 1 - pinsrb m0, [r4 + 20], 0 - movu [r0 + 1064 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 4], 0 - movu [r0 + 1065 * 16], m1 - - ;mode 18[row 21] - pslldq m0, 1 - pinsrb m0, [r4 + 21], 0 - movu [r0 + 1066 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 5], 0 - movu [r0 + 1067 * 16], m1 - - ;mode 18[row 22] - pslldq m0, 1 - pinsrb m0, [r4 + 22], 0 - movu [r0 + 1068 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 6], 0 - movu [r0 + 1069 * 16], m1 - - ;mode 18[row 23] - pslldq m0, 1 - pinsrb m0, 
[r4 + 23], 0 - movu [r0 + 1070 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 7], 0 - movu [r0 + 1071 * 16], m1 - - ;mode 18[row 24] - pslldq m0, 1 - pinsrb m0, [r4 + 24], 0 - movu [r0 + 1072 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 8], 0 - movu [r0 + 1073 * 16], m1 - - ;mode 18[row 25] - pslldq m0, 1 - pinsrb m0, [r4 + 25], 0 - movu [r0 + 1074 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 9], 0 - movu [r0 + 1075 * 16], m1 - - ;mode 18[row 26] - pslldq m0, 1 - pinsrb m0, [r4 + 26], 0 - movu [r0 + 1076 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 10], 0 - movu [r0 + 1077 * 16], m1 - - ;mode 18[row 27] - pslldq m0, 1 - pinsrb m0, [r4 + 27], 0 - movu [r0 + 1078 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 11], 0 - movu [r0 + 1079 * 16], m1 - - ;mode 18[row 28] - pslldq m0, 1 - pinsrb m0, [r4 + 28], 0 - movu [r0 + 1080 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 12], 0 - movu [r0 + 1081 * 16], m1 - - ;mode 18[row 29] - pslldq m0, 1 - pinsrb m0, [r4 + 29], 0 - movu [r0 + 1082 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 13], 0 - movu [r0 + 1083 * 16], m1 - - ;mode 18[row 30] - pslldq m0, 1 - pinsrb m0, [r4 + 30], 0 - movu [r0 + 1084 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 14], 0 - movu [r0 + 1085 * 16], m1 - - ;mode 18[row 31] - pslldq m0, 1 - pinsrb m0, [r4 + 31], 0 - movu [r0 + 1086 * 16], m0 - pslldq m1, 1 - pinsrb m1, [r4 + 15], 0 - movu [r0 + 1087 * 16], m1 - - ; mode 19 [row 0] - movu m6, [r5 + 6 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 1088 * 16], m1 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1089 * 16], m4 - - ; mode 19 [row 1] - movu m6, [r5 + 12 * 16] - pslldq m0, 2 - pinsrb m0, 
[r4 + 0], 1 - pinsrb m0, [r4 + 1], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1090 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1091 * 16], m4 - - ; mode 19 [row 2] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 1], 1 - pinsrb m0, [r4 + 2], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1092 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 14], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 22], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1093 * 16], m4 - - ; mode 19 [row 3] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 2], 1 - pinsrb m0, [r4 + 4], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1094 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 13], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 21], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1095 * 16], m4 - - ; mode 19 [row 4] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 4], 1 - pinsrb m0, [r4 + 5], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1096 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 12], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 20], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1097 * 16], m4 - - ; mode 19 [row 5] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 
- pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1098 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1099 * 16], m4 - - ; mode 19 [row 6] - movu m6, [r5 + 10 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 5], 1 - pinsrb m0, [r4 + 6], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1100 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 11], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 19], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1101 * 16], m4 - - ; mode 19 [row 7] - movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 6], 1 - pinsrb m0, [r4 + 7], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1102 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 10], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 18], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1103 * 16], m4 - - ; mode 19 [row 8] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 7], 1 - pinsrb m0, [r4 + 9], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1104 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 9], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 17], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1105 * 16], m4 - - ; mode 19 [row 9] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 9], 1 - pinsrb m0, [r4 + 10], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1106 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 8], 0 - pmaddubsw 
m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 16], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1107 * 16], m4 - - ; mode 19 [row 10] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1108 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1109 * 16], m4 - - ; mode 19 [row 11] - movu m6, [r5 + 8 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 10], 1 - pinsrb m0, [r4 + 11], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 0], 1 - pinsrb m2, [r4 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1110 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 7], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 15], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1111 * 16], m4 - - ; mode 19 [row 12] - movu m6, [r5 + 14 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 11], 1 - pinsrb m0, [r4 + 12], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 1], 1 - pinsrb m2, [r4 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1112 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 6], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 14], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1113 * 16], m4 - - ; mode 19 [row 13] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 12], 1 - pinsrb m0, [r4 + 14], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 2], 1 - pinsrb m2, [r4 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1114 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 5], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 13], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 
1115 * 16], m4 - - ; mode 19 [row 14] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 14], 1 - pinsrb m0, [r4 + 15], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 4], 1 - pinsrb m2, [r4 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1116 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 4], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 12], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1117 * 16], m4 - - ; mode19 [row 15] - pshufb m5, m0, [tab_S2] - movh [r0 + 1118 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1118 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1119 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1119 * 16 + 8], m5 - - ; mode 19 [row 16] - movu m6, [r5 + 6 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 15], 1 - pinsrb m0, [r4 + 16], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 5], 1 - pinsrb m2, [r4 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1120 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 3], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 11], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1121 * 16], m4 - - ; mode 19 [row 17] - movu m6, [r5 + 12 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 16], 1 - pinsrb m0, [r4 + 17], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 6], 1 - pinsrb m2, [r4 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1122 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 2], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 10], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1123 * 16], m4 - - ; mode 19 [row 18] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 17], 1 - pinsrb m0, [r4 + 18], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, 
[r4 + 7], 1 - pinsrb m2, [r4 + 9], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1124 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 1], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 9], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1125 * 16], m4 - - ; mode 19 [row 19] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 18], 1 - pinsrb m0, [r4 + 20], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 9], 1 - pinsrb m2, [r4 + 10], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1126 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 0], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 8], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1127 * 16], m4 - - ; mode 19 [row 20] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 20], 1 - pinsrb m0, [r4 + 21], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 10], 1 - pinsrb m2, [r4 + 11], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1128 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 0], 1 - pinsrb m1, [r4 + 1], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrb m3, [r3 + 8], 1 - pinsrb m3, [r3 + 7], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1129 * 16], m4 - - ; mode 19 [row 21] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1130 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1131 * 16], m4 - - ; mode 19 [row 22] - movu m6, [r5 + 10 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 21], 1 - pinsrb m0, [r4 + 22], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 11], 1 - pinsrb m2, [r4 + 12], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb 
m4, m5 - movu [r0 + 1132 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 1], 1 - pinsrb m1, [r4 + 2], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 6], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1133 * 16], m4 - - ; mode 19 [row 23] - movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 22], 1 - pinsrb m0, [r4 + 23], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 12], 1 - pinsrb m2, [r4 + 14], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1134 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 2], 1 - pinsrb m1, [r4 + 4], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 5], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1135 * 16], m4 - - ; mode 19 [row 24] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 23], 1 - pinsrb m0, [r4 + 25], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 14], 1 - pinsrb m2, [r4 + 15], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1136 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 4], 1 - pinsrb m1, [r4 + 5], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 4], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1137 * 16], m4 - - ; mode 19 [row 25] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 25], 1 - pinsrb m0, [r4 + 26], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 15], 1 - pinsrb m2, [r4 + 16], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1138 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 5], 1 - pinsrb m1, [r4 + 6], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 3], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1139 * 16], m4 - - ; mode 19 [row 26] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - 
pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1140 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1141 * 16], m4 - - ; mode 19 [row 27] - movu m6, [r5 + 8 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 26], 1 - pinsrb m0, [r4 + 27], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 16], 1 - pinsrb m2, [r4 + 17], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1142 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 6], 1 - pinsrb m1, [r4 + 7], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 2], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1143 * 16], m4 - - ; mode 19 [row 28] - movu m6, [r5 + 14 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 27], 1 - pinsrb m0, [r4 + 28], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 17], 1 - pinsrb m2, [r4 + 18], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1144 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 7], 1 - pinsrb m1, [r4 + 9], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 1], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1145 * 16], m4 - - ; mode 19 [row 29] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 28], 1 - pinsrb m0, [r4 + 30], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 18], 1 - pinsrb m2, [r4 + 20], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1146 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 9], 1 - pinsrb m1, [r4 + 10], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 0], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1147 * 16], m4 - - ; mode 19 [row 30] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 30], 1 - pinsrb m0, [r4 + 31], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, 
m7 - pslldq m2, 2 - pinsrb m2, [r4 + 20], 1 - pinsrb m2, [r4 + 21], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1148 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 10], 1 - pinsrb m1, [r4 + 11], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrb m3, [r4 + 0], 1 - pinsrb m3, [r4 + 1], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1149 * 16], m4 - - ; mode19 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1150 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1150 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1151 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1151 * 16 + 8], m5 - - ; mode 20 [row 0] - movu m6, [r5 + 11 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 1152 * 16], m1 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1153 * 16], m4 - - ; mode 20 [row 1] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r4 + 2], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1154 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1155 * 16], m4 - - ; mode 20 [row 2] - movu m6, [r5 + 1 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1156 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1157 * 16], 
m4 - - ; mode 20 [row 3] - movu m6, [r5 + 12 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 2], 1 - pinsrb m0, [r4 + 3], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1158 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 14], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 22], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1159 * 16], m4 - - ; mode 20 [row 4] - movu m6, [r5 + 23 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 3], 1 - pinsrb m0, [r4 + 5], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1160 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 13], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 21], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1161 * 16], m4 - - ; mode 20 [row 5] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1162 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1163 * 16], m4 - - ; mode 20 [row 6] - movu m6, [r5 + 13 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 5], 1 - pinsrb m0, [r4 + 6], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1164 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 12], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 20], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1165 * 16], m4 - - ; mode 20 [row 7] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 6], 1 - pinsrb m0, [r4 + 8], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 3], 0 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1166 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 11], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 19], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1167 * 16], m4 - - ; mode 20 [row 8] - movu m6, [r5 + 3 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1168 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1169 * 16], m4 - - ; mode 20 [row 9] - movu m6, [r5 + 14 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 8], 1 - pinsrb m0, [r4 + 9], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 3], 1 - pinsrb m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1170 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 10], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 18], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1171 * 16], m4 - - ; mode 20 [row 10] - movu m6, [r5 + 25 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 9], 1 - pinsrb m0, [r4 + 11], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1172 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 9], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 17], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1173 * 16], m4 - - ; mode 20 [row 11] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1174 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1175 * 16], m4 - - ; mode 20 [row 12] - movu m6, [r5 + 15 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 11], 1 - pinsrb m0, [r4 + 
12], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r3 + 1], 1 - pinsrb m2, [r3 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1176 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 8], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 16], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1177 * 16], m4 - - ; mode 20 [row 13] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 12], 1 - pinsrb m0, [r4 + 14], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 0], 1 - pinsrb m2, [r4 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1178 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 7], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 15], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1179 * 16], m4 - - ; mode 20 [row 14] - movu m6, [r5 + 5 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1180 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1181 * 16], m4 - - ; mode 20 [row 15] - movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 14], 1 - pinsrb m0, [r4 + 15], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 2], 1 - pinsrb m2, [r4 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1182 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 6], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 14], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1183 * 16], m4 - - ; mode 20 [row 16] - movu m6, [r5 + 27 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 15], 1 - pinsrb m0, [r4 + 17], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 3], 1 - pinsrb m2, [r4 + 5], 0 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1184 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 5], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 13], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1185 * 16], m4 - - ; mode 20 [row 17] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1186 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1187 * 16], m4 - - ; mode 20 [row 18] - movu m6, [r5 + 17 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 17], 1 - pinsrb m0, [r4 + 18], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 5], 1 - pinsrb m2, [r4 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1188 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 4], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 12], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1189 * 16], m4 - - ; mode 20 [row 19] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 18], 1 - pinsrb m0, [r4 + 20], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 6], 1 - pinsrb m2, [r4 + 8], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1190 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 3], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 11], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1191 * 16], m4 - - ; mode 20 [row 20] - movu m6, [r5 + 7 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1192 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1193 * 16], m4 - - ; mode 20 [row 21] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 
20], 1 - pinsrb m0, [r4 + 21], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 8], 1 - pinsrb m2, [r4 + 9], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1194 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 2], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 10], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1195 * 16], m4 - - ; mode 20 [row 22] - movu m6, [r5 + 29 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 21], 1 - pinsrb m0, [r4 + 23], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 9], 1 - pinsrb m2, [r4 + 11], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1196 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 1], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 9], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1197 * 16], m4 - - ; mode 20 [row 23] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1198 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1199 * 16], m4 - - ; mode 20 [row 24] - movu m6, [r5 + 19 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 23], 1 - pinsrb m0, [r4 + 24], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 11], 1 - pinsrb m2, [r4 + 12], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1200 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 0], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 8], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1201 * 16], m4 - - ; mode 20 [row 25] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 24], 1 - pinsrb m0, [r4 + 26], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 12], 1 - pinsrb m2, [r4 + 14], 0 
- pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1202 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 0], 1 - pinsrb m1, [r4 + 2], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 7], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1203 * 16], m4 - - ; mode 20 [row 26] - movu m6, [r5 + 9 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1204 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1205 * 16], m4 - - ; mode 20 [row 27] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 26], 1 - pinsrb m0, [r4 + 27], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 14], 1 - pinsrb m2, [r4 + 15], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1206 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 2], 1 - pinsrb m1, [r4 + 3], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 6], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1207 * 16], m4 - - ; mode 20 [row 28] - movu m6, [r5 + 31 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 27], 1 - pinsrb m0, [r4 + 29], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 15], 1 - pinsrb m2, [r4 + 17], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1208 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 3], 1 - pinsrb m1, [r4 + 5], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 5], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1209 * 16], m4 - - ; mode 20 [row 29] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1210 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 
+ 1211 * 16], m4 - - ; mode 20 [row 30] - movu m6, [r5 + 21 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 29], 1 - pinsrb m0, [r4 + 30], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 17], 1 - pinsrb m2, [r4 + 18], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1212 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r4 + 5], 1 - pinsrb m1, [r4 + 6], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 4], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1213 * 16], m4 - - ; mode20 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1214 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1214 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1215 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1215 * 16 + 8], m5 - - ; mode 21 [row 0] - movu m6, [r5 + 15 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 1216 * 16], m1 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1217 * 16], m4 - - ; mode 21 [row 1] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r4 + 2], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1218 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1219 * 16], m4 - - ; mode 21 [row 2] - movu m6, [r5 + 13 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu 
[r0 + 1220 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1221 * 16], m4 - - ; mode 21 [row 3] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 2], 1 - pinsrb m0, [r4 + 4], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1222 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 14], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 22], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1223 * 16], m4 - - ; mode 21 [row 4] - movu m6, [r5 + 11 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1224 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1225 * 16], m4 - - ; mode 21 [row 5] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 4], 1 - pinsrb m0, [r4 + 6], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1226 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 13], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 21], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1227 * 16], m4 - - ; mode 21 [row 6] - movu m6, [r5 + 9 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1228 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1229 * 16], m4 - - ; mode 21 [row 7] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 6], 1 - pinsrb m0, [r4 + 8], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu 
[r0 + 1230 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 12], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 20], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1231 * 16], m4 - - ; mode 21 [row 8] - movu m6, [r5 + 7 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1232 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1233 * 16], m4 - - ; mode 21 [row 9] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 8], 1 - pinsrb m0, [r4 + 9], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1234 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 11], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 19], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1235 * 16], m4 - - ; mode 21 [row 10] - movu m6, [r5 + 5 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1236 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1237 * 16], m4 - - ; mode 21 [row 11] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 9], 1 - pinsrb m0, [r4 + 11], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1238 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 10], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 18], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1239 * 16], m4 - - ; mode 21 [row 12] - movu m6, [r5 + 3 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1240 * 16], m4 - 
pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1241 * 16], m4 - - ; mode 21 [row 13] - movu m6, [r5 + 18 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 11], 1 - pinsrb m0, [r4 + 13], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1242 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 9], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 17], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1243 * 16], m4 - - ; mode 21 [row 14] - movu m6, [r5 + 1 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1244 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1245 * 16], m4 - - ; mode 21 [row 15] - movu m6, [r5 + 16 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 13], 1 - pinsrb m0, [r4 + 15], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1246 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 8], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 16], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1247 * 16], m4 - - ; mode 21 [row 16] - movu m6, [r5 + 31 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 15], 1 - pinsrb m0, [r4 + 17], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 0], 1 - pinsrb m2, [r4 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1248 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 7], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 15], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1249 * 16], m4 - - ; mode 21 [row 17] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, 
m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1250 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1251 * 16], m4 - - ; mode 21 [row 18] - movu m6, [r5 + 29 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 17], 1 - pinsrb m0, [r4 + 19], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 2], 1 - pinsrb m2, [r4 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1252 * 16], m4 - pslldq m1, 2 - pinsrb m1, [r3 + 7], 1 - pinsrb m1, [r3 + 6], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrb m3, [r3 + 15], 1 - pinsrb m3, [r3 + 14], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1253 * 16], m4 - - ; mode 21 [row 19] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1254 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1255 * 16], m4 - - ; mode 21 [row 20] - movu m6, [r5 + 27 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 19], 1 - pinsrb m0, [r4 + 21], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 4], 1 - pinsrb m2, [r4 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1256 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 5], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 13], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1257 * 16], m4 - - ; mode 21 [row 21] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1258 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1259 * 16], m4 - - ; mode 21 [row 22] - movu m6, [r5 + 25 * 16] - pslldq m0, 
2 - pinsrb m0, [r4 + 21], 1 - pinsrb m0, [r4 + 23], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 6], 1 - pinsrb m2, [r4 + 8], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1260 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 4], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 12], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1261 * 16], m4 - - ; mode 21 [row 23] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1262 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1263 * 16], m4 - - ; mode 21 [row 24] - movu m6, [r5 + 23 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 23], 1 - pinsrb m0, [r4 + 24], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 8], 1 - pinsrb m2, [r4 + 9], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1264 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 3], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 11], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1265 * 16], m4 - - ; mode 21 [row 25] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1266 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1267 * 16], m4 - - ; mode 21 [row 26] - movu m6, [r5 + 21 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 24], 1 - pinsrb m0, [r4 + 26], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 9], 1 - pinsrb m2, [r4 + 11], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1268 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 2], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw 
m3, [r3 + 10], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1269 * 16], m4 - - ; mode 21 [row 27] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1270 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1271 * 16], m4 - - ; mode 21 [row 28] - movu m6, [r5 + 19 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 26], 1 - pinsrb m0, [r4 + 28], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 11], 1 - pinsrb m2, [r4 + 13], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1272 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 1], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 9], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1273 * 16], m4 - - ; mode 21 [row 29] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1274 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1275 * 16], m4 - - ; mode 21 [row 30] - movu m6, [r5 + 17 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 28], 1 - pinsrb m0, [r4 + 30], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 13], 1 - pinsrb m2, [r4 + 15], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1276 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 0], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 8], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1277 * 16], m4 - - ; mode21 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1278 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1278 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1279 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1279 * 16 + 8], m5 - - 
; mode 22 [row 0] - movu m6, [r5 + 19 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 1280 * 16], m1 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1281 * 16], m4 - - ; mode 22 [row 1] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1282 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1283 * 16], m4 - - ; mode 22 [row 2] - movu m6, [r5 + 25 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r4 + 2], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1284 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1285 * 16], m4 - - ; mode 22 [row 3] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1286 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1287 * 16], m4 - - ; mode 22 [row 4] - movu m6, [r5 + 31 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 2], 1 - pinsrb m0, [r4 + 5], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1288 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 14], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, 
m7 - pslldq m3, 2 - pinsrw m3, [r3 + 22], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1289 * 16], m4 - - ; mode 22 [row 5] - movu m6, [r5 + 18 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1290 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1291 * 16], m4 - - ; mode 22 [row 6] - movu m6, [r5 + 5 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1292 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1293 * 16], m4 - - ; mode 22 [row 7] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 5], 1 - pinsrb m0, [r4 + 7], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1294 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 13], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 21], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1295 * 16], m4 - - ; mode 22 [row 8] - movu m6, [r5 + 11 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1296 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1297 * 16], m4 - - ; mode 22 [row 9] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 7], 1 - pinsrb m0, [r4 + 10], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1298 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 12], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 20], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - 
movu [r0 + 1299 * 16], m4 - - ; mode 22 [row 10] - movu m6, [r5 + 17 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1300 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1301 * 16], m4 - - ; mode 22 [row 11] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1302 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1303 * 16], m4 - - ; mode 22 [row 12] - movu m6, [r5 + 23 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 10], 1 - pinsrb m0, [r4 + 12], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1304 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 11], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 19], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1305 * 16], m4 - - ; mode 22 [row 13] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1306 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1307 * 16], m4 - - ; mode 22 [row 14] - movu m6, [r5 + 29 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 12], 1 - pinsrb m0, [r4 + 15], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1308 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 10], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 18], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1309 * 16], m4 - - ; mode 22 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m4, m0, m6 
- pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1310 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1311 * 16], m4 - - ; mode 22 [row 16] - movu m6, [r5 + 3 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1312 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1313 * 16], m4 - - ; mode 22 [row 17] - movu m6, [r5 + 22 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 15], 1 - pinsrb m0, [r4 + 17], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1314 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 9], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 17], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1315 * 16], m4 - - ; mode 22 [row 18] - movu m6, [r5 + 9 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1316 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1317 * 16], m4 - - ; mode 22 [row 19] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 17], 1 - pinsrb m0, [r4 + 20], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1318 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 8], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 16], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1319 * 16], m4 - - ; mode 22 [row 20] - movu m6, [r5 + 15 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1320 * 
16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1321 * 16], m4 - - ; mode 22 [row 21] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1322 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1323 * 16], m4 - - ; mode 22 [row 22] - movu m6, [r5 + 21 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 20], 1 - pinsrb m0, [r4 + 22], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 0], 1 - pinsrb m2, [r4 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1324 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 7], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 15], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1325 * 16], m4 - - ; mode 22 [row 23] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1326 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1327 * 16], m4 - - ; mode 22 [row 24] - movu m6, [r5 + 27 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 22], 1 - pinsrb m0, [r4 + 25], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 2], 1 - pinsrb m2, [r4 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1328 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 6], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 14], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1329 * 16], m4 - - ; mode 22 [row 25] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1330 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - 
pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1331 * 16], m4 - - ; mode 22 [row 26] - movu m6, [r5 + 1 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1332 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1333 * 16], m4 - - ; mode 22 [row 27] - movu m6, [r5 + 20 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 25], 1 - pinsrb m0, [r4 + 27], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 5], 1 - pinsrb m2, [r4 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1334 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 5], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 13], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1335 * 16], m4 - - ; mode 22 [row 28] - movu m6, [r5 + 7 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1336 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1337 * 16], m4 - - ; mode 22 [row 29] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 27], 1 - pinsrb m0, [r4 + 30], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrb m2, [r4 + 7], 1 - pinsrb m2, [r4 + 10], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1338 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 4], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 12], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1339 * 16], m4 - - ; mode 22 [row 30] - movu m6, [r5 + 13 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1340 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb 
m4, m5 - movu [r0 + 1341 * 16], m4 - - ; mode22 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1342 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1342 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1343 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1343 * 16 + 8], m5 - - ; mode 23 [row 0] - movu m6, [r5 + 23 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m1, m0, m6 - pmulhrsw m1, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m3, m2, m6 - pmulhrsw m3, m7 - packuswb m1, m3 - movu [r0 + 1344 * 16], m1 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1345 * 16], m4 - - ; mode 23 [row 1] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1346 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1347 * 16], m4 - - ; mode 23 [row 2] - movu m6, [r5 + 5 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1348 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1349 * 16], m4 - - ; mode 23 [row 3] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r4 + 4], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1350 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1351 * 16], m4 - - ; mode 23 [row 4] - movu m6, [r5 + 19 * 16] - pmaddubsw m4, m0, m6 - 
pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1352 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1353 * 16], m4 - - ; mode 23 [row 5] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1354 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1355 * 16], m4 - - ; mode 23 [row 6] - movu m6, [r5 + 1 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1356 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1357 * 16], m4 - - ; mode 23 [row 7] - movu m6, [r5 + 24 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 4], 1 - pinsrb m0, [r4 + 7], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 6], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1358 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 14], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 22], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1359 * 16], m4 - - ; mode 23 [row 8] - movu m6, [r5 + 15 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1360 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1361 * 16], m4 - - ; mode 23 [row 9] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1362 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1363 * 16], m4 - - ; mode 23 [row 10] - movu m6, [r5 + 29 * 16] - pslldq m0, 2 
- pinsrb m0, [r4 + 7], 1 - pinsrb m0, [r4 + 11], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1364 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 13], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 21], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1365 * 16], m4 - - ; mode 23 [row 11] - movu m6, [r5 + 20 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1366 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1367 * 16], m4 - - ; mode 23 [row 12] - movu m6, [r5 + 11 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1368 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1369 * 16], m4 - - ; mode 23 [row 13] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1370 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1371 * 16], m4 - - ; mode 23 [row 14] - movu m6, [r5 + 25 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 11], 1 - pinsrb m0, [r4 + 14], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1372 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 12], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 20], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1373 * 16], m4 - - ; mode 23 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 
1374 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1375 * 16], m4 - - ; mode 23 [row 16] - movu m6, [r5 + 7 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1376 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1377 * 16], m4 - - ; mode 23 [row 17] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 14], 1 - pinsrb m0, [r4 + 18], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 3], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1378 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 11], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 19], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1379 * 16], m4 - - ; mode 23 [row 18] - movu m6, [r5 + 21 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1380 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1381 * 16], m4 - - ; mode 23 [row 19] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1382 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1383 * 16], m4 - - ; mode 23 [row 20] - movu m6, [r5 + 3 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1384 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1385 * 16], m4 - - ; mode 23 [row 21] - movu m6, [r5 + 26 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 18], 1 - pinsrb m0, [r4 + 21], 0 - pmaddubsw m4, m0, m6 - 
pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 2], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1386 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 10], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 18], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1387 * 16], m4 - - ; mode 23 [row 22] - movu m6, [r5 + 17 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1388 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1389 * 16], m4 - - ; mode 23 [row 23] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1390 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1391 * 16], m4 - - ; mode 23 [row 24] - movu m6, [r5 + 31 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 21], 1 - pinsrb m0, [r4 + 25], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 1], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1392 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 9], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 17], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1393 * 16], m4 - - ; mode 23 [row 25] - movu m6, [r5 + 22 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1394 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1395 * 16], m4 - - ; mode 23 [row 26] - movu m6, [r5 + 13 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1396 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, 
m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1397 * 16], m4 - - ; mode 23 [row 27] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1398 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1399 * 16], m4 - - ; mode 23 [row 28] - movu m6, [r5 + 27 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 25], 1 - pinsrb m0, [r4 + 28], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 0], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1400 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 8], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 16], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1401 * 16], m4 - - ; mode 23 [row 29] - movu m6, [r5 + 18 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1402 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1403 * 16], m4 - - ; mode 23 [row 30] - movu m6, [r5 + 9 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1404 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1405 * 16], m4 - - ; mode23 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1406 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1406 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1407 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1407 * 16 + 8], m5 - - ; mode 24 [row 0] - movu m6, [r5 + 27 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 
1408 * 16], m4 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1409 * 16], m4 - - ; mode 24 [row 1] - movu m6, [r5 + 22 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1410 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1411 * 16], m4 - - ; mode 24 [row 2] - movu m6, [r5 + 17 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1412 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1413 * 16], m4 - - ; mode 24 [row 3] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1414 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1415 * 16], m4 - - ; mode 24 [row 4] - movu m6, [r5 + 7 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1416 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1417 * 16], m4 - - ; mode 24 [row 5] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1418 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1419 * 16], m4 - - ; mode 24 [row 6] - movu m6, [r5 + 29 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r4 + 6], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, 
m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1420 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1421 * 16], m4 - - ; mode 24 [row 7] - movu m6, [r5 + 24 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1422 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1423 * 16], m4 - - ; mode 24 [row 8] - movu m6, [r5 + 19 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1424 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1425 * 16], m4 - - ; mode 24 [row 9] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1426 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1427 * 16], m4 - - ; mode 24 [row 10] - movu m6, [r5 + 9 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1428 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1429 * 16], m4 - - ; mode 24 [row 11] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1430 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1431 * 16], m4 - - ; mode 24 [row 12] - movu m6, [r5 + 31 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 6], 1 - pinsrb m0, [r4 + 13], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 6], 0 - 
pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1432 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 14], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 22], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1433 * 16], m4 - - ; mode 24 [row 13] - movu m6, [r5 + 26 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1434 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1435 * 16], m4 - - ; mode 24 [row 14] - movu m6, [r5 + 21 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1436 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1437 * 16], m4 - - ; mode 24 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1438 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1439 * 16], m4 - - ; mode 24 [row 16] - movu m6, [r5 + 11 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1440 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1441 * 16], m4 - - ; mode 24 [row 17] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1442 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1443 * 16], m4 - - ; mode 24 [row 18] - movu m6, [r5 + 1 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1444 * 16], m4 - 
pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1445 * 16], m4 - - ; mode 24 [row 19] - movu m6, [r5 + 28 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 13], 1 - pinsrb m0, [r4 + 19], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 5], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1446 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 13], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 21], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1447 * 16], m4 - - ; mode 24 [row 20] - movu m6, [r5 + 23 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1448 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1449 * 16], m4 - - ; mode 24 [row 21] - movu m6, [r5 + 18 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1450 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1451 * 16], m4 - - ; mode 24 [row 22] - movu m6, [r5 + 13 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1452 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1453 * 16], m4 - - ; mode 24 [row 23] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1454 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1455 * 16], m4 - - ; mode 24 [row 24] - movu m6, [r5 + 3 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 
1456 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1457 * 16], m4 - - ; mode 24 [row 25] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 19], 1 - pinsrb m0, [r4 + 26], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 4], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1458 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 12], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 20], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1459 * 16], m4 - - ; mode 24 [row 26] - movu m6, [r5 + 25 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1460 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1461 * 16], m4 - - ; mode 24 [row 27] - movu m6, [r5 + 20 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1462 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1463 * 16], m4 - - ; mode 24 [row 28] - movu m6, [r5 + 15 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1464 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1465 * 16], m4 - - ; mode 24 [row 29] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1466 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1467 * 16], m4 - - ; mode 24 [row 30] - movu m6, [r5 + 5 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb 
m4, m5 - movu [r0 + 1468 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1469 * 16], m4 - - ; mode 24 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1470 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1470 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1471 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1471 * 16 + 8], m5 - - ; mode 25 [row 0] - movu m6, [r5 + 30 * 16] - movu m0, [r3 ] - movu m1, [r3 + 1 ] - punpcklbw m0, m1 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - movu m2, [r3 + 8] - movu m3, [r3 + 9] - punpcklbw m2, m3 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1472 * 16], m4 - - movu m1, [r3 + 16] - movu m3, [r3 + 17] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 24] - movu m5, [r3 + 25] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1473 * 16], m4 - - ; mode 25 [row 1] - movu m6, [r5 + 28 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1474 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1475 * 16], m4 - - ; mode 25 [row 2] - movu m6, [r5 + 26 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1476 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1477 * 16], m4 - - ; mode 25 [row 3] - movu m6, [r5 + 24 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1478 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1479 * 16], m4 - - ; mode 25 [row 4] - movu m6, [r5 + 22 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - 
packuswb m4, m5 - movu [r0 + 1480 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1481 * 16], m4 - - ; mode 25 [row 5] - movu m6, [r5 + 20 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1482 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1483 * 16], m4 - - ; mode 25 [row 6] - movu m6, [r5 + 18 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1484 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1485 * 16], m4 - - ; mode 25 [row 7] - movu m6, [r5 + 16 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1486 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1487 * 16], m4 - - ; mode 25 [row 8] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1488 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1489 * 16], m4 - - ; mode 25 [row 9] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1490 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1491 * 16], m4 - - ; mode 25 [row 10] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1492 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1493 * 16], m4 - 
- ; mode 25 [row 11] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1494 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1495 * 16], m4 - - ; mode 25 [row 12] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1496 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1497 * 16], m4 - - ; mode 25 [row 13] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1498 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1499 * 16], m4 - - ; mode 25 [row 14] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1500 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1501 * 16], m4 - - ; mode 25 [row 15] - pshufb m5, m0, [tab_S2] - movh [r0 + 1502 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1502 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1503 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1503 * 16 + 8], m5 - - ; mode 25 [row 16] - movu m6, [r5 + 30 * 16] - pslldq m0, 2 - pinsrb m0, [r4 + 0], 1 - pinsrb m0, [r4 + 16], 0 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pslldq m2, 2 - pinsrw m2, [r3 + 7], 0 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1504 * 16], m4 - pslldq m1, 2 - pinsrw m1, [r3 + 15], 0 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pslldq m3, 2 - pinsrw m3, [r3 + 23], 0 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1505 * 16], m4 - - ; mode 25 [row 17] - movu m6, [r5 + 28 * 
16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1506 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1507 * 16], m4 - - ; mode 25 [row 18] - movu m6, [r5 + 26 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1508 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1509 * 16], m4 - - ; mode 25 [row 19] - movu m6, [r5 + 24 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1510 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1511 * 16], m4 - - ; mode 25 [row 20] - movu m6, [r5 + 22 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1512 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1513 * 16], m4 - - ; mode 25 [row 21] - movu m6, [r5 + 20 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1514 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1515 * 16], m4 - - ; mode 25 [row 22] - movu m6, [r5 + 18 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1516 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1517 * 16], m4 - - ; mode 25 [row 23] - movu m6, [r5 + 16 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1518 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, 
m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1519 * 16], m4 - - ; mode 25 [row 24] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1520 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1521 * 16], m4 - - ; mode 25 [row 25] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1522 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1523 * 16], m4 - - ; mode 25 [row 26] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1524 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1525 * 16], m4 - - ; mode 25 [row 27] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1526 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1527 * 16], m4 - - ; mode 25 [row 28] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1528 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1529 * 16], m4 - - ; mode 25 [row 29] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1530 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1531 * 16], m4 - - ; mode 25 [row 30] - movu m6, [r5 + 2 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw 
m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1532 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1533 * 16], m4 - - ; mode 25 [row 31] - pshufb m5, m0, [tab_S2] - movh [r0 + 1534 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1534 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1535 * 16], m5 - pshufb m5, m3, [tab_S2] - movh [r0 + 1535 * 16 + 8], m5 - - ; mode 26 - movu m1, [r1 + 1] - movu m2, [r1 + 17] - movu [r0 + 1536 * 16], m1 - movu [r0 + 1537 * 16], m2 - movu [r0 + 1538 * 16], m1 - movu [r0 + 1539 * 16], m2 - movu [r0 + 1540 * 16], m1 - movu [r0 + 1541 * 16], m2 - movu [r0 + 1542 * 16], m1 - movu [r0 + 1543 * 16], m2 - movu [r0 + 1544 * 16], m1 - movu [r0 + 1545 * 16], m2 - movu [r0 + 1546 * 16], m1 - movu [r0 + 1547 * 16], m2 - movu [r0 + 1548 * 16], m1 - movu [r0 + 1549 * 16], m2 - movu [r0 + 1550 * 16], m1 - movu [r0 + 1551 * 16], m2 - - movu [r0 + 1552 * 16], m1 - movu [r0 + 1553 * 16], m2 - movu [r0 + 1554 * 16], m1 - movu [r0 + 1555 * 16], m2 - movu [r0 + 1556 * 16], m1 - movu [r0 + 1557 * 16], m2 - movu [r0 + 1558 * 16], m1 - movu [r0 + 1559 * 16], m2 - movu [r0 + 1560 * 16], m1 - movu [r0 + 1561 * 16], m2 - movu [r0 + 1562 * 16], m1 - movu [r0 + 1563 * 16], m2 - movu [r0 + 1564 * 16], m1 - movu [r0 + 1565 * 16], m2 - movu [r0 + 1566 * 16], m1 - movu [r0 + 1567 * 16], m2 - - movu [r0 + 1568 * 16], m1 - movu [r0 + 1569 * 16], m2 - movu [r0 + 1570 * 16], m1 - movu [r0 + 1571 * 16], m2 - movu [r0 + 1572 * 16], m1 - movu [r0 + 1573 * 16], m2 - movu [r0 + 1574 * 16], m1 - movu [r0 + 1575 * 16], m2 - movu [r0 + 1576 * 16], m1 - movu [r0 + 1577 * 16], m2 - movu [r0 + 1578 * 16], m1 - movu [r0 + 1579 * 16], m2 - movu [r0 + 1580 * 16], m1 - movu [r0 + 1581 * 16], m2 - movu [r0 + 1582 * 16], m1 - movu [r0 + 1583 * 16], m2 - - movu [r0 + 1584 * 16], m1 - movu [r0 + 1585 * 16], m2 - movu [r0 + 1586 * 16], m1 - movu [r0 + 1587 * 16], m2 - movu [r0 + 
1588 * 16], m1 - movu [r0 + 1589 * 16], m2 - movu [r0 + 1590 * 16], m1 - movu [r0 + 1591 * 16], m2 - movu [r0 + 1592 * 16], m1 - movu [r0 + 1593 * 16], m2 - movu [r0 + 1594 * 16], m1 - movu [r0 + 1595 * 16], m2 - movu [r0 + 1596 * 16], m1 - movu [r0 + 1597 * 16], m2 - movu [r0 + 1598 * 16], m1 - movu [r0 + 1599 * 16], m2 - - ; mode 27 [row 0] - movu m6, [r5 + 2 * 16] - movu m0, [r3 + 1 ] - movu m1, [r3 + 2 ] - punpcklbw m0, m1 - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - movu m2, [r3 + 9] - movu m3, [r3 + 10] - punpcklbw m2, m3 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1600 * 16], m4 - - movu m1, [r3 + 17] - movu m3, [r3 + 18] - punpcklbw m1, m3 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - movu m3, [r3 + 25] - movu m5, [r3 + 26] - punpcklbw m3, m5 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1601 * 16], m4 - - ; mode 27 [row 1] - movu m6, [r5 + 4 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1602 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1603 * 16], m4 - - ; mode 27 [row 2] - movu m6, [r5 + 6 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1604 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1605 * 16], m4 - - ; mode 27 [row 3] - movu m6, [r5 + 8 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1606 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1607 * 16], m4 - - ; mode 27 [row 4] - movu m6, [r5 + 10 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1608 * 16], m4 - - ; mode 28 [row 1 -first half] - movu 
[r0 + 1666 * 16], m4 - - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1609 * 16], m4 - - ; mode 28 [row 1 - second half] - movu [r0 + 1667 * 16], m4 - - ; mode 27 [row 5] - movu m6, [r5 + 12 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1610 * 16], m4 - - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1611 * 16], m4 - - ; mode 27 [row 6] - movu m6, [r5 + 14 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1612 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1613 * 16], m4 - - ; mode 27 [row 7] - movu m6, [r5 + 16 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1614 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1615 * 16], m4 - - ; mode 27 [row 8] - movu m6, [r5 + 18 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1616 * 16], m4 - - ; mode 29 [row 1 - first half] - movu [r0 + 1730 * 16], m4 - - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1617 * 16], m4 - - ; mode 29 [row 1 - second half] - movu [r0 + 1731 * 16], m4 - - ; mode 27 [row 9] - movu m6, [r5 + 20 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1618 * 16], m4 - - ; mode 28 [row 3 -first half] - movu [r0 + 1670 * 16], m4 - - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1619 * 16], m4 - - ; mode 28 [row 3 -second half] - movu [r0 + 1671 * 16], m4 - - ; 
mode 27 [row 10] - movu m6, [r5 + 22 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1620 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1621 * 16], m4 - - ; mode 27 [row 11] - movu m6, [r5 + 24 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1622 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1623 * 16], m4 - - ; mode 27 [row 12] - movu m6, [r5 + 26 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1624 * 16], m4 - - ; mode 30 [row 1 - first half] - movu [r0 + 1794 * 16], m4 - - ; mode 33 [row 0 - first half] - movu [r0 + 1984 * 16], m4 - - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1625 * 16], m4 - - ; mode 30 [row 1 - second half] - movu [r0 + 1795 * 16], m4 - - ; mode 33 [row 0 - second half] - movu [r0 + 1985 * 16], m4 - - ; mode 27 [row 13] - movu m6, [r5 + 28 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1626 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1627 * 16], m4 - - ; mode 27 [row 14] - movu m6, [r5 + 30 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1628 * 16], m4 - - ; mode 28 [row 5 first half] - movu [r0 + 1674 * 16], m4 - - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1629 * 16], m4 - - ; mode 28 [row 5 second half] - movu [r0 + 1675 * 16], m4 - - ; mode 28 [row 0] - movu m6, [r5 + 5 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw 
m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1664 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1665 * 16], m4 - - ; mode 28 [row 2] - movu m6, [r5 + 15 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1668 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1669 * 16], m4 - - ; mode 28 [row 4] - movu m6, [r5 + 25 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1672 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1673 * 16], m4 - - ; mode 30 [row 0] - movu m6, [r5 + 13 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1792 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1793 * 16], m4 - - ; mode 29 [row 0] - movu m6, [r5 + 9 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1728 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1729 * 16], m4 - - ; mode 29 [row 2] - movu m6, [r5 + 27 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1732 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1733 * 16], m4 - - ; mode 31 [row 0] - movu m6, [r5 + 17 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1856 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - 
movu [r0 + 1857 * 16], m4 - - ; mode 32 [row 0] - movu m6, [r5 + 21 * 16] - pmaddubsw m4, m0, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1920 * 16], m4 - pmaddubsw m4, m1, m6 - pmulhrsw m4, m7 - pmaddubsw m5, m3, m6 - pmulhrsw m5, m7 - packuswb m4, m5 - movu [r0 + 1921 * 16], m4 - - ; mode 27 [row 15] - movu m0, [r3 + 2] - movd m1, [r3 + 3] - palignr m1, m0, 1 - punpcklbw m0, m1 - movu m2, [r3 + 10] - movd m3, [r3 + 11] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 18] - movd m3, [r3 + 19] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 26] - movd m5, [r3 + 27] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1630 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1630 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1631 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1631 * 16 + 8], m5 - - ; mode 27 [row 16] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1632 * 16], m3 - - ; mode 31 [row 1 - first half] - movu [r0 + 1858 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1633 * 16], m3 - - ; mode 31 [row 1 - second half] - movu [r0 + 1859 * 16], m3 - - ; mode 27 [row 17] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1634 * 16], m3 - - ; mode 29 [row 3 - first half] - movu [r0 + 1734 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1635 * 16], m3 - - ; mode 29 [row 3 - second half] - movu [r0 + 1735 * 16], m3 - - ; mode 27 [row 18] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1636 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, 
m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1637 * 16], m3 - - ; mode 27 [row 19] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1638 * 16], m3 - - ; mode 28 [row 7 - first half] - movu [r0 + 1678 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1639 * 16], m3 - - ; mode 28 [row 7 - second half] - movu [r0 + 1679 * 16], m3 - - ; mode 27 [row 20] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1640 * 16], m3 - - ; mode 32 [row 1 - first half] - movu [r0 + 1922 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1641 * 16], m3 - - ; mode 32 [row 1 - second half] - movu [r0 + 1923 * 16], m3 - - ; mode 27 [row 21] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1642 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1643 * 16], m3 - - ; mode 27 [row 22] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1644 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1645 * 16], m3 - - ; mode 27 [row 23] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1646 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1647 * 16], m3 - - ; mode 27 [row 24] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, 
m5 - movu [r0 + 1648 * 16], m3 - - ; mode 28 [row 9 - first half] - movu [r0 + 1682 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1649 * 16], m3 - - ; mode 28 [row 9 - second half] - movu [r0 + 1683 * 16], m3 - - ; mode 27 [row 25] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1650 * 16], m3 - - ; mode 30 [row 3 - first half] - movu [r0 + 1798 * 16], m3 - - ; mode 33 [row 1 - first half] - movu [r0 + 1986 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1651 * 16], m3 - - ; mode 30 [row 3 - second half] - movu [r0 + 1799 * 16], m3 - - ; mode 33 [row 1 - second half] - movu [r0 + 1987 * 16], m3 - - ; mode 27 [row 26] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1652 * 16], m3 - - ; mode 29 [row 5 - first half] - movu [r0 + 1738 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1653 * 16], m3 - - ; mode 29 [row 5 - second half] - movu [r0 + 1739 * 16], m3 - - ; mode 27 [row 27] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1654 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1655 * 16], m3 - - ; mode 27 [row 28] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1656 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1657 * 16], m3 - - ; mode 27 [row 29] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, 
m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1658 * 16], m3 - - ; mode 28 [row 11 - first half] - movu [r0 + 1686 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1659 * 16], m3 - - ; mode 28 [row 11 - second half] - movu [r0 + 1687 * 16], m3 - - ; mode 27 [row 30] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1660 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1661 * 16], m3 - - ; mode 28 [row 6] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1676 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1677 * 16], m3 - - ; mode 28 [row 8] - movu m6, [r5 + 13 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1680 * 16], m3 - - ; mode 29 [row 4 - first half] - movu [r0 + 1736 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1681 * 16], m3 - - ; mode 29 [row 4 - second half] - movu [r0 + 1737 * 16], m3 - - ; mode 28 [row 10] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1684 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1685 * 16], m3 - - ; mode 29 [row 6] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1740 * 16], m3 - - ; mode 32 [row 2 - first half] - movu [r0 + 1924 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 
- packuswb m3, m5 - movu [r0 + 1741 * 16], m3 - - ; mode 32 [row 2 - second half] - movu [r0 + 1925 * 16], m3 - - ; mode 30 [row 2] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1796 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1797 * 16], m3 - - ; mode 31 [row 2] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1860 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1861 * 16], m3 - - ; mode 27 [row 15] - movu m0, [r3 + 3] - movd m1, [r3 + 4] - palignr m1, m0, 1 - punpcklbw m0, m1 - movu m2, [r3 + 11] - movd m3, [r3 + 12] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 19] - movd m3, [r3 + 20] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 27] - movd m5, [r3 + 28] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1662 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1662 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1663 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1663 * 16 + 8], m5 - - ; mode 28 [row 12] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1688 * 16], m3 - - ; mode 30 [row 4 - first half] - movu [r0 + 1800 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1689 * 16], m3 - - ; mode 30 [row 4 - second half] - movu [r0 + 1801 * 16], m3 - - ; mode 28 [row 13] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1690 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - 
movu [r0 + 1691 * 16], m3 - - ; mode 28 [row 14] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1692 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1693 * 16], m3 - - ; mode 28 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1694 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1695 * 16], m3 - - ; mode 28 [row 16] - movu m6, [r5 + 21 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1696 * 16], m3 - - ; mode 31 [row 4 - first half] - movu [r0 + 1864 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1697 * 16], m3 - - ; mode 31 [row 4 - second half] - movu [r0 + 1865 * 16], m3 - - ; mode 28 [row 17] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1698 * 16], m3 - - ; mode 29 [row 9 - first half] - movu [r0 + 1746 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1699 * 16], m3 - - ; mode 29 [row 9 - second half] - movu [r0 + 1747 * 16], m3 - - ; mode 28 [row 18] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1700 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1701 * 16], m3 - - ; mode 29 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1742 * 16], m3 - pmaddubsw m3, 
m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1743 * 16], m3 - - ; mode 29 [row 8] - movu m6, [r5 + 17 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1744 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1745 * 16], m3 - - ; mode 30 [row 5] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1802 * 16], m3 - - ; mode 33 [row 2 - first half] - movu [r0 + 1988 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1803 * 16], m3 - - ; mode 33 [row 2 - second half] - movu [r0 + 1989 * 16], m3 - - ; mode 30 [row 6] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1804 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1805 * 16], m3 - - ; mode 31 [row 3] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1862 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1863 * 16], m3 - - ; mode 32 [row 3] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1926 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1927 * 16], m3 - - ; mode 28 [row 19] - movu m6, [r5 + 4 * 16] - movu m0, [r3 + 4] - movd m1, [r3 + 5] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 12] - movd m4, [r3 + 13] - palignr m4, m2, 1 - 
punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1702 * 16], m3 - - movu m1, [r3 + 20] - movd m3, [r3 + 21] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 28] - movd m5, [r3 + 29] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1703 * 16], m3 - - ; mode 28 [row 20] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1704 * 16], m3 - - ; mode 32 [row 4 - first half] - movu [r0 + 1928 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1705 * 16], m3 - - ; mode 32 [row 4 - second half] - movu [r0 + 1929 * 16], m3 - - ; mode 28 [row 21] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1706 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1707 * 16], m3 - - ; mode 28 [row 22] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1708 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1709 * 16], m3 - - ; mode 28 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1710 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1711 * 16], m3 - - ; mode 28 [row 24] - movu m6, [r5 + 29 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1712 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, 
m7 - packuswb m3, m5 - movu [r0 + 1713 * 16], m3 - - ; mode 29 [row 10] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1748 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1749 * 16], m3 - - ; mode 29 [row 11] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1750 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1751 * 16], m3 - - ; mode 29 [row 12] - movu m6, [r5 + 21 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1752 * 16], m3 - - ; mode 30 [row 8 -first half] - movu [r0 + 1808 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1753 * 16], m3 - - ; mode 30 [row 8 -second half] - movu [r0 + 1809 * 16], m3 - - ; mode 29 [row 13] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1754 * 16], m3 - - ; mode 32 [row 5 - first half] - movu [r0 + 1930 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1755 * 16], m3 - - ; mode 32 [row 5 - second half] - movu [r0 + 1931 * 16], m3 - - ; mode 30 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1806 * 16], m3 - - ; mode 33 [row 3 - first half] - movu [r0 + 1990 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1807 * 16], m3 - - ; mode 33 [row 3 - second half] - movu [r0 + 1991 * 16], m3 - - ; mode 31 [row 5] - movu m6, [r5 + 6 * 
16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1866 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1867 * 16], m3 - - ; mode 31 [row 6] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1868 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1869 * 16], m3 - - ; mode 28 [row 25] - movu m6, [r5 + 2 * 16] - movu m0, [r3 + 5] - movd m1, [r3 + 6] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 13] - movd m4, [r3 + 14] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1714 * 16], m3 - - movu m1, [r3 + 21] - movd m3, [r3 + 22] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 29] - movd m5, [r3 + 30] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1715 * 16], m3 - - ; mode 28 [row 26] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1716 * 16], m3 - - ; mode 29 [row 14 - first half] - movu [r0 + 1756 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1717 * 16], m3 - - ; mode 29 [row 14 - second half] - movu [r0 + 1757 * 16], m3 - - ; mode 28 [row 27] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1718 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1719 * 16], m3 - - ; mode 28 [row 28] - movu m6, [r5 + 17 * 16] - pmaddubsw m3, 
m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1720 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1721 * 16], m3 - - ; mode 28 [row 29] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1722 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1723 * 16], m3 - - ; mode 28 [row 30] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1724 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1725 * 16], m3 - - ; mode 29 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1758 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1759 * 16], m3 - - ; mode 29 [row 16] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1760 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1761 * 16], m3 - - ; mode 30 [row 9] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1810 * 16], m3 - - ; mode 33 [row 4 - first half] - movu [r0 + 1992 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1811 * 16], m3 - - ; mode 33 [row 4 - second half] - movu [r0 + 1993 * 16], m3 - - ; mode 30 [row 10] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, 
m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1812 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1813 * 16], m3 - - ; mode 31 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1870 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1871 * 16], m3 - - ; mode 31 [row 8] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1872 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1873 * 16], m3 - - ; mode 32 [row 6] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1932 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1933 * 16], m3 - - ; mode 30 [row 11] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1814 * 16], m3 - - ; mode 33 [row 5 - first half] - movu [r0 + 1994 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1815 * 16], m3 - - ; mode 33 [row 5 - second half] - movu [r0 + 1995 * 16], m3 - - ; mode 28 [row 31] - movu m0, [r3 + 6] - movd m1, [r3 + 7] - palignr m1, m0, 1 - punpcklbw m0, m1 - movu m2, [r3 + 14] - movd m3, [r3 + 15] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 22] - movd m3, [r3 + 23] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 30] - movd m5, [r3 + 31] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1726 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1726 * 16 + 8], 
m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1727 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1727 * 16 + 8], m5 - - ; mode 29 [row 17] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1762 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1763 * 16], m3 - - ; mode 29 [row 18] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1764 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1765 * 16], m3 - - ; mode 29 [row 19] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1766 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1767 * 16], m3 - - ; mode 29 [row 20] - movu m6, [r5 + 29 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1768 * 16], m3 - - ; mode 32 [row 8 - first halif] - movu [r0 + 1936 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1769 * 16], m3 - - ; mode 32 [row 8 - second halif] - movu [r0 + 1937 * 16], m3 - - ; mode 30 [row 12] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1816 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1817 * 16], m3 - - ; mode 30 [row 13] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1818 * 16], m3 - - ; mode 33 [row 6 - first half] - movu [r0 + 1996 
* 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1819 * 16], m3 - - ; mode 33 [row 6 - second half] - movu [r0 + 1997 * 16], m3 - - ; mode 31 [row 9] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1874 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1875 * 16], m3 - - ; mode 31 [row 10] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1876 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1877 * 16], m3 - - ; mode 32 [row 7] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1934 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1935 * 16], m3 - - ; mode 29 [row 21] - movu m6, [r5 + 6 * 16] - movu m0, [r3 + 7] - movd m1, [r3 + 8] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 15] - movd m4, [r3 + 16] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1770 * 16], m3 - - movu m1, [r3 + 23] - movd m3, [r3 + 24] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 31] - movd m5, [r3 + 32] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1771 * 16], m3 - - ; mode 29 [row 22] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1772 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, 
m7 - packuswb m3, m5 - movu [r0 + 1773 * 16], m3 - - ; mode 29 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1774 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1775 * 16], m3 - - ; mode 30 [row 14] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1820 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1821 * 16], m3 - - ; mode 30 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1822 * 16], m3 - - ; mode 33 [row 7 - first half] - movu [r0 + 1998 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1823 * 16], m3 - - ; mode 33 [row 7 - second half] - movu [r0 + 1999 * 16], m3 - - ; mode 30 [row 16] - movu m6, [r5 + 29 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1824 * 16], m3 - - ; mode 31 [row 12 - first half] - movu [r0 + 1880 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1825 * 16], m3 - - ; mode 31 [row 12 - second half] - movu [r0 + 1881 * 16], m3 - - ; mode 31 [row 11] - movu m6, [r5 + 12 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1878 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1879 * 16], m3 - - ; mode 32 [row 9] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1938 
* 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1939 * 16], m3 - - ; mode 29 [row 24] - movu m6, [r5 + 1 * 16] - movu m0, [r3 + 8] - movd m1, [r3 + 9] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 16] - movd m4, [r3 + 17] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1776 * 16], m3 - - movu m1, [r3 + 24] - movd m3, [r3 + 25] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 32] - movd m5, [r3 + 33] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1777 * 16], m3 - - ; mode 29 [row 25] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1778 * 16], m3 - - ; mode 30 [row 17 - first half] - movu [r0 + 1826 * 16], m3 - - ; mode 33 [row 8 - first half] - movu [r0 + 2000 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1779 * 16], m3 - - ; mode 30 [row 17 - second half] - movu [r0 + 1827 * 16], m3 - - ; mode 33 [row 8 - second half] - movu [r0 + 2001 * 16], m3 - - ; mode 29 [row 26] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1780 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1781 * 16], m3 - - ; mode 29 [row 27] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1782 * 16], m3 - - ; mode 32 [row 11 - first half] - movu [r0 + 1942 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1783 * 
16], m3 - - ; mode 32 [row 11 - second half] - movu [r0 + 1943 * 16], m3 - - ; mode 30 [row 18] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1828 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1829 * 16], m3 - - ; mode 31 [row 13] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1882 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1883 * 16], m3 - - ; mode 31 [row 14] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1884 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1885 * 16], m3 - - ; mode 32 [row 10] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1940 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1941 * 16], m3 - - ; mode 29 [row 28] - movu m6, [r5 + 5 * 16] - movu m0, [r3 + 9] - movd m1, [r3 + 10] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 17] - movd m4, [r3 + 18] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1784 * 16], m3 - - movu m1, [r3 + 25] - movd m3, [r3 + 26] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 33] - movd m5, [r3 + 34] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1785 * 16], m3 - - ; mode 29 [row 29] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - 
pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1786 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1787 * 16], m3 - - ; mode 29 [row 30] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1788 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1789 * 16], m3 - - ; mode 30 [row 19] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1830 * 16], m3 - - ; mode 33 [row 9 - first half] - movu [r0 + 2002 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1831 * 16], m3 - - ; mode 33 [row 9 - second half] - movu [r0 + 2003 * 16], m3 - - ; mode 30 [row 20] - movu m6, [r5 + 17 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1832 * 16], m3 - - ; mode 32 [row 12 - first half] - movu [r0 + 1944 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1833 * 16], m3 - - ; mode 32 [row 12 - second half] - movu [r0 + 1945 * 16], m3 - - ; mode 30 [row 21] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1834 * 16], m3 - - ; mode 33 [row 10 - first half] - movu [r0 + 2004 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1835 * 16], m3 - - ; mode 33 [row 10 - second half] - movu [r0 + 2005 * 16], m3 - - ; mode 31 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu 
[r0 + 1886 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1887 * 16], m3 - - ; mode 29 [row 31] - movu m0, [r3 + 10] - movd m1, [r3 + 11] - palignr m1, m0, 1 - punpcklbw m0, m1 - movu m2, [r3 + 18] - movd m3, [r3 + 19] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 26] - movd m3, [r3 + 27] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 34] - movd m5, [r3 + 35] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1790 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1790 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1791 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1791 * 16 + 8], m5 - - ; mode 30 [row 22] - movu m6, [r5 + 11 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1836 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1837 * 16], m3 - - ; mode 30 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1838 * 16], m3 - - ; mode 33 [row 11 - first half] - movu [r0 + 2006 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1839 * 16], m3 - - ; mode 33 [row 11 - second half] - movu [r0 + 2007 * 16], m3 - - ; mode 31 [row 16] - movu m6, [r5 + 1 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1888 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1889 * 16], m3 - - ; mode 31 [row 17] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1890 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw 
m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1891 * 16], m3 - - ; mode 32 [row 13] - movu m6, [r5 + 6 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1946 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1947 * 16], m3 - - ; mode 32 [row 14] - movu m6, [r5 + 27 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1948 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1949 * 16], m3 - - ; mode 30 [row 24] - movu m6, [r5 + 5 * 16] - movu m0, [r3 + 11] - movd m1, [r3 + 12] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 19] - movd m4, [r3 + 20] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1840 * 16], m3 - - movu m1, [r3 + 27] - movd m3, [r3 + 28] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 35] - movd m5, [r3 + 36] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1841 * 16], m3 - - ; mode 30 [row 25] - movu m6, [r5 + 18 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1842 * 16], m3 - - ; mode 33 [row 12 - first half] - movu [r0 + 2008 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1843 * 16], m3 - - ; mode 33 [row 12 - second half] - movu [r0 + 2009 * 16], m3 - - ; mode 30 [row 26] - movu m6, [r5 + 31 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1844 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1845 * 16], m3 - - ; mode 31 [row 18] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1892 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1893 * 16], m3 - - ; mode 31 [row 19] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1894 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1895 * 16], m3 - - ; mode 32 [row 15] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1950 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1951 * 16], m3 - - ; mode 30 [row 27] - movu m6, [r5 + 12 * 16] - movu m0, [r3 + 12] - movd m1, [r3 + 13] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 20] - movd m4, [r3 + 21] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1846 * 16], m3 - - ; mode 33 [row 13 - first half] - movu [r0 + 2010 * 16], m3 - - movu m1, [r3 + 28] - movd m3, [r3 + 29] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 36] - movd m5, [r3 + 37] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1847 * 16], m3 - - ; mode 33 [row 13 - second half] - movu [r0 + 2011 * 16], m3 - - ; mode 30 [row 28] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1848 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - 
packuswb m3, m5 - movu [r0 + 1849 * 16], m3 - - ; mode 31 [row 20] - movu m6, [r5 + 5 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1896 * 16], m3 - - ; mode 32 [row 16 - first half] - movu [r0 + 1952 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1897 * 16], m3 - - ; mode 32 [row 16 - second half] - movu [r0 + 1953 * 16], m3 - - ; mode 31 [row 21] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1898 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1899 * 16], m3 - - ; mode 32 [row 17] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1954 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1955 * 16], m3 - - ; mode 30 [row 29] - movu m6, [r5 + 6 * 16] - movu m0, [r3 + 13] - movd m1, [r3 + 14] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 21] - movd m4, [r3 + 22] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1850 * 16], m3 - - ; mode 33 [row 14 - first half] - movu [r0 + 2012 * 16], m3 - - movu m1, [r3 + 29] - movd m3, [r3 + 30] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 37] - movd m5, [r3 + 38] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1851 * 16], m3 - - ; mode 33 [row 14 - second half] - movu [r0 + 2013 * 16], m3 - - ; mode 30 [row 30] - movu m6, [r5 + 19 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - 
movu [r0 + 1852 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1853 * 16], m3 - - ; mode 31 [row 22] - movu m6, [r5 + 7 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1900 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1901 * 16], m3 - - ; mode 31 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1902 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1903 * 16], m3 - - ; mode 32 [row 18] - movu m6, [r5 + 15 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1956 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1957 * 16], m3 - - ; mode 30 [row 31] - movu m0, [r3 + 14] - movd m1, [r3 + 15] - palignr m1, m0, 1 - punpcklbw m0, m1 - movu m2, [r3 + 22] - movd m3, [r3 + 23] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 30] - movd m3, [r3 + 31] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 38] - movd m5, [r3 + 39] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1854 * 16], m5 - - ; mode 33 [row 15 - first eight] - movh [r0 + 2014 * 16], m5 - - pshufb m5, m2, [tab_S2] - movh [r0 + 1854 * 16 + 8], m5 - - ; mode 33 [row 15 - second eight] - movh [r0 + 2014 * 16 + 8], m5 - - pshufb m5, m1, [tab_S2] - movh [r0 + 1855 * 16], m5 - - ; mode 33 [row 15 - third eight] - movh [r0 + 2015 * 16], m5 - - pshufb m5, m4, [tab_S2] - movh [r0 + 1855 * 16 + 8], m5 - - ; mode 33 [row 15 - fourth eight] - movh [r0 + 2015 * 16 + 8], m5 - - ; mode 31 [row 24] - movu m6, [r5 + 9 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw 
m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1904 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1905 * 16], m3 - - ; mode 31 [row 25] - movu m6, [r5 + 26 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1906 * 16], m3 - - ; mode 33 [row 16 - first half] - movu [r0 + 2016 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1907 * 16], m3 - - ; mode 33 [row 16 - second half] - movu [r0 + 2017 * 16], m3 - - ; mode 32 [row 19] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1958 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1959 * 16], m3 - - ; mode 32 [row 20] - movu m6, [r5 + 25 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1960 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1961 * 16], m3 - - ; mode 31 [row 26] - movu m6, [r5 + 11 * 16] - movu m0, [r3 + 15] - movd m1, [r3 + 16] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 23] - movd m4, [r3 + 24] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1908 * 16], m3 - - movu m1, [r3 + 31] - movd m3, [r3 + 32] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 39] - movd m5, [r3 + 40] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1909 * 16], m3 - - ; mode 31 [row 27] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw 
m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1910 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1911 * 16], m3 - - ; mode 32 [row 21] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1962 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1963 * 16], m3 - - ; mode 33 [row 17] - movu m6, [r5 + 20 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2018 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2019 * 16], m3 - - ; mode 31 [row 28] - movu m6, [r5 + 13 * 16] - movu m0, [r3 + 16] - movd m1, [r3 + 17] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 24] - movd m4, [r3 + 25] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1912 * 16], m3 - - movu m1, [r3 + 32] - movd m3, [r3 + 33] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 40] - movd m5, [r3 + 41] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1913 * 16], m3 - - ; mode 31 [row 29] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1914 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1915 * 16], m3 - - ; mode 32 [row 22] - movu m6, [r5 + 3 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1964 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw 
m5, m7 - packuswb m3, m5 - movu [r0 + 1965 * 16], m3 - - ; mode 32 [row 23] - movu m6, [r5 + 24 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1966 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1967 * 16], m3 - - ; mode 33 [row 18] - movu m6, [r5 + 14 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2020 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2021 * 16], m3 - - ; mode 31 [row 30] - movu m6, [r5 + 15 * 16] - movu m0, [r3 + 17] - movd m1, [r3 + 18] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 25] - movd m4, [r3 + 26] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1916 * 16], m3 - - movu m1, [r3 + 33] - movd m3, [r3 + 34] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 41] - movd m5, [r3 + 42] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1917 * 16], m3 - - ; mode 32 [row 24] - movu m6, [r5 + 13 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1968 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1969 * 16], m3 - - ; mode 33 [row 19] - movu m6, [r5 + 8 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2022 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2023 * 16], m3 - - ; mode 31 [row 31] - movu m0, [r3 + 18] - movd m1, [r3 + 19] - palignr m1, m0, 1 - punpcklbw 
m0, m1 - movu m2, [r3 + 26] - movd m3, [r3 + 27] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 34] - movd m3, [r3 + 35] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 42] - movd m5, [r3 + 43] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1918 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1918 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1919 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1919 * 16 + 8], m5 - - ; mode 32 [row 25] - movu m6, [r5 + 2 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1970 * 16], m3 - - ; mode 33 [row 20 - first half] - movu [r0 + 2024 * 16], m3 - - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1971 * 16], m3 - - ; mode 33 [row 20 - second half] - movu [r0 + 2025 * 16], m3 - - ; mode 32 [row 26] - movu m6, [r5 + 23 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1972 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1973 * 16], m3 - - ; mode 33 [row 21] - movu m6, [r5 + 28 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2026 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2027 * 16], m3 - - ; mode 32 [row 27] - movu m6, [r5 + 12 * 16] - movu m0, [r3 + 19] - movd m1, [r3 + 20] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 27] - movd m4, [r3 + 28] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1974 * 16], m3 - - movu m1, [r3 + 35] - movd m3, [r3 + 36] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 
+ 43] - movd m5, [r3 + 44] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1975 * 16], m3 - - ; mode 33 [row 22] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2028 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2029 * 16], m3 - - ; mode 32 [row 28] - movu m6, [r5 + 1 * 16] - movu m0, [r3 + 20] - movd m1, [r3 + 21] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 28] - movd m4, [r3 + 29] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1976 * 16], m3 - - movu m1, [r3 + 36] - movd m3, [r3 + 37] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 44] - movd m5, [r3 + 45] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1977 * 16], m3 - - ; mode 32 [row 29] - movu m6, [r5 + 22 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1978 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1979 * 16], m3 - - ; mode 33 [row 23] - movu m6, [r5 + 16 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2030 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2031 * 16], m3 - - ; mode 32 [row 30] - movu m6, [r5 + 11 * 16] - movu m0, [r3 + 21] - movd m1, [r3 + 22] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 29] - movd m4, [r3 + 30] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - 
packuswb m3, m5 - movu [r0 + 1980 * 16], m3 - - movu m1, [r3 + 37] - movd m3, [r3 + 38] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 45] - movd m5, [r3 + 46] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 1981 * 16], m3 - - ; mode 33 [row 24] - movu m6, [r5 + 10 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2032 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2033 * 16], m3 - - ; mode 32 [row 31] - movu m0, [r3 + 22] - movd m1, [r3 + 23] - palignr m1, m0, 1 - punpcklbw m0, m1 - movu m2, [r3 + 30] - movd m3, [r3 + 31] - palignr m3, m2, 1 - punpcklbw m2, m3 - movu m1, [r3 + 38] - movd m3, [r3 + 39] - palignr m3, m1, 1 - punpcklbw m1, m3 - movu m4, [r3 + 46] - movd m5, [r3 + 47] - palignr m5, m4, 1 - punpcklbw m4, m5 - - pshufb m5, m0, [tab_S2] - movh [r0 + 1982 * 16], m5 - pshufb m5, m2, [tab_S2] - movh [r0 + 1982 * 16 + 8], m5 - pshufb m5, m1, [tab_S2] - movh [r0 + 1983 * 16], m5 - pshufb m5, m4, [tab_S2] - movh [r0 + 1983 * 16 + 8], m5 - - ; mode 33 [row 25] - movu m6, [r5 + 4 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2034 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2035 * 16], m3 - - ; mode 33 [row 26] - movu m6, [r5 + 30 * 16] - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2036 * 16], m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2037 * 16], m3 - - ; mode 33 [row 27] - movu m6, [r5 + 24 * 16] - movu m0, [r3 + 23] - movd m1, [r3 + 24] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - 
movu m2, [r3 + 31] - movd m4, [r3 + 32] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2038 * 16], m3 - - movu m1, [r3 + 39] - movd m3, [r3 + 40] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 47] - movd m5, [r3 + 48] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2039 * 16], m3 - - ; mode 33 [row 28] - movu m6, [r5 + 18 * 16] - movu m0, [r3 + 24] - movd m1, [r3 + 25] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 32] - movd m4, [r3 + 33] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2040 * 16], m3 - - movu m1, [r3 + 40] - movd m3, [r3 + 41] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 48] - movd m5, [r3 + 49] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2041 * 16], m3 - - ; mode 33 [row 29] - movu m6, [r5 + 12 * 16] - movu m0, [r3 + 25] - movd m1, [r3 + 26] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 33] - movd m4, [r3 + 34] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2042 * 16], m3 - - movu m1, [r3 + 41] - movd m3, [r3 + 42] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 49] - movd m5, [r3 + 50] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2043 * 16], m3 - - ; mode 33 [row 30] - movu m6, [r5 + 6 * 16] - movu m0, [r3 + 26] - movd m1, [r3 + 27] - palignr m1, m0, 1 - punpcklbw m0, m1 - pmaddubsw m3, m0, m6 - pmulhrsw m3, m7 - movu m2, [r3 + 34] - movd m4, [r3 + 35] - palignr m4, m2, 1 - punpcklbw m2, m4 - pmaddubsw m5, m2, m6 - 
pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2044 * 16], m3 - - movu m1, [r3 + 42] - movd m3, [r3 + 43] - palignr m3, m1, 1 - punpcklbw m1, m3 - pmaddubsw m3, m1, m6 - pmulhrsw m3, m7 - movu m4, [r3 + 50] - movd m5, [r3 + 51] - palignr m5, m4, 1 - punpcklbw m4, m5 - pmaddubsw m5, m4, m6 - pmulhrsw m5, m7 - packuswb m3, m5 - movu [r0 + 2045 * 16], m3 - - ; mode 33 [row 31] - movu m5, [r3 + 27] - movu [r0 + 2046 * 16], m5 - movu m5, [r3 + 43] - movu [r0 + 2047 * 16], m5 - - ;mode 34 [row 0] - movu m0, [r3 + 2] - movu [r0 + 2048 * 16], m0 - movu m1, [r3 + 18] - movu [r0 + 2049 * 16], m1 - - ;mode 34 [row 1] - movu m2, [r3 + 34] - palignr m3, m1, m0, 1 - movu [r0 + 2050 * 16], m3 - palignr m4, m2, m1, 1 - movu [r0 + 2051 * 16], m4 - - ;mode 34 [row 2] - palignr m3, m1, m0, 2 - movu [r0 + 2052 * 16], m3 - palignr m4, m2, m1, 2 - movu [r0 + 2053 * 16], m4 - - ;mode 34 [row 3] - palignr m3, m1, m0, 3 - movu [r0 + 2054 * 16], m3 - palignr m4, m2, m1, 3 - movu [r0 + 2055 * 16], m4 - - ;mode 34 [row 4] - palignr m3, m1, m0, 4 - movu [r0 + 2056 * 16], m3 - palignr m4, m2, m1, 4 - movu [r0 + 2057 * 16], m4 - - ;mode 34 [row 5] - palignr m3, m1, m0, 5 - movu [r0 + 2058 * 16], m3 - palignr m4, m2, m1, 5 - movu [r0 + 2059 * 16], m4 - - ;mode 34 [row 6] - palignr m3, m1, m0, 6 - movu [r0 + 2060 * 16], m3 - palignr m4, m2, m1, 6 - movu [r0 + 2061 * 16], m4 - - ;mode 34 [row 7] - palignr m3, m1, m0, 7 - movu [r0 + 2062 * 16], m3 - palignr m4, m2, m1, 7 - movu [r0 + 2063 * 16], m4 - - ;mode 34 [row 8] - palignr m3, m1, m0, 8 - movu [r0 + 2064 * 16], m3 - palignr m4, m2, m1, 8 - movu [r0 + 2065 * 16], m4 - - ;mode 34 [row 9] - palignr m3, m1, m0, 9 - movu [r0 + 2066 * 16], m3 - palignr m4, m2, m1, 9 - movu [r0 + 2067 * 16], m4 - - ;mode 34 [row 10] - palignr m3, m1, m0, 10 - movu [r0 + 2068 * 16], m3 - palignr m4, m2, m1, 10 - movu [r0 + 2069 * 16], m4 - - ;mode 34 [row 11] - palignr m3, m1, m0, 11 - movu [r0 + 2070 * 16], m3 - palignr m4, m2, m1, 11 - movu [r0 + 2071 * 16], m4 - - 
;mode 34 [row 12] - palignr m3, m1, m0, 12 - movu [r0 + 2072 * 16], m3 - palignr m4, m2, m1, 12 - movu [r0 + 2073 * 16], m4 - - ;mode 34 [row 13] - palignr m3, m1, m0, 13 - movu [r0 + 2074 * 16], m3 - palignr m4, m2, m1, 13 - movu [r0 + 2075 * 16], m4 - - ;mode 34 [row 14] - palignr m3, m1, m0, 14 - movu [r0 + 2076 * 16], m3 - palignr m4, m2, m1, 14 - movu [r0 + 2077 * 16], m4 - - ;mode 34 [row 15] - palignr m3, m1, m0, 15 - movu [r0 + 2078 * 16], m3 - palignr m4, m2, m1, 15 - movu [r0 + 2079 * 16], m4 - - ;mode 34 [row 16] - palignr m3, m1, m0, 16 - movu [r0 + 2080 * 16], m3 - palignr m4, m2, m1, 16 - movu [r0 + 2081 * 16], m4 - - ;mode 34 [row 17] - movu m0, [r3 + 19] - movu [r0 + 2082 * 16], m0 - movu m1, [r3 + 35] - movu [r0 + 2083 * 16], m1 - - mov r2d, r6d - mov [r4], r2b - mov r2d, [rsp] - mov [r1 + 64], r2b - - ;mode 34 [row 18] - movu m2, [r3 + 51] - palignr m3, m1, m0, 1 - movu [r0 + 2084 * 16], m3 - palignr m4, m2, m1, 1 - movu [r0 + 2085 * 16], m4 - - ;mode 34 [row 19] - palignr m3, m1, m0, 2 - movu [r0 + 2086 * 16], m3 - palignr m4, m2, m1, 2 - movu [r0 + 2087 * 16], m4 - - ;mode 34 [row 20] - palignr m3, m1, m0, 3 - movu [r0 + 2088 * 16], m3 - palignr m4, m2, m1, 3 - movu [r0 + 2089 * 16], m4 - - ;mode 34 [row 21] - palignr m3, m1, m0, 4 - movu [r0 + 2090 * 16], m3 - palignr m4, m2, m1, 4 - movu [r0 + 2091 * 16], m4 - - ;mode 34 [row 22] - palignr m3, m1, m0, 5 - movu [r0 + 2092 * 16], m3 - palignr m4, m2, m1, 5 - movu [r0 + 2093 * 16], m4 - - ;mode 34 [row 23] - palignr m3, m1, m0, 6 - movu [r0 + 2094 * 16], m3 - palignr m4, m2, m1, 6 - movu [r0 + 2095 * 16], m4 - - ;mode 34 [row 24] - palignr m3, m1, m0, 7 - movu [r0 + 2096 * 16], m3 - palignr m4, m2, m1, 7 - movu [r0 + 2097 * 16], m4 - - ;mode 34 [row 25] - palignr m3, m1, m0, 8 - movu [r0 + 2098 * 16], m3 - palignr m4, m2, m1, 8 - movu [r0 + 2099 * 16], m4 - - ;mode 34 [row 26] - palignr m3, m1, m0, 9 - movu [r0 + 2100 * 16], m3 - palignr m4, m2, m1, 9 - movu [r0 + 2101 * 16], m4 - - ;mode 34 [row 
27] - palignr m3, m1, m0, 10 - movu [r0 + 2102 * 16], m3 - palignr m4, m2, m1, 10 - movu [r0 + 2103 * 16], m4 - - ;mode 34 [row 28] - palignr m3, m1, m0, 11 - movu [r0 + 2104 * 16], m3 - palignr m4, m2, m1, 11 - movu [r0 + 2105 * 16], m4 - - ;mode 34 [row 29] - palignr m3, m1, m0, 12 - movu [r0 + 2106 * 16], m3 - palignr m4, m2, m1, 12 - movu [r0 + 2107 * 16], m4 - - ;mode 34 [row 30] - palignr m3, m1, m0, 13 - movu [r0 + 2108 * 16], m3 - palignr m4, m2, m1, 13 - movu [r0 + 2109 * 16], m4 - - ;mode 34 [row 31] - palignr m3, m1, m0, 14 - movu [r0 + 2110 * 16], m3 - palignr m4, m2, m1, 14 - movu [r0 + 2111 * 16], m4 +INIT_YMM avx2 +cglobal intra_pred_ang32_27, 3, 5, 11 + mova m0, [pw_1024] + mova m1, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang32_mode_27] + + vbroadcasti128 m2, [r2 + 1] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 9] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 17] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 25] + pshufb m5, m1 + + ;row [0, 1] + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [2, 3] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [4, 5] + mova m10, [r4 + 2 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [6, 7] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [8, 9] + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [10, 11] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [12, 13] + lea r0, [r0 + 4 * r1] + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [14] + mova m10, [r4 + 3 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + 
vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + vbroadcasti128 m2, [r2 + 2] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 10] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 18] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 26] + pshufb m5, m1 + + ;row [15, 16] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [17, 18] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [19, 20] + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [21, 22] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [23, 24] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [25, 26] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [27, 28] + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [29, 30] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [31] + vbroadcasti128 m2, [r2 + 3] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 11] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 19] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 27] + pshufb m5, m1 + + mova m10, [r4 + 4 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang32_28, 3, 5, 11 + mova m0, [pw_1024] + mova m1, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + 
lea r4, [c_ang32_mode_28] + + vbroadcasti128 m2, [r2 + 1] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 9] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 17] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 25] + pshufb m5, m1 + + ;row [0, 1] + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [2, 3] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [4, 5] + mova m10, [r4 + 2 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + vbroadcasti128 m2, [r2 + 2] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 10] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 18] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 26] + pshufb m5, m1 + + ;row [6, 7] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [8, 9] + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [10, 11] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + vbroadcasti128 m2, [r2 + 3] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 11] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 19] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 27] + pshufb m5, m1 + + ;row [12, 13] + lea r0, [r0 + 4 * r1] + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [14, 15] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [16, 17] + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [18] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq 
m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [19, 20] + vbroadcasti128 m2, [r2 + 4] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 12] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 20] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 28] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row[21, 22] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row[23, 24] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [25, 26] + vbroadcasti128 m2, [r2 + 5] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 13] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 21] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 29] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [27, 28] + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [29, 30] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [31] + vbroadcasti128 m2, [r2 + 6] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 14] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 22] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 30] + pshufb m5, m1 + + mova m10, [r4 + 4 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang32_29, 3, 5, 11 + mova m0, [pw_1024] + mova m1, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang32_mode_29] + + ;row [0, 1] + vbroadcasti128 m2, [r2 + 1] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 9] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 17] + pshufb m4, m1 + 
vbroadcasti128 m5, [r2 + 25] + pshufb m5, m1 + + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [2] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [3, 4] + vbroadcasti128 m2, [r2 + 2] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 10] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 18] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 26] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [5, 6] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [7, 8] + vbroadcasti128 m2, [r2 + 3] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 11] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 19] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 27] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [9] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r1], m6 + + ;row [10, 11] + vbroadcasti128 m2, [r2 + 4] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 12] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 20] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 28] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [12, 13] + lea r0, [r0 + 4 * r1] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [14, 15] + vbroadcasti128 m2, [r2 + 5] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 13] + 
pshufb m3, m1 + vbroadcasti128 m4, [r2 + 21] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 29] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [16] + lea r0, [r0 + 4 * r1] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0], m6 + + ;row [17, 18] + vbroadcasti128 m2, [r2 + 6] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 14] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 22] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 30] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [19, 20] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [21, 22] + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 15] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 23] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 31] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [23] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + + ;row [24, 25] + vbroadcasti128 m2, [r2 + 8] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 16] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 24] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 32] + pshufb m5, m1 + + lea r0, [r0 + 4 * r1] + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [26, 27] + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 
+ + ;row [28, 29] + vbroadcasti128 m2, [r2 + 9] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 17] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 25] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 33] + pshufb m5, m1 + + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [30] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [31] + vbroadcasti128 m2, [r2 + 10] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 18] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 26] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 34] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang32_30, 3, 5, 11 + mova m0, [pw_1024] + mova m1, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang32_mode_30] + + ;row [0, 1] + vbroadcasti128 m2, [r2 + 1] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 9] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 17] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 25] + pshufb m5, m1 + + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [2, 3] + vbroadcasti128 m2, [r2 + 2] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 10] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 18] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 26] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [4, 5] + vbroadcasti128 m2, [r2 + 3] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 11] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 19] + pshufb m4, m1 + 
vbroadcasti128 m5, [r2 + 27] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [6] + mova m10, [r4 + 3 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [7, 8] + vbroadcasti128 m2, [r2 + 4] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 12] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 20] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 28] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [9, 10] + vbroadcasti128 m2, [r2 + 5] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 13] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 21] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 29] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [11] + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + + ;row [12, 13] + vbroadcasti128 m2, [r2 + 6] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 14] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 22] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 30] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [14, 15] + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 15] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 23] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 31] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], 
m6 + + ;row [16] + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [17, 18] + vbroadcasti128 m2, [r2 + 8] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 16] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 24] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 32] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [19, 20] + vbroadcasti128 m2, [r2 + 9] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 17] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 25] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 33] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + add r4, 4 * mmsize + + ;row [21] + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r1], m6 + + ;row [22, 23] + vbroadcasti128 m2, [r2 + 10] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 18] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 26] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 34] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [24, 25] + vbroadcasti128 m2, [r2 + 11] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 19] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 27] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 35] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [26] + mova m10, [r4 + 3 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + 
pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [27, 28] + vbroadcasti128 m2, [r2 + 12] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 20] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 28] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 36] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [29, 30] + vbroadcasti128 m2, [r2 + 13] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 21] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 29] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 37] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [31] + vbroadcasti128 m2, [r2 + 14] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 22] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 30] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 38] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang32_31, 3, 5, 11 + mova m0, [pw_1024] + mova m1, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang32_mode_31] + + ;row [0] + vbroadcasti128 m2, [r2 + 1] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 9] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 17] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 25] + pshufb m5, m1 + + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0], m6 + + ;row [1, 2] + vbroadcasti128 m2, [r2 + 2] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 10] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 18] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 
26] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [3, 4] + vbroadcasti128 m2, [r2 + 3] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 11] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 19] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 27] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [5, 6] + vbroadcasti128 m2, [r2 + 4] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 12] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 20] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 28] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [7, 8] + vbroadcasti128 m2, [r2 + 5] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 13] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 21] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 29] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [9, 10] + vbroadcasti128 m2, [r2 + 6] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 14] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 22] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 30] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [11, 12] + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 15] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 23] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 31] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [13, 14] + vbroadcasti128 m2, [r2 + 8] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 16] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 24] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 32] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + + 
INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [15] + vbroadcasti128 m2, [r2 + 9] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 17] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 25] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 33] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + + ;row [16, 17] + vbroadcasti128 m2, [r2 + 10] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 18] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 26] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 34] + pshufb m5, m1 + + lea r0, [r0 + 4 * r1] + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [18, 19] + vbroadcasti128 m2, [r2 + 11] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 19] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 27] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 35] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [20, 21] + vbroadcasti128 m2, [r2 + 12] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 20] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 28] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 36] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [22, 23] + vbroadcasti128 m2, [r2 + 13] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 21] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 29] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 37] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [24, 25] + vbroadcasti128 m2, [r2 + 14] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 22] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 30] + pshufb m4, m1 
+ vbroadcasti128 m5, [r2 + 38] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [26, 27] + vbroadcasti128 m2, [r2 + 15] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 23] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 31] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 39] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [28, 29] + vbroadcasti128 m2, [r2 + 16] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 24] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 32] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 40] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [30] + vbroadcasti128 m2, [r2 + 17] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 25] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 33] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 41] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [31] + vbroadcasti128 m2, [r2 + 18] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 26] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 34] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 42] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + RET + +INIT_YMM avx2 +cglobal intra_pred_ang32_32, 3, 5, 11 + mova m0, [pw_1024] + mova m1, [intra_pred_shuff_0_8] + lea r3, [3 * r1] + lea r4, [c_ang32_mode_32] + + ;row [0] + vbroadcasti128 m2, [r2 + 1] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 9] + pshufb m3, m1 + 
vbroadcasti128 m4, [r2 + 17] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 25] + pshufb m5, m1 + + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0], m6 + + ;row [1, 2] + vbroadcasti128 m2, [r2 + 2] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 10] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 18] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 26] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [3] + vbroadcasti128 m2, [r2 + 3] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 11] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 19] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 27] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + + ;row [4, 5] + vbroadcasti128 m2, [r2 + 4] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 12] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 20] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 28] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [6] + vbroadcasti128 m2, [r2 + 5] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 13] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 21] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 29] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [7, 8] + vbroadcasti128 m2, [r2 + 6] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 14] + pshufb m3, m1 + 
vbroadcasti128 m4, [r2 + 22] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 30] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [9] + vbroadcasti128 m2, [r2 + 7] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 15] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 23] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 31] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r1], m6 + + ;row [10, 11] + vbroadcasti128 m2, [r2 + 8] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 16] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 24] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 32] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [12] + vbroadcasti128 m2, [r2 + 9] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 17] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 25] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 33] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + lea r0, [r0 + 4 * r1] + + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0], m6 + + ;row [13, 14] + vbroadcasti128 m2, [r2 + 10] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 18] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 26] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 34] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [15] + vbroadcasti128 m2, [r2 + 11] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 19] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 27] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 35] + pshufb m5, m1 + + mova m10, [r4 + 2 * 
mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + + ;row [16, 17] + vbroadcasti128 m2, [r2 + 12] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 20] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 28] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 36] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [18] + vbroadcasti128 m2, [r2 + 13] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 21] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 29] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 37] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [19, 20] + vbroadcasti128 m2, [r2 + 14] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 22] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 30] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 38] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r3], m7 + lea r0, [r0 + 4 * r1] + movu [r0], m6 + + ;row [21] + vbroadcasti128 m2, [r2 + 15] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 23] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 31] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 39] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r1], m6 + + ;row [22, 23] + vbroadcasti128 m2, [r2 + 16] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 24] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 32] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 40] + pshufb m5, m1 
+ + mova m10, [r4 + 3 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + 2 * r1], m7 + movu [r0 + r3], m6 + + ;row [24] + vbroadcasti128 m2, [r2 + 17] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 25] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 33] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 41] + pshufb m5, m1 + + lea r0, [r0 + 4 * r1] + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0], m6 + + ;row [25, 26] + vbroadcasti128 m2, [r2 + 18] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 26] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 34] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 42] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0 + r1], m7 + movu [r0 + 2 * r1], m6 + + ;row [27] + vbroadcasti128 m2, [r2 + 19] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 27] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 35] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 43] + pshufb m5, m1 + + mova m10, [r4 + 2 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 + + ;row [28, 29] + vbroadcasti128 m2, [r2 + 20] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 28] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 36] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 44] + pshufb m5, m1 + + mova m10, [r4 + 3 * mmsize] + lea r0, [r0 + 4 * r1] + + INTRA_PRED_ANG32_CAL_ROW + movu [r0], m7 + movu [r0 + r1], m6 + + ;row [30] + vbroadcasti128 m2, [r2 + 21] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 29] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 37] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 45] + pshufb m5, m1 + + add r4, 4 * mmsize + mova m10, [r4 + 0 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + 
pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + 2 * r1], m6 + + ;row [31] + vbroadcasti128 m2, [r2 + 22] + pshufb m2, m1 + vbroadcasti128 m3, [r2 + 30] + pshufb m3, m1 + vbroadcasti128 m4, [r2 + 38] + pshufb m4, m1 + vbroadcasti128 m5, [r2 + 46] + pshufb m5, m1 + + mova m10, [r4 + 1 * mmsize] + vperm2i128 m6, m2, m3, 00100000b + pmaddubsw m6, m10 + pmulhrsw m6, m0 + vperm2i128 m7, m4, m5, 00100000b + pmaddubsw m7, m10 + pmulhrsw m7, m0 + packuswb m6, m7 + vpermq m6, m6, 11011000b + movu [r0 + r3], m6 RET +%endif + diff -Nru x265-1.5/source/common/x86/intrapred.h x265-1.6/source/common/x86/intrapred.h --- x265-1.5/source/common/x86/intrapred.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/intrapred.h 2015-04-02 16:46:36.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2003-2013 x264 project * * Authors: Min Chen - * + * Praveen Kumar Tiwari * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -26,11 +26,19 @@ #ifndef X265_INTRAPRED_H #define X265_INTRAPRED_H -void x265_intra_pred_dc4_sse4 (pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); +void x265_intra_pred_dc4_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); +void x265_intra_pred_dc8_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); +void x265_intra_pred_dc16_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); +void x265_intra_pred_dc32_sse2(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); +void x265_intra_pred_dc4_sse4(pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); void x265_intra_pred_dc8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter); void x265_intra_pred_dc16_sse4(pixel* 
dst, intptr_t dstStride, const pixel* srcPix, int, int filter); void x265_intra_pred_dc32_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int filter); +void x265_intra_pred_planar4_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); +void x265_intra_pred_planar8_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); +void x265_intra_pred_planar16_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); +void x265_intra_pred_planar32_sse2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); void x265_intra_pred_planar4_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); void x265_intra_pred_planar8_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); void x265_intra_pred_planar16_sse4(pixel* dst, intptr_t dstStride, const pixel* srcPix, int, int); @@ -39,6 +47,15 @@ #define DECL_ANG(bsize, mode, cpu) \ void x265_intra_pred_ang ## bsize ## _ ## mode ## _ ## cpu(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +DECL_ANG(4, 2, sse2); +DECL_ANG(4, 3, sse2); +DECL_ANG(4, 4, sse2); +DECL_ANG(4, 5, sse2); +DECL_ANG(4, 6, sse2); +DECL_ANG(4, 7, sse2); +DECL_ANG(4, 8, sse2); +DECL_ANG(4, 9, sse2); + DECL_ANG(4, 2, ssse3); DECL_ANG(4, 3, sse4); DECL_ANG(4, 4, sse4); @@ -157,6 +174,44 @@ DECL_ANG(32, 33, sse4); #undef DECL_ANG +void x265_intra_pred_ang8_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_4_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_5_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_31_avx2(pixel* dst, intptr_t 
dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_6_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_7_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_12_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang8_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int 
dirMode, int bFilter); +void x265_intra_pred_ang16_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang16_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_34_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_2_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_26_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_29_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); +void x265_intra_pred_ang32_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter); void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel 
*refPix, pixel *filtPix, int bLuma); diff -Nru x265-1.5/source/common/x86/ipfilter16.asm x265-1.6/source/common/x86/ipfilter16.asm --- x265-1.5/source/common/x86/ipfilter16.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/ipfilter16.asm 2015-04-02 16:46:36.000000000 +0000 @@ -31,6 +31,7 @@ tab_c_n32768: times 4 dd -32768 tab_c_524800: times 4 dd 524800 tab_c_n8192: times 8 dw -8192 +pd_524800: times 8 dd 524800 tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 @@ -91,9 +92,28 @@ times 4 dw -5, 17 times 4 dw 58, -10 times 4 dw 4, -1 +ALIGN 32 +tab_LumaCoeffVer: times 8 dw 0, 0 + times 8 dw 0, 64 + times 8 dw 0, 0 + times 8 dw 0, 0 + + times 8 dw -1, 4 + times 8 dw -10, 58 + times 8 dw 17, -5 + times 8 dw 1, 0 + + times 8 dw -1, 4 + times 8 dw -11, 40 + times 8 dw 40, -11 + times 8 dw 4, -1 + + times 8 dw 0, 1 + times 8 dw -5, 17 + times 8 dw 58, -10 + times 8 dw 4, -1 SECTION .text - cextern pd_32 cextern pw_pixel_max cextern pd_n32768 @@ -2562,6 +2582,2681 @@ FILTER_VER_LUMA_PP 64, 16 FILTER_VER_LUMA_PP 16, 64 +%macro FILTER_VER_LUMA_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,pp + vbroadcasti128 m6, [pd_32] +%elifidn %1, sp + mova m6, [pd_524800] +%else + vbroadcasti128 m6, [pd_n32768] +%endif + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 
= [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m0, m5 + paddd m2, m4 + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + pmaddwd m1, [r5 + 2 * mmsize] + paddd m0, m5 + paddd m2, m1 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + 2 * r1] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] + pmaddwd m4, [r5 + 3 * mmsize] + paddd m2, m4 + +%ifidn %1,ss + psrad m0, 6 + psrad m2, 6 +%else + paddd m0, m6 + paddd m2, m6 +%ifidn %1,pp + psrad m0, 6 + psrad m2, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m2, 10 +%else + psrad m0, 2 + psrad m2, 2 +%endif +%endif + + packssdw m0, m2 + pxor m1, m1 +%ifidn %1,pp + CLIPW m0, m1, [pw_pixel_max] +%elifidn %1, sp + CLIPW m0, m1, [pw_pixel_max] +%endif + + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 + RET +%endmacro + +FILTER_VER_LUMA_AVX2_4x4 pp +FILTER_VER_LUMA_AVX2_4x4 ps +FILTER_VER_LUMA_AVX2_4x4 sp +FILTER_VER_LUMA_AVX2_4x4 ss + +%macro FILTER_VER_LUMA_AVX2_8x8 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 + mov r4d, r4m + add r1d, r1d + add r3d, r3d + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,pp + vbroadcasti128 m11, [pd_32] +%elifidn %1, sp + mova m11, [pd_524800] +%else + vbroadcasti128 m11, [pd_n32768] +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 
+ vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + pmaddwd m7, [r5] + paddd m5, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + pmaddwd m8, [r5 + 1 * mmsize] + paddd m4, m10 + paddd m6, m8 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + pmaddwd m9, [r5 + 1 * mmsize] + paddd m5, m8 + paddd m7, m9 + movu xm8, [r0 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, 
xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + pmaddwd m10, [r5 + 2 * mmsize] + paddd m4, m9 + paddd m6, m10 + + lea r4, [r3 * 3] +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%else + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 +%ifidn %1,pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 +%else + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + pxor m10, m10 + mova m9, [pw_pixel_max] +%ifidn %1,pp + CLIPW m0, m10, m9 + CLIPW m2, m10, m9 +%elifidn %1, sp + CLIPW m0, m10, m9 + CLIPW m2, m10, m9 +%endif + + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 12 + punpckhwd xm3, xm8, xm2 + punpcklwd xm8, xm2 + vinserti128 m8, m8, xm3, 1 + pmaddwd m3, m8, [r5 + 3 * mmsize] + pmaddwd m8, [r5 + 2 * mmsize] + paddd m5, m3 + paddd m7, m8 + movu xm3, [r0 + r1] ; m3 = row 13 + punpckhwd xm0, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 3 * mmsize] + paddd m6, m2 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm3, xm0 + punpcklwd xm3, xm0 + vinserti128 m3, m3, xm1, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m7, m3 + +%ifidn %1,ss + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%else + paddd m4, m11 + paddd m5, m11 + paddd m6, m11 + paddd m7, m11 +%ifidn %1,pp + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%elifidn %1, sp + psrad m4, 10 + psrad m5, 10 + psrad m6, 10 + psrad m7, 10 +%else + psrad m4, 2 + psrad m5, 2 + psrad m6, 2 + psrad m7, 2 +%endif +%endif + + packssdw m4, m5 + packssdw m6, m7 + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW m4, m10, m9 + CLIPW m6, m10, m9 +%elifidn %1, sp + CLIPW m4, m10, m9 + 
CLIPW m6, m10, m9 +%endif + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm7 + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_8x8 pp +FILTER_VER_LUMA_AVX2_8x8 ps +FILTER_VER_LUMA_AVX2_8x8 sp +FILTER_VER_LUMA_AVX2_8x8 ss + +%macro PROCESS_LUMA_AVX2_W8_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 
2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] + +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%else + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 +%ifidn %1,pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + psrad m4, 10 + psrad m5, 10 +%else + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + psrad m4, 2 + psrad m5, 2 +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + pxor m5, m5 + mova m3, 
[pw_pixel_max] +%ifidn %1,pp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%elifidn %1, sp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%endif + + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm1 + lea r8, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] + +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 +%else + paddd m6, m14 + paddd m7, m14 +%ifidn %1,pp + psrad m6, 6 + psrad m7, 6 +%elifidn %1, sp + psrad m6, 10 + psrad m7, 10 +%else + psrad m6, 2 + psrad m7, 2 +%endif +%endif + + packssdw m6, m7 + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW m6, m5, m3 +%elifidn %1, sp + CLIPW m6, m5, m3 +%endif + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 + + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm6, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm6, 1 + pmaddwd m6, m1, [r5 + 3 * mmsize] + paddd m9, m6 + pmaddwd m6, m1, [r5 + 2 * mmsize] + paddd m11, m6 + pmaddwd m6, m1, [r5 + 1 * mmsize] + paddd m13, m6 + pmaddwd 
m1, [r5] + movu xm6, [r7 + r1] ; m6 = row 17 + punpckhwd xm4, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm2, 1 + pmaddwd m2, m6, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m6, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + movu xm2, [r7 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhwd xm6, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm6, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 + +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%else + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 +%ifidn %1,pp + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%elifidn %1, sp + psrad m8, 10 + psrad m9, 10 + psrad m10, 10 + psrad m11, 10 + psrad m12, 10 + psrad m13, 10 + psrad m0, 10 + psrad m1, 10 +%else + psrad m8, 2 + psrad m9, 2 + psrad m10, 2 + psrad m11, 2 + psrad m12, 2 + psrad m13, 2 + psrad m0, 2 + psrad m1, 2 +%endif +%endif + + packssdw m8, 
m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b +%ifidn %1,pp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%elifidn %1, sp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%endif + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 +%endmacro + +%macro FILTER_VER_LUMA_AVX2_Nx16 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + mova m14, [pd_524800] +%else + vbroadcasti128 m14, [pd_n32768] +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %1 + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_Nx16 pp, 16 +FILTER_VER_LUMA_AVX2_Nx16 pp, 32 +FILTER_VER_LUMA_AVX2_Nx16 pp, 64 +FILTER_VER_LUMA_AVX2_Nx16 ps, 16 +FILTER_VER_LUMA_AVX2_Nx16 ps, 32 +FILTER_VER_LUMA_AVX2_Nx16 ps, 64 +FILTER_VER_LUMA_AVX2_Nx16 sp, 16 +FILTER_VER_LUMA_AVX2_Nx16 sp, 32 +FILTER_VER_LUMA_AVX2_Nx16 sp, 64 +FILTER_VER_LUMA_AVX2_Nx16 ss, 16 +FILTER_VER_LUMA_AVX2_Nx16 ss, 32 +FILTER_VER_LUMA_AVX2_Nx16 ss, 64 + +%macro FILTER_VER_LUMA_AVX2_NxN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] 
+%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %3,pp + vbroadcasti128 m14, [pd_32] +%elifidn %3, sp + mova m14, [pd_524800] +%else + vbroadcasti128 m14, [pd_n32768] +%endif + + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %3 + add r2, 16 + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 2 * %1 + 16] + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_NxN 16, 32, pp +FILTER_VER_LUMA_AVX2_NxN 16, 64, pp +FILTER_VER_LUMA_AVX2_NxN 24, 32, pp +FILTER_VER_LUMA_AVX2_NxN 32, 32, pp +FILTER_VER_LUMA_AVX2_NxN 32, 64, pp +FILTER_VER_LUMA_AVX2_NxN 48, 64, pp +FILTER_VER_LUMA_AVX2_NxN 64, 32, pp +FILTER_VER_LUMA_AVX2_NxN 64, 48, pp +FILTER_VER_LUMA_AVX2_NxN 64, 64, pp +FILTER_VER_LUMA_AVX2_NxN 16, 32, ps +FILTER_VER_LUMA_AVX2_NxN 16, 64, ps +FILTER_VER_LUMA_AVX2_NxN 24, 32, ps +FILTER_VER_LUMA_AVX2_NxN 32, 32, ps +FILTER_VER_LUMA_AVX2_NxN 32, 64, ps +FILTER_VER_LUMA_AVX2_NxN 48, 64, ps +FILTER_VER_LUMA_AVX2_NxN 64, 32, ps +FILTER_VER_LUMA_AVX2_NxN 64, 48, ps +FILTER_VER_LUMA_AVX2_NxN 64, 64, ps +FILTER_VER_LUMA_AVX2_NxN 16, 32, sp +FILTER_VER_LUMA_AVX2_NxN 16, 64, sp +FILTER_VER_LUMA_AVX2_NxN 24, 32, sp +FILTER_VER_LUMA_AVX2_NxN 32, 32, sp +FILTER_VER_LUMA_AVX2_NxN 32, 64, sp +FILTER_VER_LUMA_AVX2_NxN 48, 64, sp +FILTER_VER_LUMA_AVX2_NxN 64, 32, sp +FILTER_VER_LUMA_AVX2_NxN 64, 48, sp +FILTER_VER_LUMA_AVX2_NxN 64, 64, sp +FILTER_VER_LUMA_AVX2_NxN 16, 32, ss +FILTER_VER_LUMA_AVX2_NxN 16, 64, ss +FILTER_VER_LUMA_AVX2_NxN 24, 32, ss +FILTER_VER_LUMA_AVX2_NxN 32, 32, ss +FILTER_VER_LUMA_AVX2_NxN 32, 64, ss +FILTER_VER_LUMA_AVX2_NxN 48, 64, ss +FILTER_VER_LUMA_AVX2_NxN 64, 32, ss +FILTER_VER_LUMA_AVX2_NxN 64, 48, ss +FILTER_VER_LUMA_AVX2_NxN 64, 64, ss + +%macro FILTER_VER_LUMA_AVX2_8xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + 
+%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + mova m14, [pd_524800] +%else + vbroadcasti128 m14, [pd_n32768] +%endif + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 +.loopH: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + 
pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] + +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%else + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 +%ifidn %1,pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + psrad m4, 10 + psrad m5, 10 +%else + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + psrad m4, 2 + psrad m5, 2 +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + pxor m5, m5 + mova m3, [pw_pixel_max] +%ifidn %1,pp + CLIPW 
m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%elifidn %1, sp + CLIPW m0, m5, m3 + CLIPW m2, m5, m3 + CLIPW m4, m5, m3 +%endif + + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm1 + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] + +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 +%else + paddd m6, m14 + paddd m7, m14 +%ifidn %1,pp + psrad m6, 6 + psrad m7, 6 +%elifidn %1, sp + psrad m6, 10 + psrad m7, 10 +%else + psrad m6, 2 + psrad m7, 2 +%endif +%endif + + packssdw m6, m7 + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW m6, m5, m3 +%elifidn %1, sp + CLIPW m6, m5, m3 +%endif + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhwd xm6, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm6, 1 + pmaddwd m6, m1, [r5 + 3 * mmsize] + paddd m9, m6 + pmaddwd m6, m1, [r5 + 2 * mmsize] + paddd m11, m6 + pmaddwd m6, m1, [r5 + 1 * mmsize] + paddd m13, m6 + pmaddwd m1, [r5] + movu xm6, [r0 + r1] ; m6 = 
row 17 + punpckhwd xm4, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm2, 1 + pmaddwd m2, m6, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m6, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + movu xm2, [r0 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhwd xm6, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm6, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 + +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%else + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 +%ifidn %1,pp + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%elifidn %1, sp + psrad m8, 10 + psrad m9, 10 + psrad m10, 10 + psrad m11, 10 + psrad m12, 10 + psrad m13, 10 + psrad m0, 10 + psrad m1, 10 +%else + psrad m8, 2 + psrad m9, 2 + psrad m10, 2 + psrad m11, 2 + psrad m12, 2 + psrad m13, 2 + psrad m0, 2 + psrad m1, 2 +%endif +%endif + + packssdw m8, m9 + packssdw m10, m11 + packssdw 
m12, m13 + packssdw m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b +%ifidn %1,pp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%elifidn %1, sp + CLIPW m8, m5, m3 + CLIPW m10, m5, m3 + CLIPW m12, m5, m3 + CLIPW m0, m5, m3 +%endif + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + lea r2, [r2 + r3 * 4] + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_8xN pp, 16 +FILTER_VER_LUMA_AVX2_8xN pp, 32 +FILTER_VER_LUMA_AVX2_8xN ps, 16 +FILTER_VER_LUMA_AVX2_8xN ps, 32 +FILTER_VER_LUMA_AVX2_8xN sp, 16 +FILTER_VER_LUMA_AVX2_8xN sp, 32 +FILTER_VER_LUMA_AVX2_8xN ss, 16 +FILTER_VER_LUMA_AVX2_8xN ss, 32 + +%macro PROCESS_LUMA_AVX2_W8_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 
6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m8, [r5 + 1 * mmsize] + paddd m6, m8 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + paddd m5, m8 + pmaddwd m9, [r5 + 1 * mmsize] + paddd m7, m9 + movu xm8, [r7 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + paddd m4, m9 + pmaddwd m10, [r5 + 2 * mmsize] + paddd m6, m10 + lea r7, [r7 + r1 * 4] + movu xm9, [r7] ; m9 = row 12 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m5, m10 + pmaddwd m8, [r5 + 2 * mmsize] + paddd m7, m8 + +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%else + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 + paddd m4, m11 + paddd m5, m11 +%ifidn %1,pp + psrad m0, 6 
+ psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 + psrad m4, 10 + psrad m5, 10 +%else + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 + psrad m4, 2 + psrad m5, 2 +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + pxor m8, m8 +%ifidn %1,pp + CLIPW m0, m8, m12 + CLIPW m2, m8, m12 + CLIPW m4, m8, m12 +%elifidn %1, sp + CLIPW m0, m8, m12 + CLIPW m2, m8, m12 + CLIPW m4, m8, m12 +%endif + + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 + + movu xm10, [r7 + r1] ; m10 = row 13 + punpckhwd xm0, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm0, 1 + pmaddwd m9, [r5 + 3 * mmsize] + paddd m6, m9 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm10, xm0 + punpcklwd xm10, xm0 + vinserti128 m10, m10, xm1, 1 + pmaddwd m10, [r5 + 3 * mmsize] + paddd m7, m10 + +%ifidn %1,ss + psrad m6, 6 + psrad m7, 6 +%else + paddd m6, m11 + paddd m7, m11 +%ifidn %1,pp + psrad m6, 6 + psrad m7, 6 +%elifidn %1, sp + psrad m6, 10 + psrad m7, 10 +%else + psrad m6, 2 + psrad m7, 2 +%endif +%endif + + packssdw m6, m7 + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW m6, m8, m12 +%elifidn %1, sp + CLIPW m6, m8, m12 +%endif + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endmacro + +%macro FILTER_VER_LUMA_AVX2_Nx8 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_%2x8, 4, 10, 13 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m11, [pd_32] +%elifidn %1, sp + mova m11, [pd_524800] +%else 
+ vbroadcasti128 m11, [pd_n32768] +%endif + mova m12, [pw_pixel_max] + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_8R %1 + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_Nx8 pp, 32 +FILTER_VER_LUMA_AVX2_Nx8 pp, 16 +FILTER_VER_LUMA_AVX2_Nx8 ps, 32 +FILTER_VER_LUMA_AVX2_Nx8 ps, 16 +FILTER_VER_LUMA_AVX2_Nx8 sp, 32 +FILTER_VER_LUMA_AVX2_Nx8 sp, 16 +FILTER_VER_LUMA_AVX2_Nx8 ss, 32 +FILTER_VER_LUMA_AVX2_Nx8 ss, 16 + +%macro FILTER_VER_LUMA_AVX2_32x24 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + mova m14, [pd_524800] +%else + vbroadcasti128 m14, [pd_n32768] +%endif + lea r6, [r3 * 3] + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %1 + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 48] + lea r2, [r8 + r3 * 4 - 48] + mova m11, m14 + mova m12, m3 + mov r9d, 4 +.loop: + PROCESS_LUMA_AVX2_W8_8R %1 + add r2, 16 + add r0, 16 + dec r9d + jnz .loop + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_32x24 pp +FILTER_VER_LUMA_AVX2_32x24 ps +FILTER_VER_LUMA_AVX2_32x24 sp +FILTER_VER_LUMA_AVX2_32x24 ss + +%macro PROCESS_LUMA_AVX2_W8_4R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, 
xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m4, m5, [r5 + 2 * mmsize] + paddd m1, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + movu xm4, [r0 + r4] ; m4 = row 7 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m6, [r5 + 2 * mmsize] + paddd m2, m6 + lea r0, [r0 + r1 * 4] + movu xm5, [r0] ; m5 = row 8 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m3, m4 + movu xm6, [r0 + r1] ; m6 = row 9 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 3 * mmsize] + paddd m2, m5 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m3, m6 + +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%else + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 +%ifidn %1,pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 +%else + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + pxor m4, m4 +%ifidn %1,pp + CLIPW m0, m4, [pw_pixel_max] + CLIPW m2, m4, [pw_pixel_max] +%elifidn %1, sp + CLIPW m0, m4, [pw_pixel_max] + CLIPW m2, m4, [pw_pixel_max] +%endif + + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 
+%endmacro + +%macro FILTER_VER_LUMA_AVX2_16x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + mova m7, [pd_524800] +%else + vbroadcasti128 m7, [pd_n32768] +%endif + mov dword [rsp], 2 +.loopW: + PROCESS_LUMA_AVX2_W8_4R %1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + lea r6, [r3 * 3] + movu [r2 + r6], xm3 + add r2, 16 + lea r6, [8 * r1 - 16] + sub r0, r6 + dec dword [rsp] + jnz .loopW + RET +%endmacro + +FILTER_VER_LUMA_AVX2_16x4 pp +FILTER_VER_LUMA_AVX2_16x4 ps +FILTER_VER_LUMA_AVX2_16x4 sp +FILTER_VER_LUMA_AVX2_16x4 ss + +%macro FILTER_VER_LUMA_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + mova m7, [pd_524800] +%else + vbroadcasti128 m7, [pd_n32768] +%endif + + PROCESS_LUMA_AVX2_W8_4R %1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + lea r4, [r3 * 3] + movu [r2 + r4], xm3 + RET +%endmacro + +FILTER_VER_LUMA_AVX2_8x4 pp +FILTER_VER_LUMA_AVX2_8x4 ps +FILTER_VER_LUMA_AVX2_8x4 sp +FILTER_VER_LUMA_AVX2_8x4 ss + +%macro FILTER_VER_LUMA_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + mova m14, [pd_524800] +%else + vbroadcasti128 m14, [pd_n32768] +%endif + mova m13, 
[pw_pixel_max] + pxor m12, m12 + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * 
mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + +%ifidn %1,ss + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%else + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 +%ifidn %1,pp + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m1, 10 + psrad m2, 10 + psrad m3, 10 +%else + psrad m0, 2 + psrad m1, 2 + psrad m2, 2 + psrad m3, 2 +%endif +%endif + + packssdw m0, m1 + packssdw m2, m3 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b +%ifidn %1,pp + CLIPW m0, m12, m13 + CLIPW m2, m12, m13 +%elifidn %1, sp + CLIPW m0, m12, m13 + CLIPW m2, m12, m13 +%endif + + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm0, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm0, 1 + pmaddwd m0, m10, [r5 + 3 * mmsize] + paddd m4, m0 + pmaddwd m0, m10, [r5 + 2 * mmsize] + paddd m6, m0 + pmaddwd m0, m10, [r5 + 1 * mmsize] + paddd m8, m0 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 12 + punpckhwd xm1, xm11, xm0 + punpcklwd xm11, xm0 + vinserti128 m11, m11, xm1, 1 + pmaddwd m1, m11, [r5 + 3 * mmsize] + paddd m5, m1 + pmaddwd m1, m11, [r5 + 2 * mmsize] + paddd m7, m1 + pmaddwd m1, m11, [r5 + 1 * mmsize] + paddd m9, m1 + pmaddwd m11, [r5] + movu xm2, [r7 + r1] ; m2 = row 13 + punpckhwd xm1, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm1, 1 + pmaddwd m1, m0, [r5 + 3 * mmsize] + paddd m6, m1 + pmaddwd m1, m0, [r5 + 2 * mmsize] + paddd m8, m1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m10, m0 + movu 
xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm2, xm0 + punpcklwd xm2, xm0 + vinserti128 m2, m2, xm1, 1 + pmaddwd m1, m2, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m2, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m11, m2 + +%ifidn %1,ss + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%else + paddd m4, m14 + paddd m5, m14 + paddd m6, m14 + paddd m7, m14 +%ifidn %1,pp + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%elifidn %1, sp + psrad m4, 10 + psrad m5, 10 + psrad m6, 10 + psrad m7, 10 +%else + psrad m4, 2 + psrad m5, 2 + psrad m6, 2 + psrad m7, 2 +%endif +%endif + + packssdw m4, m5 + packssdw m6, m7 + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b +%ifidn %1,pp + CLIPW m4, m12, m13 + CLIPW m6, m12, m13 +%elifidn %1, sp + CLIPW m4, m12, m13 + CLIPW m6, m12, m13 +%endif + lea r8, [r2 + r3 * 4] + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 + + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m0, [r5 + 2 * mmsize] + paddd m10, m0 + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m1, [r5 + 2 * mmsize] + paddd m11, m1 + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 3 * mmsize] + paddd m10, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m11, m3 + +%ifidn %1,ss + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 +%else + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 +%ifidn %1,pp + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 
+%elifidn %1, sp + psrad m8, 10 + psrad m9, 10 + psrad m10, 10 + psrad m11, 10 +%else + psrad m8, 2 + psrad m9, 2 + psrad m10, 2 + psrad m11, 2 +%endif +%endif + + packssdw m8, m9 + packssdw m10, m11 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b +%ifidn %1,pp + CLIPW m8, m12, m13 + CLIPW m10, m12, m13 +%elifidn %1, sp + CLIPW m8, m12, m13 + CLIPW m10, m12, m13 +%endif + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + add r2, 16 + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16x12 pp +FILTER_VER_LUMA_AVX2_16x12 ps +FILTER_VER_LUMA_AVX2_16x12 sp +FILTER_VER_LUMA_AVX2_16x12 ss + +%macro FILTER_VER_LUMA_AVX2_4x8 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + mova m7, [pd_524800] +%else + vbroadcasti128 m7, [pd_n32768] +%endif + lea r6, [r3 * 3] + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + 
pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + +%ifidn %1,ss + psrad m0, 6 + psrad m2, 6 +%else + paddd m0, m7 + paddd m2, m7 +%ifidn %1,pp + psrad m0, 6 + psrad m2, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m2, 10 +%else + psrad m0, 2 + psrad m2, 2 +%endif +%endif + + packssdw m0, m2 + pxor m6, m6 + mova m3, [pw_pixel_max] +%ifidn %1,pp + CLIPW m0, m6, m3 +%elifidn %1, sp + CLIPW m0, m6, m3 +%endif + + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [C B B A] + pmaddwd m2, m5, [r5 + 3 * mmsize] + paddd m4, m2 + pmaddwd m5, [r5 + 2 * mmsize] + paddd m1, m5 + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm5, [r0 + 2 * r1] + punpcklwd xm2, xm5 + vinserti128 m0, m0, xm2, 1 ; m0 = [E D D C] + pmaddwd m0, [r5 + 3 * mmsize] + paddd m1, m0 + +%ifidn %1,ss + psrad m4, 6 + psrad m1, 6 +%else + paddd m4, m7 + paddd m1, m7 +%ifidn %1,pp + psrad m4, 6 + psrad m1, 6 +%elifidn %1, sp + psrad m4, 10 + psrad m1, 10 +%else + psrad m4, 2 + psrad m1, 2 +%endif +%endif + + packssdw m4, m1 +%ifidn %1,pp + CLIPW m4, m6, m3 +%elifidn %1, sp + CLIPW m4, m6, m3 +%endif + + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 + RET +%endmacro + +FILTER_VER_LUMA_AVX2_4x8 pp +FILTER_VER_LUMA_AVX2_4x8 ps +FILTER_VER_LUMA_AVX2_4x8 sp +FILTER_VER_LUMA_AVX2_4x8 ss + +%macro PROCESS_LUMA_AVX2_W4_16R 1 
+ movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] + +%ifidn %1,ss + psrad m0, 6 + psrad m2, 6 +%else + paddd m0, m7 + paddd m2, m7 +%ifidn %1,pp + psrad m0, 6 + psrad m2, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m2, 10 +%else + psrad m0, 2 + psrad m2, 2 +%endif +%endif + + packssdw m0, m2 + pxor m3, m3 +%ifidn %1,pp + CLIPW m0, m3, [pw_pixel_max] +%elifidn %1, sp + CLIPW m0, m3, [pw_pixel_max] +%endif + + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 3 * mmsize] + paddd m4, m2 + pmaddwd m2, m5, [r5 + 2 * 
mmsize] + paddd m1, m2 + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m1, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m6, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] + +%ifidn %1,ss + psrad m4, 6 + psrad m1, 6 +%else + paddd m4, m7 + paddd m1, m7 +%ifidn %1,pp + psrad m4, 6 + psrad m1, 6 +%elifidn %1, sp + psrad m4, 10 + psrad m1, 10 +%else + psrad m4, 2 + psrad m1, 2 +%endif +%endif + + packssdw m4, m1 + pxor m2, m2 +%ifidn %1,pp + CLIPW m4, m2, [pw_pixel_max] +%elifidn %1, sp + CLIPW m4, m2, [pw_pixel_max] +%endif + + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 + + movq xm4, [r0 + r4] + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 3 * mmsize] + paddd m6, m4 + pmaddwd m4, m3, [r5 + 2 * mmsize] + paddd m5, m4 + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m4, m1, [r5 + 3 * mmsize] + paddd m5, m4 + pmaddwd m4, m1, [r5 + 2 * mmsize] + paddd m0, m4 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 + +%ifidn %1,ss + psrad m6, 6 + psrad m5, 6 +%else + paddd m6, m7 + paddd m5, m7 +%ifidn %1,pp + psrad m6, 6 + psrad m5, 6 +%elifidn %1, sp + psrad m6, 10 + psrad m5, 10 +%else + psrad m6, 2 + psrad m5, 2 +%endif +%endif + + packssdw m6, m5 + pxor m1, m1 +%ifidn %1,pp + CLIPW m6, m1, [pw_pixel_max] +%elifidn %1, sp + CLIPW m6, m1, [pw_pixel_max] +%endif + + vextracti128 xm5, m6, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + 
movhps [r2 + r6], xm5 + + movq xm4, [r0 + r4] + punpcklwd xm2, xm4 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm4, xm6 + vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m3, m2 + movq xm4, [r0 + r1] + punpcklwd xm6, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m6, m6, xm4, 1 ; m6 = [22 21 21 20] + pmaddwd m6, [r5 + 3 * mmsize] + paddd m3, m6 + +%ifidn %1,ss + psrad m0, 6 + psrad m3, 6 +%else + paddd m0, m7 + paddd m3, m7 +%ifidn %1,pp + psrad m0, 6 + psrad m3, 6 +%elifidn %1, sp + psrad m0, 10 + psrad m3, 10 +%else + psrad m0, 2 + psrad m3, 2 +%endif +%endif + + packssdw m0, m3 +%ifidn %1,pp + CLIPW m0, m1, [pw_pixel_max] +%elifidn %1, sp + CLIPW m0, m1, [pw_pixel_max] +%endif + + vextracti128 xm3, m0, 1 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 +%endmacro + +%macro FILTER_VER_LUMA_AVX2_4x16 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m7, [pd_32] +%elifidn %1, sp + mova m7, [pd_524800] +%else + vbroadcasti128 m7, [pd_n32768] +%endif + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W4_16R %1 + RET +%endmacro + +FILTER_VER_LUMA_AVX2_4x16 pp +FILTER_VER_LUMA_AVX2_4x16 ps +FILTER_VER_LUMA_AVX2_4x16 sp +FILTER_VER_LUMA_AVX2_4x16 ss + +%macro FILTER_VER_LUMA_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + add r3d, r3d + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + vbroadcasti128 m14, [pd_32] +%elifidn %1, sp + mova m14, [pd_524800] +%else + vbroadcasti128 
m14, [pd_n32768] +%endif + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W8_16R %1 + add r2, 16 + add r0, 16 + mova m7, m14 + PROCESS_LUMA_AVX2_W4_16R %1 + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_12x16 pp +FILTER_VER_LUMA_AVX2_12x16 ps +FILTER_VER_LUMA_AVX2_12x16 sp +FILTER_VER_LUMA_AVX2_12x16 ss + ;--------------------------------------------------------------------------------------------------------------- ; void interp_8tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;--------------------------------------------------------------------------------------------------------------- diff -Nru x265-1.5/source/common/x86/ipfilter8.asm x265-1.6/source/common/x86/ipfilter8.asm --- x265-1.5/source/common/x86/ipfilter8.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/ipfilter8.asm 2015-04-02 16:46:36.000000000 +0000 @@ -35,10 +35,20 @@ const interp4_vpp_shuf, times 2 db 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15 ALIGN 32 +const interp_vert_shuf, times 2 db 0, 2, 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9 + times 2 db 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11, 13 + +ALIGN 32 const interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 dd 2, 3, 3, 4, 4, 5, 5, 6 ALIGN 32 +const pb_8tap_hps_0, times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 + times 2 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10 + times 2 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12 + times 2 db 6, 7, 7, 8, 8, 9, 9,10,10,11,11,12,12,13,13,14 + +ALIGN 32 tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10 db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12 @@ -51,6 +61,8 @@ tab_c_526336: times 4 dd 8192*64+2048 +pd_526336: times 8 dd 8192*64+2048 + tab_ChromaCoeff: db 0, 64, 0, 0 db -2, 58, 10, -2 db -4, 54, 16, -2 @@ -59,6 +71,30 @@ db -4, 28, 46, -6 db -2, 16, 54, -4 db -2, 10, 58, -2 +ALIGN 32 +tab_ChromaCoeff_V: times 8 db 0, 64 + times 8 db 0, 0 + + 
times 8 db -2, 58 + times 8 db 10, -2 + + times 8 db -4, 54 + times 8 db 16, -2 + + times 8 db -6, 46 + times 8 db 28, -4 + + times 8 db -4, 36 + times 8 db 36, -4 + + times 8 db -4, 28 + times 8 db 46, -6 + + times 8 db -2, 16 + times 8 db 54, -4 + + times 8 db -2, 10 + times 8 db 58, -2 tab_ChromaCoeffV: times 4 dw 0, 64 times 4 dw 0, 0 @@ -84,6 +120,31 @@ times 4 dw -2, 10 times 4 dw 58, -2 +ALIGN 32 +pw_ChromaCoeffV: times 8 dw 0, 64 + times 8 dw 0, 0 + + times 8 dw -2, 58 + times 8 dw 10, -2 + + times 8 dw -4, 54 + times 8 dw 16, -2 + + times 8 dw -6, 46 + times 8 dw 28, -4 + + times 8 dw -4, 36 + times 8 dw 36, -4 + + times 8 dw -4, 28 + times 8 dw 46, -6 + + times 8 dw -2, 16 + times 8 dw 54, -4 + + times 8 dw -2, 10 + times 8 dw 58, -2 + tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0 db -1, 4, -10, 58, 17, -5, 1, 0 db -1, 4, -11, 40, 40, -11, 4, -1 @@ -109,6 +170,47 @@ times 4 dw 58, -10 times 4 dw 4, -1 +ALIGN 32 +pw_LumaCoeffVer: times 8 dw 0, 0 + times 8 dw 0, 64 + times 8 dw 0, 0 + times 8 dw 0, 0 + + times 8 dw -1, 4 + times 8 dw -10, 58 + times 8 dw 17, -5 + times 8 dw 1, 0 + + times 8 dw -1, 4 + times 8 dw -11, 40 + times 8 dw 40, -11 + times 8 dw 4, -1 + + times 8 dw 0, 1 + times 8 dw -5, 17 + times 8 dw 58, -10 + times 8 dw 4, -1 + +pb_LumaCoeffVer: times 16 db 0, 0 + times 16 db 0, 64 + times 16 db 0, 0 + times 16 db 0, 0 + + times 16 db -1, 4 + times 16 db -10, 58 + times 16 db 17, -5 + times 16 db 1, 0 + + times 16 db -1, 4 + times 16 db -11, 40 + times 16 db 40, -11 + times 16 db 4, -1 + + times 16 db 0, 1 + times 16 db -5, 17 + times 16 db 58, -10 + times 16 db 4, -1 + tab_LumaCoeffVer: times 8 db 0, 0 times 8 db 0, 64 times 8 db 0, 0 @@ -183,6 +285,15 @@ interp4_horiz_shuf1: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +ALIGN 32 +interp4_hpp_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + +ALIGN 32 +interp8_hps_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7 + +ALIGN 32 
+interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 + SECTION .text cextern pb_128 @@ -913,6 +1024,105 @@ pextrd [r2+r0], xm3, 3 RET +%macro FILTER_HORIZ_LUMA_AVX2_4xN 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_pp_4x%1, 4, 6, 9 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastq m0, [r5 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + + mova m1, [tab_Lm] + mova m2, [pw_1] + mova m7, [interp8_hps_shuf] + mova m8, [pw_512] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + lea r4, [r1 * 3] + lea r5, [r3 * 3] + sub r0, 3 +%rep %1 / 8 + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 2-3 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + + packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A] + lea r0, [r0 + r1 * 4] + ; Row 4-5 + vbroadcasti128 m5, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + phaddd m5, m4 ; DWORD [R5D R5C R4D R4C R5B R5A R4B R4A] + + ; Row 6-7 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m6, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m6, m1 + pmaddubsw m6, m0 + pmaddwd m6, m2 + phaddd m4, m6 ; DWORD 
[R7D R7C R6D R6C R7B R7A R6B R6A] + + packssdw m5, m4 ; WORD [R7D R7C R6D R6C R5D R5C R4D R4C R7B R7A R6B R6A R5B R5A R4B R4A] + vpermd m3, m7, m3 + vpermd m5, m7, m5 + pmulhrsw m3, m8 + pmulhrsw m5, m8 + packuswb m3, m5 + vextracti128 xm5, m3, 1 + + movd [r2], xm3 + pextrd [r2 + r3], xm3, 1 + movd [r2 + r3 * 2], xm5 + pextrd [r2 + r5], xm5, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm3, 2 + pextrd [r2 + r3], xm3, 3 + pextrd [r2 + r3 * 2], xm5, 2 + pextrd [r2 + r5], xm5, 3 + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] +%endrep + RET +%endif +%endmacro + +FILTER_HORIZ_LUMA_AVX2_4xN 8 +FILTER_HORIZ_LUMA_AVX2_4xN 16 + INIT_YMM avx2 cglobal interp_8tap_horiz_pp_8x4, 4, 6, 7 mov r4d, r4m @@ -1455,6 +1665,89 @@ pextrd [r2+r0], xm3, 3 RET +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_2x4, 4, 6, 3 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + pshufb m1, [interp4_hpp_shuf] + pmaddubsw m1, m0 + pmaddwd m1, [pw_1] + vextracti128 xm2, m1, 1 + packssdw xm1, xm2 + pmulhrsw xm1, [pw_512] + packuswb xm1, xm1 + + lea r4, [r3 * 3] + pextrw [r2], xm1, 0 + pextrw [r2 + r3], xm1, 1 + pextrw [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r4], xm1, 3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + mova m4, [interp4_hpp_shuf] + mova m5, [pw_1] + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw 
m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 + + lea r4, [r3 * 3] + pextrw [r2], xm1, 0 + pextrw [r2 + r3], xm1, 1 + pextrw [r2 + r3 * 2], xm1, 4 + pextrw [r2 + r4], xm1, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm1, 2 + pextrw [r2 + r3], xm1, 3 + pextrw [r2 + r3 * 2], xm1, 6 + pextrw [r2 + r4], xm1, 7 + RET + INIT_YMM avx2 cglobal interp_4tap_horiz_pp_32x32, 4,6,7 mov r4d, r4m @@ -1664,6 +1957,461 @@ IPFILTER_LUMA_64x_avx2 64 , 32 IPFILTER_LUMA_64x_avx2 64 , 16 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x2, 4, 6, 5 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [tab_Tm] + mova m2, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + pshufd xm3, xm3, 11011000b + movq [r2], xm3 + movhps [r2 + r3], xm3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_8x6, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [tab_Tm] + mova m2, [pw_1] + mova m6, [pw_512] + lea r4, [r1 * 3] + lea r5, [r3 * 3] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw 
m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movhps [r2 + r3], xm3 + movq [r2 + r3 * 2], xm4 + movhps [r2 + r5], xm4 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + ; Row 4 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 5 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 + pshufd xm3, xm3, 11011000b + movq [r2], xm3 + movhps [r2 + r3], xm3 + RET + +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_6x8, 4, 6, 7 + mov r4d, r4m + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + mova m1, [tab_Tm] + mova m2, [pw_1] + mova m6, [pw_512] + lea r4, [r1 * 3] + lea r5, [r3 * 3] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + + dec r0 +%rep 2 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 + + ; Row 2 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + ; Row 3 + vbroadcasti128 m5, [r0 + r4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + 
packssdw m4, m5 + pmulhrsw m4, m6 + + packuswb m3, m4 + vextracti128 xm4, m3, 1 + movd [r2], xm3 + pextrw [r2 + 4], xm4, 0 + pextrd [r2 + r3], xm3, 1 + pextrw [r2 + r3 + 4], xm4, 2 + pextrd [r2 + r3 * 2], xm3, 2 + pextrw [r2 + r3 * 2 + 4], xm4, 4 + pextrd [r2 + r5], xm3, 3 + pextrw [r2 + r5 + 4], xm4, 6 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] +%endrep + RET + +;----------------------------------------------------------------------------------------------------------------------------- +;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- + +%macro IPFILTER_LUMA_PS_4xN_AVX2 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_4x%1, 6,7,6 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + mova m1, [tab_Lm] + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + mov r5d, %1 ; loop count variable - height + jz .preloop + lea r6, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src) - 3 * srcStride + add r5d, 7 ; need extra 7 rows, just set a specially flag here, blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) + +.preloop: + lea r6, [r3 * 3] +.loop + ; Row 0-1 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 ; shuffled based on the col order tab_Lm + pmaddubsw m3, m0 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 2-3 + lea r0, [r0 + r1 * 2] ;3rd row(i.e 2nd row) + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw 
m4, m0 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddw m3, m4 ; all rows and col completed. + + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + psubw m3, m2 + + vextracti128 xm4, m3, 1 + movq [r2], xm3 ;row 0 + movhps [r2 + r3], xm3 ;row 1 + movq [r2 + r3 * 2], xm4 ;row 2 + movhps [r2 + r6], xm4 ;row 3 + + lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3 * 4] ; first loop dst ->5th row(i.e 4) + sub r5d, 4 + jz .end + cmp r5d, 4 + jge .loop + + ; Row 8-9 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + ; Row 10 + vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + phaddw m4, m4 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddw m3, m4 + + vpermd m3, m5, m3 ; m5 don't broken in above + psubw m3, m2 + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movhps [r2 + r3], xm3 + movq [r2 + r3 * 2], xm4 +.end + RET +%endif +%endmacro + + IPFILTER_LUMA_PS_4xN_AVX2 4 + IPFILTER_LUMA_PS_4xN_AVX2 8 + IPFILTER_LUMA_PS_4xN_AVX2 16 + +%macro IPFILTER_LUMA_PS_8xN_AVX2 1 +; TODO: verify and enable on X86 mode +%if ARCH_X86_64 == 1 +; void filter_hps(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_8x%1, 4,7,6 + mov r5d, r5m + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r6, [pb_LumaCoeffVer] + add r6, r4 +%else + lea r6, [pb_LumaCoeffVer + r4] +%endif + add r3d, r3d + vpbroadcastd m0, [pw_2000] + sub r0, 3 + lea r4, [pb_8tap_hps_0] + vbroadcasti128 m5, [r4 + 0 * mmsize] + + ; check row count extend for interpolateHV + test r5d, r5d; + mov r5d, %1 + jz .enter_loop + lea r4, [r1 * 3] ; r8 
= (N / 2 - 1) * srcStride + sub r0, r4 ; r0(src)-r8 + add r5d, 8-1-2 ; blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop) + +.enter_loop: + lea r4, [pb_8tap_hps_0] + + ; ***** register map ***** + ; m0 - pw_2000 + ; r4 - base pointer of shuffle order table + ; r5 - count of loop + ; r6 - point to LumaCoeff +.loop: + + ; Row 0-1 + movu xm1, [r0] + movu xm2, [r0 + r1] + vinserti128 m1, m1, xm2, 1 + pshufb m2, m1, m5 ; [0 1 1 2 2 3 3 4 ...] + pshufb m3, m1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...] + pshufb m4, m1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...] + pshufb m1, m1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...] + pmaddubsw m2, [r6 + 0 * mmsize] + pmaddubsw m3, [r6 + 1 * mmsize] + pmaddubsw m4, [r6 + 2 * mmsize] + pmaddubsw m1, [r6 + 3 * mmsize] + paddw m2, m3 + paddw m1, m4 + paddw m1, m2 + psubw m1, m0 + + vextracti128 xm2, m1, 1 + movu [r2], xm1 ; row 0 + movu [r2 + r3], xm2 ; row 1 + + lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3 * 2] ; first loop dst ->5th row(i.e 4) + sub r5d, 2 + jg .loop + jz .end + + ; last row + movu xm1, [r0] + pshufb xm2, xm1, xm5 ; [0 1 1 2 2 3 3 4 ...] + pshufb xm3, xm1, [r4 + 1 * mmsize] ; [2 3 3 4 4 5 5 6 ...] + pshufb xm4, xm1, [r4 + 2 * mmsize] ; [4 5 5 6 6 7 7 8 ...] + pshufb xm1, xm1, [r4 + 3 * mmsize] ; [6 7 7 8 8 9 9 A ...] 
+ pmaddubsw xm2, [r6 + 0 * mmsize] + pmaddubsw xm3, [r6 + 1 * mmsize] + pmaddubsw xm4, [r6 + 2 * mmsize] + pmaddubsw xm1, [r6 + 3 * mmsize] + paddw xm2, xm3 + paddw xm1, xm4 + paddw xm1, xm2 + psubw xm1, xm0 + movu [r2], xm1 ;row 0 +.end + RET +%endif +%endmacro ; IPFILTER_LUMA_PS_8xN_AVX2 + +IPFILTER_LUMA_PS_8xN_AVX2 4 +IPFILTER_LUMA_PS_8xN_AVX2 8 +IPFILTER_LUMA_PS_8xN_AVX2 16 +IPFILTER_LUMA_PS_8xN_AVX2 32 + + +%macro IPFILTER_LUMA_PS_16x_AVX2 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_horiz_ps_%1x%2, 6, 10, 7 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + mova m6, [tab_Lm + 32] + mova m1, [tab_Lm] + mov r9, %2 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + xor r7, r7 ; loop count variable + sub r0, 3 + test r5d, r5d + jz .label + lea r8, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride + sub r0, r8 ; r0(src)-r8 + add r9, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) + +.label + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A] + + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m6 ;row 1 (col 4 to 7) + pshufb m4, m1 ;row 1 (col 0 to 3) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + phaddw m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A] + phaddw m3, m4 ; all rows and col completed. 
+ + mova m5, [interp8_hps_shuf] + vpermd m3, m5, m3 + psubw m3, m2 + + movu [r2], m3 ;row 0 + + lea r0, [r0 + r1] ; first loop src ->5th row(i.e 4) + lea r2, [r2 + r3] ; first loop dst ->5th row(i.e 4) + dec r9d + jnz .label + +RET +%endif +%endmacro + + +IPFILTER_LUMA_PS_16x_AVX2 16 , 16 +IPFILTER_LUMA_PS_16x_AVX2 16 , 8 +IPFILTER_LUMA_PS_16x_AVX2 16 , 12 +IPFILTER_LUMA_PS_16x_AVX2 16 , 4 +IPFILTER_LUMA_PS_16x_AVX2 16 , 32 +IPFILTER_LUMA_PS_16x_AVX2 16 , 64 + + ;-------------------------------------------------------------------------------------------------------------- ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;-------------------------------------------------------------------------------------------------------------- @@ -1963,26 +2711,152 @@ RET -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W2_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 - -mov r4d, r4m -sub r0, r1 +%macro FILTER_VER_CHROMA_AVX2_2x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x4, 4, 6, 2 + mov r4d, r4m + shl r4d, 5 + sub r0, r1 %ifdef PIC -lea r5, [tab_ChromaCoeff] -movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff_V] + add r5, r4 %else -movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeff_V + r4] %endif -pshufb m0, [tab_Cm] - -mova m1, [pw_512] + lea r4, [r1 * 3] + + pinsrw xm1, [r0], 0 + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + + pshufb xm0, xm1, [interp_vert_shuf] + pshufb xm1, [interp_vert_shuf + 32] + vinserti128 m0, m0, xm1, 1 + pmaddubsw m0, [r5] + vextracti128 xm1, m0, 1 + paddw xm0, xm1 +%ifidn %1,pp + 
pmulhrsw xm0, [pw_512] + packuswb xm0, xm0 + lea r4, [r3 * 3] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 +%else + add r3d, r3d + lea r4, [r3 * 3] + psubw xm0, [pw_2000] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_2x4 pp +FILTER_VER_CHROMA_AVX2_2x4 ps + +%macro FILTER_VER_CHROMA_AVX2_2x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x8, 4, 6, 2 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + pinsrw xm1, [r0], 0 + pinsrw xm1, [r0 + r1], 1 + pinsrw xm1, [r0 + r1 * 2], 2 + pinsrw xm1, [r0 + r4], 3 + lea r0, [r0 + r1 * 4] + pinsrw xm1, [r0], 4 + pinsrw xm1, [r0 + r1], 5 + pinsrw xm1, [r0 + r1 * 2], 6 + pinsrw xm1, [r0 + r4], 7 + movhlps xm0, xm1 + lea r0, [r0 + r1 * 4] + pinsrw xm0, [r0], 4 + pinsrw xm0, [r0 + r1], 5 + pinsrw xm0, [r0 + r1 * 2], 6 + vinserti128 m1, m1, xm0, 1 + + pshufb m0, m1, [interp_vert_shuf] + pshufb m1, [interp_vert_shuf + 32] + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m0, m1 +%ifidn %1,pp + pmulhrsw m0, [pw_512] + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + lea r4, [r3 * 3] + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 4 + pextrw [r2 + r3], xm0, 5 + pextrw [r2 + r3 * 2], xm0, 6 + pextrw [r2 + r4], xm0, 7 +%else + add r3d, r3d + lea r4, [r3 * 3] + psubw m0, [pw_2000] + vextracti128 xm1, m0, 1 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + r3 * 2], xm0, 2 + pextrd [r2 + r4], xm0, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_2x8 pp +FILTER_VER_CHROMA_AVX2_2x8 ps + 
+;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W2_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m0, [tab_Cm] + +mova m1, [pw_512] mov r4d, %2 lea r5, [3 * r1] @@ -2097,6 +2971,55 @@ RET +%macro FILTER_VER_CHROMA_AVX2_4x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 4 + mov r4d, r4m + shl r4d, 5 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeff_V] + add r5, r4 +%else + lea r5, [tab_ChromaCoeff_V + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + movd xm2, [r0 + r1] + punpcklbw xm1, xm2 + movd xm3, [r0 + r1 * 2] + punpcklbw xm2, xm3 + movlhps xm1, xm2 + movd xm0, [r0 + r4] + punpcklbw xm3, xm0 + movd xm2, [r0 + r1 * 4] + punpcklbw xm0, xm2 + movlhps xm3, xm0 + vinserti128 m1, m1, xm3, 1 ; m1 = row[x x x 4 3 2 1 0] + + pmaddubsw m1, [r5] + vextracti128 xm3, m1, 1 + paddw xm1, xm3 +%ifidn %1,pp + pmulhrsw xm1, [pw_512] + packuswb xm1, xm1 + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 +%else + add r3d, r3d + psubw xm1, [pw_2000] + movq [r2], xm1 + movhps [r2 + r3], xm1 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_4x2 pp +FILTER_VER_CHROMA_AVX2_4x2 ps + ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- @@ -2167,11 +3090,10 @@ lea r2, [r2 + 2 * r3] pextrd [r2], m2, 2 pextrd [r2 + r3], m2, 3 - RET - +%macro FILTER_VER_CHROMA_AVX2_4x4 1 INIT_YMM avx2 -cglobal interp_4tap_vert_pp_4x4, 4, 6, 3 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 
3 mov r4d, r4m shl r4d, 6 sub r0, r1 @@ -2205,6 +3127,7 @@ pmaddubsw m0, [r5] pmaddubsw m1, [r5 + mmsize] paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] +%ifidn %1,pp pmulhrsw m0, [pw_512] vextracti128 xm1, m0, 1 packuswb xm0, xm1 @@ -2213,7 +3136,252 @@ pextrd [r2 + r3], xm0, 1 pextrd [r2 + r3 * 2], xm0, 2 pextrd [r2 + r5], xm0, 3 +%else + add r3d, r3d + psubw m0, [pw_2000] + vextracti128 xm1, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 +%endif + RET +%endmacro +FILTER_VER_CHROMA_AVX2_4x4 pp +FILTER_VER_CHROMA_AVX2_4x4 ps + +%macro FILTER_VER_CHROMA_AVX2_4x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x8, 4, 6, 5 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] + mova m3, [interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] + + mova m3, [interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m2, m2, m3 + pshufb m4, m4, m3 + pmaddubsw m0, [r5] + pmaddubsw m4, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m2, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + paddw m4, m2 ; m4 = WORD ROW[7 6 5 4] +%ifidn %1,pp + pmulhrsw m0, [pw_512] + pmulhrsw m4, [pw_512] + 
packuswb m0, m4 + vextracti128 xm1, m0, 1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r5], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r5], xm1, 3 +%else + add r3d, r3d + psubw m0, [pw_2000] + psubw m4, [pw_2000] + vextracti128 xm1, m0, 1 + vextracti128 xm2, m4, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r5], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_4x8 pp +FILTER_VER_CHROMA_AVX2_4x8 ps + +%macro FILTER_VER_CHROMA_AVX2_4x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_4x16, 4, 6, 9 + mov r4d, r4m + shl r4d, 6 + sub r0, r1 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 + pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] + lea r0, [r0 + r1 * 4] + movd xm4, [r0] + pinsrd xm4, [r0 + r1], 1 + pinsrd xm4, [r0 + r1 * 2], 2 + pinsrd xm4, [r0 + r4], 3 ; m4 = row[15 14 13 12] + vinserti128 m3, m3, xm4, 1 ; m3 = row[15 14 13 12 11 10 9 8] + lea r0, [r0 + r1 * 4] + movd xm5, [r0] + pinsrd xm5, [r0 + r1], 1 + pinsrd xm5, [r0 + r1 * 2], 2 ; m5 = row[x 18 17 16] + vinserti128 m4, m4, xm5, 1 ; m4 = row[x 18 17 16 15 14 13 12] + mova m5, [interp4_vpp_shuf1] + vpermd m0, m5, 
m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m6, m5, m2 ; m6 = row[8 7 7 6 6 5 5 4] + vpermd m7, m5, m3 ; m7 = row[12 11 11 10 10 9 9 8] + vpermd m8, m5, m4 ; m8 = row[16 15 15 14 14 13 13 12] + mova m5, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m5, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m5, m2 ; m2 = row[10 9 9 8 8 7 7 6] + vpermd m3, m5, m3 ; m3 = row[14 13 13 12 12 11 11 10] + vpermd m4, m5, m4 ; m4 = row[18 17 17 16 16 15 15 14] + + mova m5, [interp4_vpp_shuf] + pshufb m0, m0, m5 + pshufb m1, m1, m5 + pshufb m2, m2, m5 + pshufb m4, m4, m5 + pshufb m3, m3, m5 + pshufb m6, m6, m5 + pshufb m7, m7, m5 + pshufb m8, m8, m5 + pmaddubsw m0, [r5] + pmaddubsw m6, [r5] + pmaddubsw m7, [r5] + pmaddubsw m8, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m2, [r5 + mmsize] + pmaddubsw m3, [r5 + mmsize] + pmaddubsw m4, [r5 + mmsize] + paddw m0, m1 ; m0 = WORD ROW[3 2 1 0] + paddw m6, m2 ; m6 = WORD ROW[7 6 5 4] + paddw m7, m3 ; m7 = WORD ROW[11 10 9 8] + paddw m8, m4 ; m8 = WORD ROW[15 14 13 12] +%ifidn %1,pp + mova m5, [pw_512] + pmulhrsw m0, m5 + pmulhrsw m6, m5 + pmulhrsw m7, m5 + pmulhrsw m8, m5 + packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m7, 1 + lea r5, [r3 * 3] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r5], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r5], xm1, 3 + lea r2, [r2 + r3 * 4] + movd [r2], xm7 + pextrd [r2 + r3], xm7, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r5], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm7, 2 + pextrd [r2 + r3], xm7, 3 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r5], xm2, 3 +%else + add r3d, r3d + mova m5, [pw_2000] + psubw m0, m5 + psubw m6, m5 + psubw m7, m5 + psubw m8, m5 + vextracti128 xm1, m0, 1 + vextracti128 xm2, m6, 1 + vextracti128 xm3, m7, 1 + vextracti128 xm4, m8, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm1 + movhps [r2 + 
r5], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm6 + movhps [r2 + r3], xm6 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm7 + movhps [r2 + r3], xm7 + movq [r2 + r3 * 2], xm3 + movhps [r2 + r5], xm3 + lea r2, [r2 + r3 * 4] + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm4 + movhps [r2 + r5], xm4 +%endif RET +%endif +%endmacro + +FILTER_VER_CHROMA_AVX2_4x16 pp +FILTER_VER_CHROMA_AVX2_4x16 ps ;----------------------------------------------------------------------------- ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) @@ -3462,8 +4630,9 @@ paddw m4, m0 %endmacro +%macro FILTER_VER_CHROMA_AVX2_8x8 1 INIT_YMM avx2 -cglobal interp_4tap_vert_pp_8x8, 4, 6, 7 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 7 mov r4d, r4m shl r4d, 6 @@ -3477,6 +4646,7 @@ lea r4, [r1 * 3] sub r0, r1 PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp lea r4, [r3 * 3] mova m3, [pw_512] pmulhrsw m5, m3 ; m5 = word: row 0, row 1 @@ -3496,80 +4666,609 @@ movq [r2 + r3], xm4 movhps [r2 + r3 * 2], xm1 movhps [r2 + r4], xm4 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 +%endif RET +%endmacro -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W6_H4 2 -INIT_XMM sse4 -cglobal 
interp_4tap_vert_pp_6x%2, 4, 6, 8 +FILTER_VER_CHROMA_AVX2_8x8 pp +FILTER_VER_CHROMA_AVX2_8x8 ps -mov r4d, r4m -sub r0, r1 +%macro FILTER_VER_CHROMA_AVX2_8x6 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x6, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 %ifdef PIC -lea r5, [tab_ChromaCoeff] -movd m5, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else -movd m5, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif -pshufb m6, m5, [tab_Vm] -pshufb m5, [tab_Vm + 16] -mova m4, [pw_512] - -mov r4d, %2 -lea r5, [3 * r1] - -.loop: -movq m0, [r0] -movq m1, [r0 + r1] -movq m2, [r0 + 2 * r1] -movq m3, [r0 + r5] - -punpcklbw m0, m1 -punpcklbw m1, m2 -punpcklbw m2, m3 + lea r4, [r1 * 3] + sub r0, r1 -pmaddubsw m0, m6 -pmaddubsw m7, m2, m5 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 
50 40] + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m4, [r5 + 1 * mmsize] + paddw m1, m4 +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + packuswb m5, m2 + packuswb m1, m1 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 +%else + add r3d, r3d + mova m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + vextracti128 xm4, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm4 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 +%endif + RET +%endmacro -paddw m0, m7 +FILTER_VER_CHROMA_AVX2_8x6 pp +FILTER_VER_CHROMA_AVX2_8x6 ps -pmulhrsw m0, m4 -packuswb m0, m0 -movd [r2], m0 -pextrw [r2 + 4], m0, 2 +%macro PROCESS_CHROMA_AVX2_W8_16R 1 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + 
movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm5, m1, 1 + vextracti128 xm3, m4, 1 + movu [r2], xm1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm3 +%endif + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r0, [r0 + r1 * 4] + movq xm5, [r0] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r0 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + 
vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] + movq xm3, [r0 + r4] ; m3 = row 15 + punpcklbw xm2, xm3 + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 16 + punpcklbw xm3, xm1 + vinserti128 m2, m2, xm3, 1 + pmaddubsw m3, m2, [r5 + 1 * mmsize] + paddw m5, m3 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 17 + punpcklbw xm1, xm3 + movq xm4, [r0 + r1 * 2] ; m4 = row 18 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 + lea r2, [r2 + r3 * 4] +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + pmulhrsw m5, m7 ; m5 = word: row 12, row 13 + pmulhrsw m2, m7 ; m2 = word: row 14, row 15 + packuswb m0, m6 + packuswb m5, m2 + vextracti128 xm6, m0, 1 + vextracti128 xm2, m5, 1 + movq [r2], xm0 + movq [r2 + r3], xm6 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm6 + lea r2, [r2 + r3 * 4] + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + psubw m5, m7 ; m5 = word: row 12, row 13 + psubw m2, m7 ; m2 = word: row 14, row 15 + vextracti128 xm1, m0, 1 + vextracti128 xm3, m6, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + vextracti128 xm1, m5, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm5 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif +%endmacro -lea r0, [r0 + 4 * r1] +%macro FILTER_VER_CHROMA_AVX2_8x16 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 -movq m0, [r0] -punpcklbw m3, m0 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif -pmaddubsw m1, m6 -pmaddubsw m7, m3, m5 + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + 
lea r6, [r3 * 3] + PROCESS_CHROMA_AVX2_W8_16R %1 + RET +%endmacro -paddw m1, m7 +FILTER_VER_CHROMA_AVX2_8x16 pp +FILTER_VER_CHROMA_AVX2_8x16 ps -pmulhrsw m1, m4 -packuswb m1, m1 -movd [r2 + r3], m1 -pextrw [r2 + r3 + 4], m1, 2 +%macro FILTER_VER_CHROMA_AVX2_8x32 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x32, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 -movq m1, [r0 + r1] -punpcklbw m7, m0, m1 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif -pmaddubsw m2, m6 -pmaddubsw m7, m5 + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] +%rep 2 + PROCESS_CHROMA_AVX2_W8_16R %1 + lea r2, [r2 + r3 * 4] +%endrep + RET +%endmacro -paddw m2, m7 +FILTER_VER_CHROMA_AVX2_8x32 pp +FILTER_VER_CHROMA_AVX2_8x32 ps -pmulhrsw m2, m4 -packuswb m2, m2 -lea r2, [r2 + 2 * r3] -movd [r2], m2 -pextrw [r2 + 4], m2, 2 +%macro PROCESS_CHROMA_AVX2_W8_4R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m0, m1, xm2, 1 ; m0 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m0, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = 
row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m1, [r5 + 1 * mmsize] + paddw m2, m1 +%endmacro + +%macro FILTER_VER_CHROMA_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_4R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m0, m3 ; m0 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m0, m3 ; m0 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + vextracti128 xm1, m0, 1 + vextracti128 xm4, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm4 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_8x4 pp +FILTER_VER_CHROMA_AVX2_8x4 ps + +%macro FILTER_VER_CHROMA_AVX2_8x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 4 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m1, [r5] + movq xm2, [r0 + r4] ; m2 = row 3 + punpcklbw xm3, xm2 ; m3 = 
[37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + movq xm0, [r0 + r1 * 4] ; m0 = row 4 + punpcklbw xm2, xm0 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m3, m3, xm2, 1 ; m3 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 +%ifidn %1,pp + pmulhrsw m1, [pw_512] ; m1 = word: row 0, row 1 + packuswb m1, m1 + vextracti128 xm0, m1, 1 + movq [r2], xm1 + movq [r2 + r3], xm0 +%else + add r3d, r3d + psubw m1, [pw_2000] ; m1 = word: row 0, row 1 + vextracti128 xm0, m1, 1 + movu [r2], xm1 + movu [r2 + r3], xm0 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_8x2 pp +FILTER_VER_CHROMA_AVX2_8x2 ps + +%macro FILTER_VER_CHROMA_AVX2_6x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 + PROCESS_CHROMA_AVX2_W8_8R +%ifidn %1,pp + lea r4, [r3 * 3] + mova m3, [pw_512] + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movd [r2], xm5 + pextrw [r2 + 4], xm5, 2 + movd [r2 + r3], xm2 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm5, 2 + pextrw [r2 + r3 * 2 + 4], xm5, 6 + pextrd [r2 + r4], xm2, 2 + pextrw [r2 + r4 + 4], xm2, 6 + lea r2, [r2 + r3 * 4] + movd [r2], xm1 + pextrw [r2 + 4], xm1, 2 + movd [r2 + r3], xm4 + pextrw [r2 + r3 + 4], xm4, 2 + pextrd [r2 + r3 * 2], xm1, 2 + pextrw [r2 + r3 * 2 + 4], xm1, 6 + pextrd [r2 + r4], xm4, 2 + pextrw [r2 + r4 + 4], xm4, 6 +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] + lea r4, [r3 * 3] + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 
5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movq [r2], xm5 + pextrd [r2 + 8], xm5, 2 + movq [r2 + r3], xm6 + pextrd [r2 + r3 + 8], xm6, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r4], xm3 + pextrd [r2 + r4 + 8], xm3, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + pextrd [r2 + 8], xm1, 2 + movq [r2 + r3], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movq [r2 + r3 * 2], xm4 + pextrd [r2 + r3 * 2 + 8], xm4, 2 + vextracti128 xm4, m4, 1 + movq [r2 + r4], xm4 + pextrd [r2 + r4 + 8], xm4, 2 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_6x8 pp +FILTER_VER_CHROMA_AVX2_6x8 ps + +;----------------------------------------------------------------------------- +;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8 + +mov r4d, r4m +sub r0, r1 + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m5, [r5 + r4 * 4] +%else +movd m5, [tab_ChromaCoeff + r4 * 4] +%endif + +pshufb m6, m5, [tab_Vm] +pshufb m5, [tab_Vm + 16] +mova m4, [pw_512] + +mov r4d, %2 +lea r5, [3 * r1] + +.loop: +movq m0, [r0] +movq m1, [r0 + r1] +movq m2, [r0 + 2 * r1] +movq m3, [r0 + r5] + +punpcklbw m0, m1 +punpcklbw m1, m2 +punpcklbw m2, m3 + +pmaddubsw m0, m6 +pmaddubsw m7, m2, m5 + +paddw m0, m7 + +pmulhrsw m0, m4 +packuswb m0, m0 +movd [r2], m0 +pextrw [r2 + 4], m0, 2 + +lea r0, [r0 + 4 * r1] + +movq m0, [r0] +punpcklbw m3, m0 + +pmaddubsw m1, m6 +pmaddubsw m7, m3, m5 + +paddw m1, m7 + +pmulhrsw m1, m4 +packuswb m1, m1 +movd [r2 + r3], m1 +pextrw [r2 + r3 + 4], m1, 2 + +movq m1, [r0 + r1] +punpcklbw m7, m0, m1 + +pmaddubsw m2, m6 +pmaddubsw m7, m5 + +paddw m2, m7 + +pmulhrsw m2, m4 +packuswb m2, m2 +lea r2, [r2 + 2 * r3] +movd [r2], m2 +pextrw [r2 + 4], m2, 2 movq m2, [r0 + 2 * r1] punpcklbw m1, 
m2 @@ -3779,9 +5478,10 @@ FILTER_V4_W16_H2 16, 24 FILTER_V4_W16_H2 16, 64 +%macro FILTER_VER_CHROMA_AVX2_16x16 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_pp_16x16, 4, 6, 15 +cglobal interp_4tap_vert_%1_16x16, 4, 6, 15 mov r4d, r4m shl r4d, 6 @@ -3796,8 +5496,13 @@ mova m13, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 - lea r5, [r3 * 3] +%ifidn %1,pp mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r5, [r3 * 3] movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 @@ -3869,6 +5574,7 @@ paddw m7, m11 pmaddubsw m9, m12 +%ifidn %1,pp pmulhrsw m0, m14 ; m0 = word: row 0 pmulhrsw m1, m14 ; m1 = word: row 1 pmulhrsw m2, m14 ; m2 = word: row 2 @@ -3898,6 +5604,25 @@ movu [r2 + r3], xm5 movu [r2 + r3 * 2], xm6 movu [r2 + r5], xm7 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 + movu [r2 + r3 * 2], m6 + movu [r2 + r5], m7 +%endif lea r2, [r2 + r3 * 4] movu xm11, [r0 + r4] ; m11 = row 11 @@ -3958,6 +5683,7 @@ pmaddubsw m3, m13 paddw m1, m3 +%ifidn %1,pp pmulhrsw m8, m14 ; m8 = word: row 8 pmulhrsw m9, m14 ; m9 = word: row 9 pmulhrsw m10, m14 ; m10 = word: row 10 @@ -3987,233 +5713,1162 @@ movu [r2 + r3], xm7 movu [r2 + r3 * 2], xm0 movu [r2 + r5], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m6, m14 ; m6 = word: row 12 + psubw m7, m14 ; m7 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r5], m11 + lea r2, 
[r2 + r3 * 4] + movu [r2], m6 + movu [r2 + r3], m7 + movu [r2 + r3 * 2], m0 + movu [r2 + r5], m1 +%endif RET %endif +%endmacro -;----------------------------------------------------------------------------- -;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------- -%macro FILTER_V4_W24 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 - -mov r4d, r4m -sub r0, r1 +FILTER_VER_CHROMA_AVX2_16x16 pp +FILTER_VER_CHROMA_AVX2_16x16 ps +%macro FILTER_VER_CHROMA_AVX2_16x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_16x8, 4, 7, 7 + mov r4d, r4m + shl r4d, 6 %ifdef PIC -lea r5, [tab_ChromaCoeff] -movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 %else -movd m0, [tab_ChromaCoeff + r4 * 4] + lea r5, [tab_ChromaCoeffVer_32 + r4] %endif -pshufb m1, m0, [tab_Vm] -pshufb m0, [tab_Vm + 16] - -mov r4d, %2 - -.loop: -movu m2, [r0] -movu m3, [r0 + r1] - -punpcklbw m4, m2, m3 -punpckhbw m2, m3 - -pmaddubsw m4, m1 -pmaddubsw m2, m1 - -lea r5, [r0 + 2 * r1] -movu m5, [r5] -movu m7, [r5 + r1] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m6, [pw_512] +%else + add r3d, r3d + mova m6, [pw_2000] +%endif + lea r6, [r3 * 3] -punpcklbw m6, m5, m7 -pmaddubsw m6, m0 -paddw m4, m6 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + mmsize] + paddw m1, m5 + 
pmaddubsw m3, [r5] +%ifidn %1,pp + pmulhrsw m0, m6 ; m0 = word: row 0 + pmulhrsw m1, m6 ; m1 = word: row 1 + packuswb m0, m1 + vpermq m0, m0, 11011000b + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 +%else + psubw m0, m6 ; m0 = word: row 0 + psubw m1, m6 ; m1 = word: row 1 + movu [r2], m0 + movu [r2 + r3], m1 +%endif -punpckhbw m6, m5, m7 -pmaddubsw m6, m0 -paddw m2, m6 + movu xm0, [r0 + r1] ; m0 = row 5 + punpckhbw xm1, xm4, xm0 + punpcklbw xm4, xm0 + vinserti128 m4, m4, xm1, 1 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m4, [r5] + movu xm1, [r0 + r1 * 2] ; m1 = row 6 + punpckhbw xm5, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm5, 1 + pmaddubsw m5, m0, [r5 + mmsize] + paddw m3, m5 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m2, m6 ; m2 = word: row 2 + pmulhrsw m3, m6 ; m3 = word: row 3 + packuswb m2, m3 + vpermq m2, m2, 11011000b + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%else + psubw m2, m6 ; m2 = word: row 2 + psubw m3, m6 ; m3 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 +%endif -mova m6, [pw_512] + movu xm2, [r0 + r4] ; m2 = row 7 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m1, [r5] + lea r0, [r0 + r1 * 4] + movu xm3, [r0] ; m3 = row 8 + punpckhbw xm5, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm5, 1 + pmaddubsw m5, m2, [r5 + mmsize] + paddw m0, m5 + pmaddubsw m2, [r5] + lea r2, [r2 + r3 * 4] +%ifidn %1,pp + pmulhrsw m4, m6 ; m4 = word: row 4 + pmulhrsw m0, m6 ; m0 = word: row 5 + packuswb m4, m0 + vpermq m4, m4, 11011000b + vextracti128 xm0, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm0 +%else + psubw m4, m6 ; m4 = word: row 4 + psubw m0, m6 ; m0 = word: row 5 + movu [r2], m4 + movu [r2 + r3], m0 +%endif -pmulhrsw m4, m6 -pmulhrsw m2, m6 + movu xm5, [r0 + r1] ; m5 = row 9 + punpckhbw xm4, xm3, xm5 + punpcklbw xm3, xm5 + vinserti128 m3, m3, xm4, 1 + pmaddubsw m3, [r5 
+ mmsize] + paddw m1, m3 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhbw xm0, xm5, xm4 + punpcklbw xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddubsw m5, [r5 + mmsize] + paddw m2, m5 +%ifidn %1,pp + pmulhrsw m1, m6 ; m1 = word: row 6 + pmulhrsw m2, m6 ; m2 = word: row 7 + packuswb m1, m2 + vpermq m1, m1, 11011000b + vextracti128 xm2, m1, 1 + movu [r2 + r3 * 2], xm1 + movu [r2 + r6], xm2 +%else + psubw m1, m6 ; m1 = word: row 6 + psubw m2, m6 ; m2 = word: row 7 + movu [r2 + r3 * 2], m1 + movu [r2 + r6], m2 +%endif + RET +%endmacro -packuswb m4, m2 +FILTER_VER_CHROMA_AVX2_16x8 pp +FILTER_VER_CHROMA_AVX2_16x8 ps -movu [r2], m4 +%macro FILTER_VER_CHROMA_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x12, 4, 6, 10 + mov r4d, r4m + shl r4d, 6 -punpcklbw m4, m3, m5 -punpckhbw m3, m5 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif -pmaddubsw m4, m1 -pmaddubsw m3, m1 + mova m8, [r5] + mova m9, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + lea r5, [r3 * 3] -movu m2, [r5 + 2 * r1] + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 -punpcklbw m5, m7, m2 -punpckhbw m7, m2 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, m8 + pmaddubsw m3, m2, m9 + paddw m4, m3 + pmaddubsw m2, m8 -pmaddubsw m5, m0 -pmaddubsw m7, m0 + vextracti128 xm0, m0, 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 -paddw m4, m5 -paddw m3, m7 + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, m8 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, 
m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, m9 + paddw m5, m1 + pmaddubsw m0, m8 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 + +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r5], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r5], m5 +%endif + lea r2, [r2 + r3 * 4] -pmulhrsw m4, m6 -pmulhrsw m3, m6 + movu xm1, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m1, m1, [r0], 1 + vinserti128 m4, m4, xm1, 1 -packuswb m4, m3 + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, m9 + paddw m6, m5 + pmaddubsw m3, m8 + pmaddubsw m5, m2, m9 + paddw m0, m5 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r2], xm6 + movu [r2 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r2], m6 + movu [r2 + r3], m0 +%endif + + movu xm6, [r0 + r1 * 2] + vinserti128 m6, m6, [r0 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, m9 + paddw m2, m4 + pmaddubsw m0, m8 + pmaddubsw m4, m5, m9 + paddw m3, m4 + pmaddubsw m5, m8 + +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + 
packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r2 + r3 * 2], xm3 + movu [r2 + r5], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r2 + r3 * 2], m3 + movu [r2 + r5], m2 +%endif + lea r2, [r2 + r3 * 4] -movu [r2 + r3], m4 + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m6, m6, xm3, 1 -movq m2, [r0 + 16] -movq m3, [r0 + r1 + 16] -movq m4, [r5 + 16] -movq m5, [r5 + r1 + 16] + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m5, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m0, m1 + pmaddubsw m2, m8 -punpcklbw m2, m3 -punpcklbw m4, m5 +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r2], m5 + movu [r2 + r3], m0 +%endif + + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m1, m0, m9 + paddw m4, m1 + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r5], xm2 +%else + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r2 + r3 * 2], m4 + movu [r2 + r5], m2 +%endif + RET +%endif +%endmacro -pmaddubsw m2, m1 -pmaddubsw m4, m0 +FILTER_VER_CHROMA_AVX2_16x12 pp +FILTER_VER_CHROMA_AVX2_16x12 ps -paddw m2, m4 +%macro FILTER_VER_CHROMA_AVX2_16x32 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x32, 4, 8, 8 + mov r4d, r4m + shl r4d, 6 
-pmulhrsw m2, m6 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif -movq m3, [r0 + r1 + 16] -movq m4, [r5 + 16] -movq m5, [r5 + r1 + 16] -movq m7, [r5 + 2 * r1 + 16] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + lea r6, [r3 * 3] + mov r7d, 2 +.loopH: + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 -punpcklbw m3, m4 -punpcklbw m5, m7 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, [r5] + pmaddubsw m3, m2, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m2, [r5] -pmaddubsw m3, m1 -pmaddubsw m5, m0 + vextracti128 xm0, m0, 1 + lea r0, [r0 + r1 * 4] + vinserti128 m0, m0, [r0], 1 -paddw m3, m5 + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, [r5] + pmaddubsw m3, m5, [r5 + mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, [r5 + mmsize] + paddw m5, m1 + pmaddubsw m0, [r5] + pmaddubsw m1, m6, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] -pmulhrsw m3, m6 -packuswb m2, m3 +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm5 +%else + psubw 
m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m5 +%endif + lea r2, [r2 + r3 * 4] -movh [r2 + 16], m2 -movhps [r2 + r3 + 16], m2 + movu xm1, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m1, m1, [r0], 1 + vinserti128 m4, m4, xm1, 1 -mov r0, r5 -lea r2, [r2 + 2 * r3] + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, [r5 + mmsize] + paddw m6, m5 + pmaddubsw m3, [r5] + pmaddubsw m5, m2, [r5 + mmsize] + paddw m0, m5 + pmaddubsw m2, [r5] -sub r4, 2 -jnz .loop -RET -%endmacro +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r2], xm6 + movu [r2 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r2], m6 + movu [r2 + r3], m0 +%endif + + movu xm6, [r0 + r1 * 2] + vinserti128 m6, m6, [r0 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, [r5 + mmsize] + paddw m2, m4 + pmaddubsw m0, [r5] + pmaddubsw m4, m5, [r5 + mmsize] + paddw m3, m4 + pmaddubsw m5, [r5] -FILTER_V4_W24 24, 32 +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r2 + r3 * 2], xm3 + movu [r2 + r6], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r2 + r3 * 2], m3 + movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] -FILTER_V4_W24 24, 64 + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m6, m6, xm3, 1 -;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
-;----------------------------------------------------------------------------- -%macro FILTER_V4_W32 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m5, m1 + pmaddubsw m4, [r5] + pmaddubsw m1, m2, [r5 + mmsize] + paddw m0, m1 + pmaddubsw m2, [r5] -mov r4d, r4m -sub r0, r1 +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r2], m5 + movu [r2 + r3], m0 +%endif + + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, [r5 + mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + pmaddubsw m1, m0, [r5 + mmsize] + paddw m4, m1 + pmaddubsw m0, [r5] -%ifdef PIC -lea r5, [tab_ChromaCoeff] -movd m0, [r5 + r4 * 4] -%else -movd m0, [tab_ChromaCoeff + r4 * 4] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r2 + r3 * 2], m4 + movu [r2 + r6], m2 %endif + lea r2, [r2 + r3 * 4] -pshufb m1, m0, [tab_Vm] -pshufb m0, [tab_Vm + 16] + movu xm3, [r0 + r4] + lea r0, [r0 + r1 * 4] + vinserti128 m3, m3, [r0], 1 + vinserti128 m5, m5, xm3, 1 -mova m7, [pw_512] + punpcklbw m2, m5, m3 + punpckhbw m1, m5, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, [r5 + mmsize] + paddw m0, m1 + pmaddubsw m4, [r5] + pmaddubsw m1, m2, [r5 + mmsize] + paddw m6, m1 + 
pmaddubsw m2, [r5] -mov r4d, %2 +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 12 + pmulhrsw m6, m7 ; m6 = word: row 13 + packuswb m0, m6 + vpermq m0, m0, 11011000b + vextracti128 xm6, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm6 +%else + psubw m0, m7 ; m0 = word: row 12 + psubw m6, m7 ; m6 = word: row 13 + movu [r2], m0 + movu [r2 + r3], m6 +%endif + + movu xm5, [r0 + r1 * 2] + vinserti128 m5, m5, [r0 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m6, [r5 + mmsize] + paddw m2, m6 + pmaddubsw m0, [r5 + mmsize] + paddw m4, m0 -.loop: -movu m2, [r0] -movu m3, [r0 + r1] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 14 + pmulhrsw m2, m7 ; m2 = word: row 15 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r2 + r3 * 2], xm4 + movu [r2 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 14 + psubw m2, m7 ; m2 = word: row 15 + movu [r2 + r3 * 2], m4 + movu [r2 + r6], m2 +%endif + lea r2, [r2 + r3 * 4] + dec r7d + jnz .loopH + RET +%endif +%endmacro -punpcklbw m4, m2, m3 -punpckhbw m2, m3 - -pmaddubsw m4, m1 -pmaddubsw m2, m1 - -lea r5, [r0 + 2 * r1] -movu m3, [r5] -movu m5, [r5 + r1] - -punpcklbw m6, m3, m5 -punpckhbw m3, m5 +FILTER_VER_CHROMA_AVX2_16x32 pp +FILTER_VER_CHROMA_AVX2_16x32 ps -pmaddubsw m6, m0 -pmaddubsw m3, m0 - -paddw m4, m6 -paddw m2, m3 +%macro FILTER_VER_CHROMA_AVX2_24x32 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_24x32, 4, 9, 10 + mov r4d, r4m + shl r4d, 6 -pmulhrsw m4, m7 -pmulhrsw m2, m7 +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif -packuswb m4, m2 + mova m8, [r5] + mova m9, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + lea r6, [r3 * 3] + mov r5d, 2 +.loopH: + movu xm0, [r0] + vinserti128 m0, m0, 
[r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 -movu [r2], m4 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, m8 + pmaddubsw m3, m2, m9 + paddw m4, m3 + pmaddubsw m2, m8 -movu m2, [r0 + 16] -movu m3, [r0 + r1 + 16] + vextracti128 xm0, m0, 1 + lea r7, [r0 + r1 * 4] + vinserti128 m0, m0, [r7], 1 -punpcklbw m4, m2, m3 -punpckhbw m2, m3 + punpcklbw m5, m1, m0 + punpckhbw m3, m1, m0 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, m8 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + + movu xm4, [r7 + r1 * 2] + vinserti128 m4, m4, [r7 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m1, m0, m9 + paddw m5, m1 + pmaddubsw m0, m8 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 + +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m5 +%endif + lea r8, [r2 + r3 * 4] -pmaddubsw m4, m1 -pmaddubsw m2, m1 + movu xm1, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m1, m1, [r7], 1 + vinserti128 m4, m4, xm1, 1 -movu m3, [r5 + 16] -movu m5, [r5 + r1 + 16] + punpcklbw m2, m4, m1 + punpckhbw m5, m4, m1 + vperm2i128 m3, m2, m5, 0x20 + vperm2i128 m2, m2, m5, 0x31 + pmaddubsw m5, m3, m9 + paddw m6, m5 + pmaddubsw m3, m8 + pmaddubsw 
m5, m2, m9 + paddw m0, m5 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 4 + pmulhrsw m0, m7 ; m0 = word: row 5 + packuswb m6, m0 + vpermq m6, m6, 11011000b + vextracti128 xm0, m6, 1 + movu [r8], xm6 + movu [r8 + r3], xm0 +%else + psubw m6, m7 ; m6 = word: row 4 + psubw m0, m7 ; m0 = word: row 5 + movu [r8], m6 + movu [r8 + r3], m0 +%endif + + movu xm6, [r7 + r1 * 2] + vinserti128 m6, m6, [r7 + r1], 1 + vextracti128 xm0, m6, 1 + vinserti128 m1, m1, xm0, 0 + + punpcklbw m4, m1, m6 + punpckhbw m5, m1, m6 + vperm2i128 m0, m4, m5, 0x20 + vperm2i128 m5, m4, m5, 0x31 + pmaddubsw m4, m0, m9 + paddw m2, m4 + pmaddubsw m0, m8 + pmaddubsw m4, m5, m9 + paddw m3, m4 + pmaddubsw m5, m8 + +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = word: row 6 + pmulhrsw m2, m7 ; m2 = word: row 7 + packuswb m3, m2 + vpermq m3, m3, 11011000b + vextracti128 xm2, m3, 1 + movu [r8 + r3 * 2], xm3 + movu [r8 + r6], xm2 +%else + psubw m3, m7 ; m3 = word: row 6 + psubw m2, m7 ; m2 = word: row 7 + movu [r8 + r3 * 2], m3 + movu [r8 + r6], m2 +%endif + lea r8, [r8 + r3 * 4] -punpcklbw m6, m3, m5 -punpckhbw m3, m5 + movu xm3, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m3, m3, [r7], 1 + vinserti128 m6, m6, xm3, 1 -pmaddubsw m6, m0 -pmaddubsw m3, m0 + punpcklbw m2, m6, m3 + punpckhbw m1, m6, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m5, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m0, m1 + pmaddubsw m2, m8 -paddw m4, m6 -paddw m2, m3 +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 8 + pmulhrsw m0, m7 ; m0 = word: row 9 + packuswb m5, m0 + vpermq m5, m5, 11011000b + vextracti128 xm0, m5, 1 + movu [r8], xm5 + movu [r8 + r3], xm0 +%else + psubw m5, m7 ; m5 = word: row 8 + psubw m0, m7 ; m0 = word: row 9 + movu [r8], m5 + movu [r8 + r3], m0 +%endif + + movu xm5, [r7 + r1 * 2] + vinserti128 m5, m5, [r7 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, 
m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m1, m6, m9 + paddw m2, m1 + pmaddubsw m6, m8 + pmaddubsw m1, m0, m9 + paddw m4, m1 + pmaddubsw m0, m8 + +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 10 + pmulhrsw m2, m7 ; m2 = word: row 11 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm2 +%else + psubw m4, m7 ; m4 = word: row 10 + psubw m2, m7 ; m2 = word: row 11 + movu [r8 + r3 * 2], m4 + movu [r8 + r6], m2 +%endif + lea r8, [r8 + r3 * 4] -pmulhrsw m4, m7 -pmulhrsw m2, m7 + movu xm3, [r7 + r4] + lea r7, [r7 + r1 * 4] + vinserti128 m3, m3, [r7], 1 + vinserti128 m5, m5, xm3, 1 -packuswb m4, m2 + punpcklbw m2, m5, m3 + punpckhbw m1, m5, m3 + vperm2i128 m4, m2, m1, 0x20 + vperm2i128 m2, m2, m1, 0x31 + pmaddubsw m1, m4, m9 + paddw m0, m1 + pmaddubsw m4, m8 + pmaddubsw m1, m2, m9 + paddw m6, m1 + pmaddubsw m2, m8 + +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 12 + pmulhrsw m6, m7 ; m6 = word: row 13 + packuswb m0, m6 + vpermq m0, m0, 11011000b + vextracti128 xm6, m0, 1 + movu [r8], xm0 + movu [r8 + r3], xm6 +%else + psubw m0, m7 ; m0 = word: row 12 + psubw m6, m7 ; m6 = word: row 13 + movu [r8], m0 + movu [r8 + r3], m6 +%endif + + movu xm5, [r7 + r1 * 2] + vinserti128 m5, m5, [r7 + r1], 1 + vextracti128 xm0, m5, 1 + vinserti128 m3, m3, xm0, 0 + + punpcklbw m1, m3, m5 + punpckhbw m0, m3, m5 + vperm2i128 m6, m1, m0, 0x20 + vperm2i128 m0, m1, m0, 0x31 + pmaddubsw m6, m9 + paddw m2, m6 + pmaddubsw m0, m9 + paddw m4, m0 -movu [r2 + 16], m4 +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 14 + pmulhrsw m2, m7 ; m2 = word: row 15 + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm2, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm2 + add r2, 16 +%else + psubw m4, m7 ; m4 = word: row 14 + psubw m2, m7 ; m2 = word: row 15 + movu [r8 + r3 * 2], m4 + movu [r8 + r6], m2 + add r2, 32 +%endif + add r0, 16 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, 
xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, m8 + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, m9 + paddw m5, m0 + pmaddubsw m2, m8 + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m0, m1, m9 + paddw m2, m0 + pmaddubsw m1, m8 + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, m9 + paddw m1, m3 + pmaddubsw m4, m8 + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, m9 + paddw m4, m3 + pmaddubsw m0, m8 -lea r0, [r0 + r1] -lea r2, [r2 + r3] +%ifidn %1,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m1, 1 + lea r8, [r2 + r3 * 4] + movu [r8], xm1 + movu [r8 + r3], xm3 + vextracti128 xm3, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm3 +%endif + lea r8, [r8 + r3 * 4] -dec r4 -jnz .loop 
-RET + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm5, [r7] ; m5 = row 12 + punpcklbw xm3, xm5 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, m9 + paddw m0, m3 + pmaddubsw m6, m8 + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm5, xm3 + movq xm2, [r7 + r1 * 2] ; m2 = row 14 + punpcklbw xm3, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, m9 + paddw m6, m3 + pmaddubsw m5, m8 + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm2, xm3 + lea r7, [r7 + r1 * 4] + movq xm1, [r7] ; m1 = row 16 + punpcklbw xm3, xm1 + vinserti128 m2, m2, xm3, 1 + pmaddubsw m3, m2, m9 + paddw m5, m3 + pmaddubsw m2, m8 + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 18 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, m9 + paddw m2, m3 +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 8, row 9 + pmulhrsw m6, m7 ; m6 = word: row 10, row 11 + pmulhrsw m5, m7 ; m5 = word: row 12, row 13 + pmulhrsw m2, m7 ; m2 = word: row 14, row 15 + packuswb m0, m6 + packuswb m5, m2 + vextracti128 xm6, m0, 1 + vextracti128 xm2, m5, 1 + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm5 + movq [r8 + r3], xm2 + movhps [r8 + r3 * 2], xm5 + movhps [r8 + r6], xm2 + lea r2, [r8 + r3 * 4 - 16] +%else + psubw m0, m7 ; m0 = word: row 8, row 9 + psubw m6, m7 ; m6 = word: row 10, row 11 + psubw m5, m7 ; m5 = word: row 12, row 13 + psubw m2, m7 ; m2 = word: row 14, row 15 + vextracti128 xm3, m0, 1 + movu [r8], xm0 + movu [r8 + r3], xm3 + vextracti128 xm3, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm3 + vextracti128 xm3, m5, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm5 + movu [r8 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r8 + r3 * 2], xm2 + movu [r8 + r6], xm3 + lea r2, [r8 + r3 * 4 - 32] +%endif + lea r0, [r7 - 16] + dec r5d + jnz .loopH + RET +%endif %endmacro -FILTER_V4_W32 32, 8 -FILTER_V4_W32 32, 16 -FILTER_V4_W32 32, 24 
-FILTER_V4_W32 32, 32 - -FILTER_V4_W32 32, 48 -FILTER_V4_W32 32, 64 +FILTER_VER_CHROMA_AVX2_24x32 pp +FILTER_VER_CHROMA_AVX2_24x32 ps +%macro FILTER_VER_CHROMA_AVX2_16x4 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_4tap_vert_pp_32x32, 4, 7, 13 +cglobal interp_4tap_vert_%1_16x4, 4, 6, 8 mov r4d, r4m shl r4d, 6 @@ -4224,90 +6879,411 @@ lea r5, [tab_ChromaCoeffVer_32 + r4] %endif - mova m10, [r5] - mova m11, [r5 + mmsize] lea r4, [r1 * 3] sub r0, r1 - lea r5, [r3 * 3] - mova m12, [pw_512] - mov r6d, 8 -.loopW: - movu m0, [r0] ; m0 = row 0 - movu m1, [r0 + r1] ; m1 = row 1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + mova m7, [pw_2000] +%endif + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r1 * 2], 1 + movu xm1, [r0 + r1] + vinserti128 m1, m1, [r0 + r4], 1 + punpcklbw m2, m0, m1 punpckhbw m3, m0, m1 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - movu m0, [r0 + r1 * 2] ; m0 = row 2 - punpcklbw m4, m1, m0 - punpckhbw m5, m1, m0 - pmaddubsw m4, m10 - pmaddubsw m5, m10 - movu m1, [r0 + r4] ; m1 = row 3 - punpcklbw m6, m0, m1 - punpckhbw m7, m0, m1 - pmaddubsw m8, m6, m11 - pmaddubsw m9, m7, m11 - pmaddubsw m6, m10 - pmaddubsw m7, m10 - paddw m2, m8 - paddw m3, m9 - pmulhrsw m2, m12 - pmulhrsw m3, m12 - packuswb m2, m3 - movu [r2], m2 + vperm2i128 m4, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + pmaddubsw m4, [r5] + pmaddubsw m3, m2, [r5 + mmsize] + paddw m4, m3 + pmaddubsw m2, [r5] + vextracti128 xm0, m0, 1 lea r0, [r0 + r1 * 4] - movu m0, [r0] ; m0 = row 4 - punpcklbw m2, m1, m0 + vinserti128 m0, m0, [r0], 1 + + punpcklbw m5, m1, m0 punpckhbw m3, m1, m0 - pmaddubsw m8, m2, m11 - pmaddubsw m9, m3, m11 - pmaddubsw m2, m10 - pmaddubsw m3, m10 - paddw m4, m8 - paddw m5, m9 - pmulhrsw m4, m12 - pmulhrsw m5, m12 - packuswb m4, m5 - movu [r2 + r3], m4 + vperm2i128 m6, m5, m3, 0x20 + vperm2i128 m5, m5, m3, 0x31 + pmaddubsw m6, [r5] + pmaddubsw m3, m5, [r5 + mmsize] + paddw m6, m3 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 0 + pmulhrsw 
m6, m7 ; m6 = word: row 1 + packuswb m4, m6 + vpermq m4, m4, 11011000b + vextracti128 xm6, m4, 1 + movu [r2], xm4 + movu [r2 + r3], xm6 +%else + psubw m4, m7 ; m4 = word: row 0 + psubw m6, m7 ; m6 = word: row 1 + movu [r2], m4 + movu [r2 + r3], m6 +%endif + lea r2, [r2 + r3 * 2] - movu m1, [r0 + r1] ; m1 = row 5 - punpcklbw m4, m0, m1 - punpckhbw m5, m0, m1 - pmaddubsw m4, m11 - pmaddubsw m5, m11 - paddw m6, m4 - paddw m7, m5 - pmulhrsw m6, m12 - pmulhrsw m7, m12 - packuswb m6, m7 - movu [r2 + r3 * 2], m6 + movu xm4, [r0 + r1 * 2] + vinserti128 m4, m4, [r0 + r1], 1 + vextracti128 xm1, m4, 1 + vinserti128 m0, m0, xm1, 0 + + punpcklbw m6, m0, m4 + punpckhbw m1, m0, m4 + vperm2i128 m0, m6, m1, 0x20 + vperm2i128 m6, m6, m1, 0x31 + pmaddubsw m0, [r5 + mmsize] + paddw m5, m0 + pmaddubsw m6, [r5 + mmsize] + paddw m2, m6 - movu m0, [r0 + r1 * 2] ; m0 = row 6 - punpcklbw m6, m1, m0 - punpckhbw m7, m1, m0 - pmaddubsw m6, m11 - pmaddubsw m7, m11 +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m5, m7 ; m5 = word: row 3 + packuswb m2, m5 + vpermq m2, m2, 11011000b + vextracti128 xm5, m2, 1 + movu [r2], xm2 + movu [r2 + r3], xm5 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m5, m7 ; m5 = word: row 3 + movu [r2], m2 + movu [r2 + r3], m5 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_16x4 pp +FILTER_VER_CHROMA_AVX2_16x4 ps + +%macro FILTER_VER_CHROMA_AVX2_12x16 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_12x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + lea r6, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + 
vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] +%ifidn %1,pp + pmulhrsw m0, m7 ; m0 = word: row 0 + pmulhrsw m1, m7 ; m1 = word: row 1 + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movd [r2 + 8], xm1 + movhps [r2 + r3], xm0 + pextrd [r2 + r3 + 8], xm1, 2 +%else + psubw m0, m7 ; m0 = word: row 0 + psubw m1, m7 ; m1 = word: row 1 + movu [r2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + 16], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 +%endif + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 1 * mmsize] paddw m2, m6 - paddw m3, m7 - pmulhrsw m2, m12 - pmulhrsw m3, m12 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm0, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm0, 1 + pmaddubsw m0, m5, [r5 + 1 * mmsize] + paddw m3, m0 + pmaddubsw m5, [r5] +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 2 + pmulhrsw m3, m7 ; m3 = word: row 3 packuswb m2, m3 - movu [r2 + r5], m2 + vextracti128 xm3, m2, 1 + movq [r2 + r3 * 2], xm2 + movd [r2 + r3 * 2 + 8], xm3 + movhps [r2 + r6], xm2 + pextrd [r2 + r6 + 8], xm3, 2 +%else + psubw m2, m7 ; m2 = word: row 2 + psubw m3, m7 ; m3 = word: row 3 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + r6 + 16], xm3 +%endif + lea r2, [r2 + r3 * 4] + + movu xm0, [r0 + r4] ; m0 = row 7 + punpckhbw xm3, xm6, xm0 + punpcklbw xm6, xm0 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw 
m4, m3 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm3, [r0] ; m3 = row 8 + punpckhbw xm1, xm0, xm3 + punpcklbw xm0, xm3 + vinserti128 m0, m0, xm1, 1 + pmaddubsw m1, m0, [r5 + 1 * mmsize] + paddw m5, m1 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m4, m7 ; m4 = word: row 4 + pmulhrsw m5, m7 ; m5 = word: row 5 + packuswb m4, m5 + vextracti128 xm5, m4, 1 + movq [r2], xm4 + movd [r2 + 8], xm5 + movhps [r2 + r3], xm4 + pextrd [r2 + r3 + 8], xm5, 2 +%else + psubw m4, m7 ; m4 = word: row 4 + psubw m5, m7 ; m5 = word: row 5 + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + 16], xm4 + movu [r2 + r3], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r3 + 16], xm5 +%endif + + movu xm1, [r0 + r1] ; m1 = row 9 + punpckhbw xm2, xm3, xm1 + punpcklbw xm3, xm1 + vinserti128 m3, m3, xm2, 1 + pmaddubsw m2, m3, [r5 + 1 * mmsize] + paddw m6, m2 + pmaddubsw m3, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 10 + punpckhbw xm4, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm4, 1 + pmaddubsw m4, m1, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m1, [r5] + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 6 + pmulhrsw m0, m7 ; m0 = word: row 7 + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movq [r2 + r3 * 2], xm6 + movd [r2 + r3 * 2 + 8], xm0 + movhps [r2 + r6], xm6 + pextrd [r2 + r6 + 8], xm0, 2 +%else + psubw m6, m7 ; m6 = word: row 6 + psubw m0, m7 ; m0 = word: row 7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r6], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r6 + 16], xm0 +%endif + lea r2, [r2 + r3 * 4] + + movu xm4, [r0 + r4] ; m4 = row 11 + punpckhbw xm6, xm2, xm4 + punpcklbw xm2, xm4 + vinserti128 m2, m2, xm6, 1 + pmaddubsw m6, m2, [r5 + 1 * mmsize] + paddw m3, m6 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 12 + punpckhbw xm0, xm4, xm6 + punpcklbw xm4, xm6 + vinserti128 m4, m4, xm0, 1 + pmaddubsw m0, m4, [r5 + 1 * mmsize] + paddw m1, m0 + pmaddubsw m4, [r5] +%ifidn %1,pp + pmulhrsw m3, m7 ; m3 = 
word: row 8 + pmulhrsw m1, m7 ; m1 = word: row 9 + packuswb m3, m1 + vextracti128 xm1, m3, 1 + movq [r2], xm3 + movd [r2 + 8], xm1 + movhps [r2 + r3], xm3 + pextrd [r2 + r3 + 8], xm1, 2 +%else + psubw m3, m7 ; m3 = word: row 8 + psubw m1, m7 ; m1 = word: row 9 + movu [r2], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + 16], xm3 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 +%endif + movu xm0, [r0 + r1] ; m0 = row 13 + punpckhbw xm1, xm6, xm0 + punpcklbw xm6, xm0 + vinserti128 m6, m6, xm1, 1 + pmaddubsw m1, m6, [r5 + 1 * mmsize] + paddw m2, m1 + pmaddubsw m6, [r5] + movu xm1, [r0 + r1 * 2] ; m1 = row 14 + punpckhbw xm5, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm5, 1 + pmaddubsw m5, m0, [r5 + 1 * mmsize] + paddw m4, m5 + pmaddubsw m0, [r5] +%ifidn %1,pp + pmulhrsw m2, m7 ; m2 = word: row 10 + pmulhrsw m4, m7 ; m4 = word: row 11 + packuswb m2, m4 + vextracti128 xm4, m2, 1 + movq [r2 + r3 * 2], xm2 + movd [r2 + r3 * 2 + 8], xm4 + movhps [r2 + r6], xm2 + pextrd [r2 + r6 + 8], xm4, 2 +%else + psubw m2, m7 ; m2 = word: row 10 + psubw m4, m7 ; m4 = word: row 11 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + r6 + 16], xm4 +%endif lea r2, [r2 + r3 * 4] - dec r6d - jnz .loopW - RET + + movu xm5, [r0 + r4] ; m5 = row 15 + punpckhbw xm2, xm1, xm5 + punpcklbw xm1, xm5 + vinserti128 m1, m1, xm2, 1 + pmaddubsw m2, m1, [r5 + 1 * mmsize] + paddw m6, m2 + pmaddubsw m1, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm5, xm2 + punpcklbw xm5, xm2 + vinserti128 m5, m5, xm3, 1 + pmaddubsw m3, m5, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m5, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m1, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm2, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm2, 1 + 
pmaddubsw m3, [r5 + 1 * mmsize] + paddw m5, m3 + +%ifidn %1,pp + pmulhrsw m6, m7 ; m6 = word: row 12 + pmulhrsw m0, m7 ; m0 = word: row 13 + pmulhrsw m1, m7 ; m1 = word: row 14 + pmulhrsw m5, m7 ; m5 = word: row 15 + packuswb m6, m0 + packuswb m1, m5 + vextracti128 xm0, m6, 1 + vextracti128 xm5, m1, 1 + movq [r2], xm6 + movd [r2 + 8], xm0 + movhps [r2 + r3], xm6 + pextrd [r2 + r3 + 8], xm0, 2 + movq [r2 + r3 * 2], xm1 + movd [r2 + r3 * 2 + 8], xm5 + movhps [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm5, 2 +%else + psubw m6, m7 ; m6 = word: row 12 + psubw m0, m7 ; m0 = word: row 13 + psubw m1, m7 ; m1 = word: row 14 + psubw m5, m7 ; m5 = word: row 15 + movu [r2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + 16], xm6 + movu [r2 + r3], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r3 + 16], xm0 + movu [r2 + r3 * 2], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 * 2 + 16], xm1 + movu [r2 + r6], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r6 + 16], xm5 %endif + RET +%endmacro + +FILTER_VER_CHROMA_AVX2_12x16 pp +FILTER_VER_CHROMA_AVX2_12x16 ps ;----------------------------------------------------------------------------- -; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) ;----------------------------------------------------------------------------- -%macro FILTER_V4_W16n_H2 2 +%macro FILTER_V4_W24 2 INIT_XMM sse4 -cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 +cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8 mov r4d, r4m sub r0, r1 @@ -4322,14 +7298,9 @@ pshufb m1, m0, [tab_Vm] pshufb m0, [tab_Vm + 16] -mov r4d, %2/2 +mov r4d, %2 .loop: - -mov r6d, %1/16 - -.loopW: - movu m2, [r0] movu m3, [r0 + r1] @@ -4341,20 +7312,20 @@ lea r5, [r0 + 2 * r1] movu m5, [r5] -movu m6, [r5 + r1] +movu m7, [r5 + r1] -punpckhbw m7, m5, m6 -pmaddubsw m7, m0 -paddw m2, m7 +punpcklbw m6, m5, m7 +pmaddubsw m6, m0 +paddw m4, m6 -punpcklbw m7, 
m5, m6 -pmaddubsw m7, m0 -paddw m4, m7 +punpckhbw m6, m5, m7 +pmaddubsw m6, m0 +paddw m2, m6 -mova m7, [pw_512] +mova m6, [pw_512] -pmulhrsw m4, m7 -pmulhrsw m2, m7 +pmulhrsw m4, m6 +pmulhrsw m2, m6 packuswb m4, m2 @@ -4366,1315 +7337,3807 @@ pmaddubsw m4, m1 pmaddubsw m3, m1 -movu m5, [r5 + 2 * r1] +movu m2, [r5 + 2 * r1] -punpcklbw m2, m6, m5 -punpckhbw m6, m5 +punpcklbw m5, m7, m2 +punpckhbw m7, m2 -pmaddubsw m2, m0 -pmaddubsw m6, m0 +pmaddubsw m5, m0 +pmaddubsw m7, m0 -paddw m4, m2 -paddw m3, m6 +paddw m4, m5 +paddw m3, m7 -pmulhrsw m4, m7 -pmulhrsw m3, m7 +pmulhrsw m4, m6 +pmulhrsw m3, m6 packuswb m4, m3 movu [r2 + r3], m4 -add r0, 16 -add r2, 16 -dec r6d -jnz .loopW - -lea r0, [r0 + r1 * 2 - %1] -lea r2, [r2 + r3 * 2 - %1] +movq m2, [r0 + 16] +movq m3, [r0 + r1 + 16] +movq m4, [r5 + 16] +movq m5, [r5 + r1 + 16] -dec r4d -jnz .loop -RET -%endmacro +punpcklbw m2, m3 +punpcklbw m4, m5 -FILTER_V4_W16n_H2 64, 64 -FILTER_V4_W16n_H2 64, 32 -FILTER_V4_W16n_H2 64, 48 -FILTER_V4_W16n_H2 48, 64 -FILTER_V4_W16n_H2 64, 16 +pmaddubsw m2, m1 +pmaddubsw m4, m0 +paddw m2, m4 -;----------------------------------------------------------------------------- -; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) -;----------------------------------------------------------------------------- -INIT_XMM ssse3 -cglobal luma_p2s, 3, 7, 6 +pmulhrsw m2, m6 - ; load width and height - mov r3d, r3m - mov r4d, r4m +movq m3, [r0 + r1 + 16] +movq m4, [r5 + 16] +movq m5, [r5 + r1 + 16] +movq m7, [r5 + 2 * r1 + 16] - ; load constant - mova m4, [pb_128] - mova m5, [tab_c_64_n64] +punpcklbw m3, m4 +punpcklbw m5, m7 -.loopH: +pmaddubsw m3, m1 +pmaddubsw m5, m0 - xor r5d, r5d -.loopW: - lea r6, [r0 + r5] +paddw m3, m5 - movh m0, [r6] - punpcklbw m0, m4 - pmaddubsw m0, m5 +pmulhrsw m3, m6 +packuswb m2, m3 - movh m1, [r6 + r1] - punpcklbw m1, m4 - pmaddubsw m1, m5 +movh [r2 + 16], m2 +movhps [r2 + r3 + 16], m2 - movh m2, [r6 + r1 * 2] - punpcklbw m2, m4 - 
pmaddubsw m2, m5 +mov r0, r5 +lea r2, [r2 + 2 * r3] - lea r6, [r6 + r1 * 2] - movh m3, [r6 + r1] - punpcklbw m3, m4 - pmaddubsw m3, m5 +sub r4, 2 +jnz .loop +RET +%endmacro - add r5, 8 - cmp r5, r3 - jg .width4 - movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 - movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 - movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 - movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 - je .nextH - jmp .loopW +FILTER_V4_W24 24, 32 -.width4: - movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 - movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 - movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 - movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 +FILTER_V4_W24 24, 64 -.nextH: - lea r0, [r0 + r1 * 4] - add r2, FENC_STRIDE * 8 +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W32 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8 - sub r4d, 4 - jnz .loopH +mov r4d, r4m +sub r0, r1 - RET +%ifdef PIC +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] +%else +movd m0, [tab_ChromaCoeff + r4 * 4] +%endif -%macro PROCESS_LUMA_W4_4R 0 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[0 1] +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[1 2] - punpcklqdq m2, m1 ; m2=[0 1 1 2] - pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] +mova m7, [pw_512] - movd m1, [r0 + r1] - punpcklbw m5, m0, m1 ; m2=[2 3] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[3 4] - punpcklqdq m5, m1 ; m5=[2 3 3 4] - pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] - paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 - pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 +mov r4d, %2 - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[4 5] - lea r0, [r0 + 2 * r1] - movd m0, [r0] 
- punpcklbw m1, m0 ; m1=[5 6] - punpcklqdq m2, m1 ; m2=[4 5 5 6] - pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] - paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 - pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] - paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 +.loop: +movu m2, [r0] +movu m3, [r0 + r1] - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[6 7] - lea r0, [r0 + 2 * r1] - movd m0, [r0] - punpcklbw m1, m0 ; m1=[7 8] - punpcklqdq m2, m1 ; m2=[6 7 7 8] - pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8] - paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end - pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] - paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 +punpcklbw m4, m2, m3 +punpckhbw m2, m3 - movd m1, [r0 + r1] - punpcklbw m2, m0, m1 ; m2=[8 9] - movd m0, [r0 + 2 * r1] - punpcklbw m1, m0 ; m1=[9 10] - punpcklqdq m2, m1 ; m2=[8 9 9 10] - pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] - paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end -%endmacro +pmaddubsw m4, m1 +pmaddubsw m2, m1 -%macro PROCESS_LUMA_W8_4R 0 - movq m0, [r0] - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 +lea r5, [r0 + 2 * r1] +movu m3, [r5] +movu m5, [r5 + r1] - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 +punpcklbw m6, m3, m5 +punpckhbw m3, m5 - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 - pmaddubsw m0, [r6 + 1 * 16] - paddw m7, m0 ;m7=[0+1+2+3] Row1 +pmaddubsw m6, m0 +pmaddubsw m3, m0 - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 - pmaddubsw m1, [r6 + 1 * 16] - paddw m6, m1 ;m6 = [1+2+3+4] Row2 +paddw m4, m6 +paddw m2, m3 - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m2, m0, [r6 + 1 * 16] - pmaddubsw m0, [r6 + 2 * 16] - paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 - paddw m5, m2 ;m5=[2+3+4+5] Row3 +pmulhrsw m4, m7 +pmulhrsw m2, m7 - lea r0, [r0 + 2 * r1] - movq m0, [r0] - 
punpcklbw m1, m0 - pmaddubsw m2, m1, [r6 + 1 * 16] - pmaddubsw m1, [r6 + 2 * 16] - paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 - paddw m4, m2 ;m4=[3+4+5+6] Row4 +packuswb m4, m2 - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m2, m0, [r6 + 2 * 16] - pmaddubsw m0, [r6 + 3 * 16] - paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end - paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 +movu [r2], m4 - lea r0, [r0 + 2 * r1] - movq m0, [r0] - punpcklbw m1, m0 - pmaddubsw m2, m1, [r6 + 2 * 16] - pmaddubsw m1, [r6 + 3 * 16] - paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end - paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 +movu m2, [r0 + 16] +movu m3, [r0 + r1 + 16] - movq m1, [r0 + r1] - punpcklbw m0, m1 - pmaddubsw m0, [r6 + 3 * 16] - paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end +punpcklbw m4, m2, m3 +punpckhbw m2, m3 - movq m0, [r0 + 2 * r1] - punpcklbw m1, m0 - pmaddubsw m1, [r6 + 3 * 16] - paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end -%endmacro +pmaddubsw m4, m1 +pmaddubsw m2, m1 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_4xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif +movu m3, [r5 + 16] +movu m5, [r5 + r1 + 16] -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif +punpcklbw m6, m3, m5 +punpckhbw m3, m5 -%ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif +pmaddubsw m6, m0 +pmaddubsw m3, m0 - mov r4d, %2/4 - lea r5, [4 * r1] +paddw m4, m6 +paddw m2, m3 -.loopH: - PROCESS_LUMA_W4_4R +pmulhrsw m4, m7 +pmulhrsw m2, m7 -%ifidn %3,pp - pmulhrsw m4, m3 - pmulhrsw m5, m3 +packuswb m4, m2 - packuswb m4, m5 +movu [r2 + 16], m4 - movd 
[r2], m4 - pextrd [r2 + r3], m4, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m4, 2 - pextrd [r2 + r3], m4, 3 -%else - psubw m4, m3 - psubw m5, m3 - - movlps [r2], m4 - movhps [r2 + r3], m4 - lea r2, [r2 + 2 * r3] - movlps [r2], m5 - movhps [r2 + r3], m5 -%endif - - sub r0, r5 - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH +lea r0, [r0 + r1] +lea r2, [r2 + r3] - RET +dec r4 +jnz .loop +RET %endmacro +FILTER_V4_W32 32, 8 +FILTER_V4_W32 32, 16 +FILTER_V4_W32 32, 24 +FILTER_V4_W32 32, 32 + +FILTER_V4_W32 32, 48 +FILTER_V4_W32 32, 64 +%macro FILTER_VER_CHROMA_AVX2_32xN 2 INIT_YMM avx2 -cglobal interp_8tap_vert_pp_4x4, 4,6,8 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13 mov r4d, r4m - lea r5, [r1 * 3] - sub r0, r5 - - ; TODO: VPGATHERDD - movd xm1, [r0] ; m1 = row0 - movd xm2, [r0 + r1] ; m2 = row1 - punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00] + shl r4d, 6 - movd xm3, [r0 + r1 * 2] ; m3 = row2 - punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] - movd xm4, [r0 + r5] - punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] - punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] +%ifdef PIC + lea r5, [tab_ChromaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_ChromaCoeffVer_32 + r4] +%endif + mova m10, [r5] + mova m11, [r5 + mmsize] + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,pp + mova m12, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m12, [pw_2000] +%endif + lea r5, [r3 * 3] + mov r6d, %2 / 4 +.loopW: + movu m0, [r0] ; m0 = row 0 + movu m1, [r0 + r1] ; m1 = row 1 + punpcklbw m2, m0, m1 + punpckhbw m3, m0, m1 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + movu m0, [r0 + r1 * 2] ; m0 = row 2 + punpcklbw m4, m1, m0 + punpckhbw m5, m1, m0 + pmaddubsw m4, m10 + pmaddubsw m5, m10 + movu m1, [r0 + r4] ; m1 = row 3 + punpcklbw m6, m0, m1 + punpckhbw m7, m0, m1 + pmaddubsw m8, m6, m11 + pmaddubsw m9, m7, m11 + pmaddubsw m6, m10 + pmaddubsw m7, m10 + paddw m2, m8 + paddw m3, m9 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + 
packuswb m2, m3 + movu [r2], m2 +%else + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2], m0 + movu [r2 + mmsize], m2 +%endif lea r0, [r0 + r1 * 4] - movd xm5, [r0] ; m5 = row4 - punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] - punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] - movd xm2, [r0 + r1] ; m2 = row5 - punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] - punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] - movd xm6, [r0 + r1 * 2] ; m6 = row6 - punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] - punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] - movd xm4, [r0 + r5] ; m4 = row7 - punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] - punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + movu m0, [r0] ; m0 = row 4 + punpcklbw m2, m1, m0 + punpckhbw m3, m1, m0 + pmaddubsw m8, m2, m11 + pmaddubsw m9, m3, m11 + pmaddubsw m2, m10 + pmaddubsw m3, m10 + paddw m4, m8 + paddw m5, m9 +%ifidn %1,pp + pmulhrsw m4, m12 + pmulhrsw m5, m12 + packuswb m4, m5 + movu [r2 + r3], m4 +%else + psubw m4, m12 + psubw m5, m12 + vperm2i128 m1, m4, m5, 0x20 + vperm2i128 m4, m4, m5, 0x31 + movu [r2 + r3], m1 + movu [r2 + r3 + mmsize], m4 +%endif - lea r0, [r0 + r1 * 4] - movd xm7, [r0] ; m7 = row8 - punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] - punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] - movd xm2, [r0 + r1] ; m2 = row9 - punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] - 
punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] - movd xm7, [r0 + r1 * 2] ; m7 = rowA - punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] - punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + movu m1, [r0 + r1] ; m1 = row 5 + punpcklbw m4, m0, m1 + punpckhbw m5, m0, m1 + pmaddubsw m4, m11 + pmaddubsw m5, m11 + paddw m6, m4 + paddw m7, m5 +%ifidn %1,pp + pmulhrsw m6, m12 + pmulhrsw m7, m12 + packuswb m6, m7 + movu [r2 + r3 * 2], m6 +%else + psubw m6, m12 + psubw m7, m12 + vperm2i128 m0, m6, m7, 0x20 + vperm2i128 m6, m6, m7, 0x31 + movu [r2 + r3 * 2], m0 + movu [r2 + r3 * 2 + mmsize], m6 +%endif - ; load filter coeff -%ifdef PIC - lea r5, [tab_LumaCoeff] - vpbroadcastd m0, [r5 + r4 * 8 + 0] - vpbroadcastd m2, [r5 + r4 * 8 + 4] + movu m0, [r0 + r1 * 2] ; m0 = row 6 + punpcklbw m6, m1, m0 + punpckhbw m7, m1, m0 + pmaddubsw m6, m11 + pmaddubsw m7, m11 + paddw m2, m6 + paddw m3, m7 +%ifidn %1,pp + pmulhrsw m2, m12 + pmulhrsw m3, m12 + packuswb m2, m3 + movu [r2 + r5], m2 %else - vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] - vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] + psubw m2, m12 + psubw m3, m12 + vperm2i128 m0, m2, m3, 0x20 + vperm2i128 m2, m2, m3, 0x31 + movu [r2 + r5], m0 + movu [r2 + r5 + mmsize], m2 +%endif + lea r2, [r2 + r3 * 4] + dec r6d + jnz .loopW + RET %endif +%endmacro - pmaddubsw m1, m0 - pmaddubsw m3, m0 - pmaddubsw m5, m2 - pmaddubsw m6, m2 - vbroadcasti128 m0, [pw_1] - pmaddwd m1, m0 - pmaddwd m3, m0 - pmaddwd m5, m0 - pmaddwd m6, m0 - paddd m1, m5 ; m1 = DQWORD ROW[1 0] - paddd m3, m6 ; m3 = DQWORD ROW[3 2] - packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] +FILTER_VER_CHROMA_AVX2_32xN pp, 32 +FILTER_VER_CHROMA_AVX2_32xN pp, 24 +FILTER_VER_CHROMA_AVX2_32xN pp, 16 +FILTER_VER_CHROMA_AVX2_32xN pp, 8 +FILTER_VER_CHROMA_AVX2_32xN ps, 32 +FILTER_VER_CHROMA_AVX2_32xN 
ps, 24 +FILTER_VER_CHROMA_AVX2_32xN ps, 16 +FILTER_VER_CHROMA_AVX2_32xN ps, 8 - ; TODO: does it overflow? - pmulhrsw m1, [pw_512] - vextracti128 xm2, m1, 1 - packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] - movd [r2], xm1 - pextrd [r2 + r3], xm1, 2 - pextrd [r2 + r3 * 2], xm1, 1 - lea r4, [r3 * 3] - pextrd [r2 + r4], xm1, 3 - RET +;----------------------------------------------------------------------------- +; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------- +%macro FILTER_V4_W16n_H2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8 -INIT_YMM avx2 -cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 - mov r4d, r4m - shl r4d, 7 +mov r4d, r4m +sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 +lea r5, [tab_ChromaCoeff] +movd m0, [r5 + r4 * 4] %else - lea r5, [tab_LumaCoeffVer_32 + r4] +movd m0, [tab_ChromaCoeff + r4 * 4] %endif - lea r4, [r1 * 3] - sub r0, r4 +pshufb m1, m0, [tab_Vm] +pshufb m0, [tab_Vm + 16] - add r3d, r3d +mov r4d, %2/2 - movd xm1, [r0] - pinsrd xm1, [r0 + r1], 1 - pinsrd xm1, [r0 + r1 * 2], 2 - pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm2, [r0] - pinsrd xm2, [r0 + r1], 1 - pinsrd xm2, [r0 + r1 * 2], 2 - pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] - vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] - lea r0, [r0 + r1 * 4] - movd xm3, [r0] - pinsrd xm3, [r0 + r1], 1 - pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] - vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] - mova m3, [interp4_vpp_shuf1] - vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] - vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] - mova m3, [interp4_vpp_shuf1 + mmsize] - vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] - vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] +.loop: - mova m3, [interp4_vpp_shuf] - pshufb m0, m0, m3 - pshufb m1, m1, m3 - pshufb m4, m4, m3 - pshufb m2, m2, m3 - 
pmaddubsw m0, [r5] - pmaddubsw m1, [r5 + mmsize] - pmaddubsw m4, [r5 + 2 * mmsize] - pmaddubsw m2, [r5 + 3 * mmsize] - paddw m0, m1 - paddw m0, m4 - paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] +mov r6d, %1/16 - vbroadcasti128 m3, [pw_2000] - psubw m0, m3 - vextracti128 xm2, m0, 1 - lea r5, [r3 * 3] - movq [r2], xm0 - movhps [r2 + r3], xm0 - movq [r2 + r3 * 2], xm2 - movhps [r2 + r5], xm2 - RET +.loopW: -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_4xN 4, 4, pp +movu m2, [r0] +movu m3, [r0 + r1] -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_4xN 4, 8, pp +punpcklbw m4, m2, m3 +punpckhbw m2, m3 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_4xN 4, 16, pp +pmaddubsw m4, m1 +pmaddubsw m2, m1 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_4xN 4, 4, ps +lea r5, [r0 + 2 * r1] +movu 
m5, [r5] +movu m6, [r5 + r1] -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_4xN 4, 8, ps +punpckhbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m2, m7 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_4xN 4, 16, ps +punpcklbw m7, m5, m6 +pmaddubsw m7, m0 +paddw m4, m7 -%macro PROCESS_LUMA_AVX2_W8_8R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 
60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 - pmaddubsw m3, m0, [r5 + 2 * mmsize] - paddw m1, m3 - pmaddubsw m0, [r5 + 1 * mmsize] - paddw m4, m0 +mova m7, [pw_512] - movq xm3, [r0 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 12 - punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] - pmaddubsw m3, m6, [r5 + 3 * mmsize] - paddw m1, m3 - pmaddubsw m6, [r5 + 2 * mmsize] - paddw m4, m6 - movq xm3, [r0 + r1] ; m3 = row 13 - punpcklbw xm0, xm3 ; m0 = [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] - movq xm6, [r0 + r1 
* 2] ; m6 = row 14 - punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] - pmaddubsw m0, [r5 + 3 * mmsize] - paddw m4, m0 -%endmacro +pmulhrsw m4, m7 +pmulhrsw m2, m7 -%macro PROCESS_LUMA_AVX2_W8_4R 0 - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - lea r0, [r0 + r1 * 4] - movq xm1, [r0] ; m1 = row 4 - punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r0 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - movq xm4, [r0 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - movq xm3, [r0 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - lea r0, [r0 + r1 * 4] - movq xm0, [r0] ; m0 = row 8 - punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - vinserti128 m4, m4, xm3, 1 ; m4 = [87 
77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - movq xm3, [r0 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - movq xm6, [r0 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 -%endmacro +packuswb m4, m2 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_8xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 +movu [r2], m4 -%ifidn %3,ps - add r3d, r3d -%endif +punpcklbw m4, m3, m5 +punpckhbw m3, m5 -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif +pmaddubsw m4, m1 +pmaddubsw m3, m1 - %ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif +movu m5, [r5 + 2 * r1] - mov r4d, %2/4 - lea r5, [4 * r1] +punpcklbw m2, m6, m5 +punpckhbw m6, m5 -.loopH: - PROCESS_LUMA_W8_4R +pmaddubsw m2, m0 +pmaddubsw m6, m0 -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 +paddw m4, m2 +paddw m3, m6 - packuswb m7, m6 - packuswb m5, m4 +pmulhrsw m4, m7 +pmulhrsw m3, m7 - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r2, [r2 + 2 * r3] - movlps [r2], m5 - movhps [r2 + r3], m5 -%else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 +packuswb m4, m3 - movu [r2], m7 - movu [r2 + 
r3], m6 - lea r2, [r2 + 2 * r3] - movu [r2], m5 - movu [r2 + r3], m4 -%endif +movu [r2 + r3], m4 - sub r0, r5 - lea r2, [r2 + 2 * r3] +add r0, 16 +add r2, 16 +dec r6d +jnz .loopW - dec r4d - jnz .loopH +lea r0, [r0 + r1 * 2 - %1] +lea r2, [r2 + r3 * 2 - %1] - RET +dec r4d +jnz .loop +RET %endmacro -%macro FILTER_VER_LUMA_AVX2_8xN 2 -INIT_YMM avx2 -cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8, 0-gprsize - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r1 * 4] - mov word [rsp], %2 / 8 - mova m7, [pw_512] +FILTER_V4_W16n_H2 64, 64 +FILTER_V4_W16n_H2 64, 32 +FILTER_V4_W16n_H2 64, 48 +FILTER_V4_W16n_H2 48, 64 +FILTER_V4_W16n_H2 64, 16 +;----------------------------------------------------------------------------- +; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;----------------------------------------------------------------------------- +%macro PIXEL_WH_4xN 2 +INIT_XMM ssse3 +cglobal pixelToShort_%1x%2, 3, 7, 6 -.loop: - PROCESS_LUMA_AVX2_W8_8R - pmulhrsw m5, m7 ; m5 = word: row 0, row 1 - pmulhrsw m2, m7 ; m2 = word: row 2, row 3 - pmulhrsw m1, m7 ; m1 = word: row 4, row 5 - pmulhrsw m4, m7 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - lea r2, [r2 + r3 * 2] - movhps [r2], xm5 - movhps [r2 + r3], xm2 - lea r2, [r2 + r3 * 2] - movq [r2], xm1 - movq [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - movhps [r2], xm1 - movhps [r2 + r3], xm4 - lea r2, [r2 + r3 * 2] - sub r0, r6 - dec word [rsp] - jnz .loop - RET -%endmacro - -INIT_YMM avx2 -cglobal interp_8tap_vert_pp_8x8, 4, 6, 7 - mov r4d, r4m - shl r4d, 7 + ; load width and height + mov r3d, %1 + mov r4d, %2 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] +.loopH: + xor r5d, r5d -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 
-%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif +.loopW: + mov r6, r0 + movh m0, [r6] + punpcklbw m0, m4 + pmaddubsw m0, m5 - lea r4, [r1 * 3] - sub r0, r4 - PROCESS_LUMA_AVX2_W8_8R - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - pmulhrsw m1, m3 ; m1 = word: row 4, row 5 - pmulhrsw m4, m3 ; m4 = word: row 6, row 7 - packuswb m5, m2 - packuswb m1, m4 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - lea r2, [r2 + r3 * 4] - movq [r2], xm1 - movq [r2 + r3], xm4 - movhps [r2 + r3 * 2], xm1 - movhps [r2 + r4], xm4 - RET + movh m1, [r6 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 -INIT_YMM avx2 -cglobal interp_8tap_vert_pp_8x4, 4, 6, 7 - mov r4d, r4m - shl r4d, 7 + movh m2, [r6 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif + lea r6, [r6 + r1 * 2] + movh m3, [r6 + r1] + punpcklbw m3, m4 + pmaddubsw m3, m5 - lea r4, [r1 * 3] - sub r0, r4 - PROCESS_LUMA_AVX2_W8_4R - lea r4, [r3 * 3] - mova m3, [pw_512] - pmulhrsw m5, m3 ; m5 = word: row 0, row 1 - pmulhrsw m2, m3 ; m2 = word: row 2, row 3 - packuswb m5, m2 - vextracti128 xm2, m5, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 - movhps [r2 + r4], xm2 - RET + add r5, 8 + cmp r5, r3 + jg .width4 + movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + je .nextH + jmp .loopW -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
-;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 4, pp +.width4: + movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 8, pp +.nextH: + lea r0, [r0 + r1 * 4] + add r2, FENC_STRIDE * 8 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 16, pp -FILTER_VER_LUMA_AVX2_8xN 8, 16 + sub r4d, 4 + jnz .loopH + RET +%endmacro +PIXEL_WH_4xN 4, 4 +PIXEL_WH_4xN 4, 8 +PIXEL_WH_4xN 4, 16 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 32, pp -FILTER_VER_LUMA_AVX2_8xN 8, 32 +;----------------------------------------------------------------------------- +; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;----------------------------------------------------------------------------- +%macro PIXEL_WH_8xN 2 +INIT_XMM ssse3 +cglobal pixelToShort_%1x%2, 3, 7, 6 
-;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 4, ps + ; load width and height + mov r3d, %1 + mov r4d, %2 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 8, ps + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 16, ps +.loopH + xor r5d, r5d +.loopW + lea r6, [r0 + r5] -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_8xN 8, 32, ps + movh m0, [r6] + punpcklbw m0, m4 + pmaddubsw m0, m5 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
-;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_12xN 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif + movh m1, [r6 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] -%else - lea r6, [tab_LumaCoeffVer + r4] -%endif + movh m2, [r6 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - %ifidn %3,pp - mova m3, [pw_512] -%else - mova m3, [pw_2000] -%endif + lea r6, [r6 + r1 * 2] + movh m3, [r6 + r1] + punpcklbw m3, m4 + pmaddubsw m3, m5 - mov r4d, %2/4 + add r5, 8 + cmp r5, r3 -.loopH: - PROCESS_LUMA_W8_4R + movu [r2 + FENC_STRIDE * 0], m0 + movu [r2 + FENC_STRIDE * 2], m1 + movu [r2 + FENC_STRIDE * 4], m2 + movu [r2 + FENC_STRIDE * 6], m3 -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 + je .nextH + jmp .loopW - packuswb m7, m6 - packuswb m5, m4 - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 +.nextH: + lea r0, [r0 + r1 * 4] + add r2, FENC_STRIDE * 8 - movu [r2], m7 - movu [r2 + r3], m6 - lea r5, [r2 + 2 * r3] - movu [r5], m5 - movu [r5 + r3], m4 -%endif + sub r4d, 4 + jnz .loopH + RET +%endmacro +PIXEL_WH_8xN 8, 8 +PIXEL_WH_8xN 8, 4 +PIXEL_WH_8xN 8, 16 +PIXEL_WH_8xN 8, 32 - lea r5, [8 * r1 - 8] - sub r0, r5 -%ifidn %3,pp - add r2, 8 -%else - add r2, 16 -%endif - PROCESS_LUMA_W4_4R +;----------------------------------------------------------------------------- +; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;----------------------------------------------------------------------------- +%macro PIXEL_WH_16xN 2 +INIT_XMM ssse3 +cglobal pixelToShort_%1x%2, 3, 7, 6 -%ifidn %3,pp - pmulhrsw m4, m3 - pmulhrsw m5, m3 + ; load width and height + 
mov r3d, %1 + mov r4d, %2 - packuswb m4, m5 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] - movd [r2], m4 - pextrd [r2 + r3], m4, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m4, 2 - pextrd [r5 + r3], m4, 3 -%else - psubw m4, m3 - psubw m5, m3 +.loopH: + xor r5d, r5d +.loopW: + lea r6, [r0 + r5] - movlps [r2], m4 - movhps [r2 + r3], m4 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 -%endif + movh m0, [r6] + punpcklbw m0, m4 + pmaddubsw m0, m5 - lea r5, [4 * r1 + 8] - sub r0, r5 -%ifidn %3,pp - lea r2, [r2 + 4 * r3 - 8] -%else - lea r2, [r2 + 4 * r3 - 16] -%endif + movh m1, [r6 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - dec r4d - jnz .loopH + movh m2, [r6 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + lea r6, [r6 + r1 * 2] + movh m3, [r6 + r1] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + add r5, 8 + cmp r5, r3 + + movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + je .nextH + jmp .loopW + + +.nextH: + lea r0, [r0 + r1 * 4] + add r2, FENC_STRIDE * 8 + + sub r4d, 4 + jnz .loopH RET %endmacro +PIXEL_WH_16xN 16, 16 +PIXEL_WH_16xN 16, 8 +PIXEL_WH_16xN 16, 4 +PIXEL_WH_16xN 16, 12 +PIXEL_WH_16xN 16, 32 +PIXEL_WH_16xN 16, 64 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_12xN 12, 16, pp +;----------------------------------------------------------------------------- +; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;----------------------------------------------------------------------------- +%macro PIXEL_WH_32xN 2 +INIT_XMM ssse3 +cglobal pixelToShort_%1x%2, 
3, 7, 6 -;------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -FILTER_VER_LUMA_12xN 12, 16, ps + ; load width and height + mov r3d, %1 + mov r4d, %2 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_12x16, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif +.loopH: + xor r5d, r5d +.loopW: + lea r6, [r0 + r5] - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] + movh m0, [r6] + punpcklbw m0, m4 + pmaddubsw m0, m5 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw 
m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] + movh m1, [r6 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 - pmulhrsw m0, m14 ; m0 = 
word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movq [r2], xm0 - pextrd [r2 + 8], xm0, 2 - movq [r2 + r3], xm1 - pextrd [r2 + r3 + 8], xm1, 2 - movq [r2 + r3 * 2], xm2 - pextrd [r2 + r3 * 2 + 8], xm2, 2 - movq [r2 + r6], xm3 - pextrd [r2 + r6 + 8], xm3, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm4 - pextrd [r2 + 8], xm4, 2 - movq [r2 + r3], xm5 - pextrd [r2 + r3 + 8], xm5, 2 + movh m2, [r6 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] + lea r6, [r6 + r1 * 2] + movh m3, [r6 + r1] + punpcklbw m3, m4 + pmaddubsw m3, m5 - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movq [r2 + r3 * 2], xm6 - pextrd [r2 + r3 * 2 + 8], xm6, 2 - movq [r2 + r6], xm7 - pextrd [r2 + r6 + 8], xm7, 2 - lea r2, [r2 + r3 * 4] + add r5, 8 + cmp r5, r3 - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 
+ 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 + movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + je .nextH + jmp .loopW + + +.nextH: + lea r0, [r0 + r1 * 4] + add r2, 
FENC_STRIDE * 8 + + sub r4d, 4 + jnz .loopH - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movq [r2], xm8 - pextrd [r2 + 8], xm8, 2 - movq [r2 + r3], xm9 - pextrd [r2 + r3 + 8], xm9, 2 - movq [r2 + r3 * 2], xm10 - pextrd [r2 + r3 * 2 + 8], xm10, 2 - movq [r2 + r6], xm11 - pextrd [r2 + r6 + 8], xm11, 2 - lea r2, [r2 + r3 * 4] - movq [r2], xm12 - pextrd [r2 + 8], xm12, 2 - movq [r2 + r3], xm13 - pextrd [r2 + r3 + 8], xm13, 2 - movq [r2 + r3 * 2], xm0 - pextrd [r2 + r3 * 2 + 8], xm0, 2 - movq [r2 + r6], xm1 - pextrd [r2 + r6 + 8], xm1, 2 RET -%endif +%endmacro +PIXEL_WH_32xN 32, 32 +PIXEL_WH_32xN 32, 8 +PIXEL_WH_32xN 32, 16 +PIXEL_WH_32xN 32, 24 +PIXEL_WH_32xN 32, 64 -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_16x16, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 +;----------------------------------------------------------------------------- +; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height) +;----------------------------------------------------------------------------- +%macro PIXEL_WH_64xN 2 +INIT_XMM ssse3 +cglobal pixelToShort_%1x%2, 3, 7, 6 -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 -%else - lea r5, [tab_LumaCoeffVer_32 + r4] -%endif + ; load width and height + mov r3d, %1 + mov r4d, %2 - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] + ; load constant + mova m4, [pb_128] + mova m5, [tab_c_64_n64] - movu xm0, [r0] ; m0 
= row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * 
mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 +.loopH: + xor r5d, r5d +.loopW: + lea r6, [r0 + r5] + + movh m0, [r6] + punpcklbw m0, m4 + pmaddubsw m0, m5 + + movh m1, [r6 + r1] + punpcklbw m1, m4 + pmaddubsw m1, m5 + + movh m2, [r6 + r1 * 2] + punpcklbw m2, m4 + pmaddubsw m2, m5 + + lea r6, [r6 + r1 * 2] + movh m3, [r6 + r1] + punpcklbw m3, m4 + pmaddubsw m3, m5 + + add r5, 8 + cmp r5, r3 + + movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 + movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 + movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 + movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 + je .nextH + jmp .loopW + + +.nextH: + lea r0, [r0 + r1 * 4] + add r2, FENC_STRIDE * 8 + + sub r4d, 4 + jnz .loopH + + RET +%endmacro +PIXEL_WH_64xN 64, 64 +PIXEL_WH_64xN 64, 16 +PIXEL_WH_64xN 64, 32 +PIXEL_WH_64xN 64, 48 + +%macro PROCESS_LUMA_W4_4R 0 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[0 1] + + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[1 2] + punpcklqdq m2, m1 ; m2=[0 1 1 2] + pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2] + + movd m1, [r0 + r1] + punpcklbw m5, m0, m1 ; m2=[2 3] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[3 4] + punpcklqdq m5, m1 ; m5=[2 3 3 4] + pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4] + paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2 + pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4 + + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[4 5] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[5 6] + punpcklqdq m2, m1 ; m2=[4 5 5 6] + pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6] + paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2 + pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6] + paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4 + + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[6 7] + lea r0, [r0 + 2 * r1] + movd m0, [r0] + punpcklbw m1, m0 ; m1=[7 8] + punpcklqdq m2, m1 ; m2=[6 7 7 8] + pmaddubsw m1, 
m2, [r6 + 3 * 16] ; m1=[6+7 7+8] + paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end + pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8] + paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4 + + movd m1, [r0 + r1] + punpcklbw m2, m0, m1 ; m2=[8 9] + movd m0, [r0 + 2 * r1] + punpcklbw m1, m0 ; m1=[9 10] + punpcklqdq m2, m1 ; m2=[8 9 9 10] + pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10] + paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end +%endmacro + +%macro PROCESS_LUMA_W8_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3 + pmaddubsw m0, [r6 + 1 * 16] + paddw m7, m0 ;m7=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4 + pmaddubsw m1, [r6 + 1 * 16] + paddw m6, m1 ;m6 = [1+2+3+4] Row2 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 1 * 16] + pmaddubsw m0, [r6 + 2 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1 + paddw m5, m2 ;m5=[2+3+4+5] Row3 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 1 * 16] + pmaddubsw m1, [r6 + 2 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2 + paddw m4, m2 ;m4=[3+4+5+6] Row4 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m2, m0, [r6 + 2 * 16] + pmaddubsw m0, [r6 + 3 * 16] + paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end + paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3 + + lea r0, [r0 + 2 * r1] + movq m0, [r0] + punpcklbw m1, m0 + pmaddubsw m2, m1, [r6 + 2 * 16] + pmaddubsw m1, [r6 + 3 * 16] + paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end + paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4 + + movq m1, [r0 + r1] + punpcklbw m0, m1 + pmaddubsw m0, [r6 + 3 * 16] + paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end + + movq m0, [r0 + 2 * r1] + punpcklbw m1, m0 + pmaddubsw m1, [r6 
+ 3 * 16] + paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_4xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + +%ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif + + mov r4d, %2/4 + lea r5, [4 * r1] + +.loopH: + PROCESS_LUMA_W4_4R + +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 + + packuswb m4, m5 + + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m4, 2 + pextrd [r2 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 + + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%endif + + sub r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + + +INIT_YMM avx2 +cglobal interp_8tap_vert_pp_4x4, 4,6,8 + mov r4d, r4m + lea r5, [r1 * 3] + sub r0, r5 + + ; TODO: VPGATHERDD + movd xm1, [r0] ; m1 = row0 + movd xm2, [r0 + r1] ; m2 = row1 + punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00] + + movd xm3, [r0 + r1 * 2] ; m3 = row2 + punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10] + movd xm4, [r0 + r5] + punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20] + punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] + + lea r0, [r0 + r1 * 4] + movd xm5, [r0] ; m5 = row4 + punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30] + punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] + vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 
21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00] + movd xm2, [r0 + r1] ; m2 = row5 + punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40] + punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm6, [r0 + r1 * 2] ; m6 = row6 + punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50] + punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] + vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20] + movd xm4, [r0 + r5] ; m4 = row7 + punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60] + punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + + lea r0, [r0 + r1 * 4] + movd xm7, [r0] ; m7 = row8 + punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70] + punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] + vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40] + movd xm2, [r0 + r1] ; m2 = row9 + punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80] + punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + movd xm7, [r0 + r1 * 2] ; m7 = rowA + punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90] + punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] + vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60] + + ; load filter coeff +%ifdef PIC + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8 + 0] + vpbroadcastd m2, [r5 + r4 * 8 + 4] +%else + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0] + vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4] +%endif + + pmaddubsw m1, m0 + pmaddubsw m3, m0 + pmaddubsw m5, m2 + pmaddubsw m6, m2 + vbroadcasti128 m0, [pw_1] + pmaddwd m1, m0 + pmaddwd m3, m0 + pmaddwd m5, m0 + pmaddwd m6, m0 + paddd m1, m5 ; m1 = DQWORD ROW[1 0] + paddd m3, m6 ; m3 = DQWORD 
ROW[3 2] + packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0] + + ; TODO: does it overflow? + pmulhrsw m1, [pw_512] + vextracti128 xm2, m1, 1 + packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 2 + pextrd [r2 + r3 * 2], xm1, 1 + lea r4, [r3 * 3] + pextrd [r2 + r4], xm1, 3 + RET + +INIT_YMM avx2 +cglobal interp_8tap_vert_ps_4x4, 4, 6, 5 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + + add r3d, r3d + + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 ; m3 = row[x 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[x 10 9 8 7 6 5 4] + mova m3, [interp4_vpp_shuf1] + vpermd m0, m3, m1 ; m0 = row[4 3 3 2 2 1 1 0] + vpermd m4, m3, m2 ; m4 = row[8 7 7 6 6 5 5 4] + mova m3, [interp4_vpp_shuf1 + mmsize] + vpermd m1, m3, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m3, m2 ; m2 = row[10 9 9 8 8 7 7 6] + + mova m3, [interp4_vpp_shuf] + pshufb m0, m0, m3 + pshufb m1, m1, m3 + pshufb m4, m4, m3 + pshufb m2, m2, m3 + pmaddubsw m0, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m4, [r5 + 2 * mmsize] + pmaddubsw m2, [r5 + 3 * mmsize] + paddw m0, m1 + paddw m0, m4 + paddw m0, m2 ; m0 = WORD ROW[3 2 1 0] + + psubw m0, [pw_2000] + vextracti128 xm2, m0, 1 + lea r5, [r3 * 3] + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r5], xm2 + RET + +%macro FILTER_VER_LUMA_AVX2_4xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 10 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 
+%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] +%ifidn %3,pp + mova m6, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m6, [pw_2000] +%endif + lea r8, [r3 * 3] + mova m5, [interp4_vpp_shuf] + mova m0, [interp4_vpp_shuf1] + mova m7, [interp4_vpp_shuf1 + mmsize] + mov r7d, %2 / 8 +.loop: + movd xm1, [r0] + pinsrd xm1, [r0 + r1], 1 + pinsrd xm1, [r0 + r1 * 2], 2 + pinsrd xm1, [r0 + r4], 3 ; m1 = row[3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm2, [r0] + pinsrd xm2, [r0 + r1], 1 + pinsrd xm2, [r0 + r1 * 2], 2 + pinsrd xm2, [r0 + r4], 3 ; m2 = row[7 6 5 4] + vinserti128 m1, m1, xm2, 1 ; m1 = row[7 6 5 4 3 2 1 0] + lea r0, [r0 + r1 * 4] + movd xm3, [r0] + pinsrd xm3, [r0 + r1], 1 + pinsrd xm3, [r0 + r1 * 2], 2 + pinsrd xm3, [r0 + r4], 3 ; m3 = row[11 10 9 8] + vinserti128 m2, m2, xm3, 1 ; m2 = row[11 10 9 8 7 6 5 4] + lea r0, [r0 + r1 * 4] + movd xm4, [r0] + pinsrd xm4, [r0 + r1], 1 + pinsrd xm4, [r0 + r1 * 2], 2 ; m4 = row[x 14 13 12] + vinserti128 m3, m3, xm4, 1 ; m3 = row[x 14 13 12 11 10 9 8] + vpermd m8, m0, m1 ; m8 = row[4 3 3 2 2 1 1 0] + vpermd m4, m0, m2 ; m4 = row[8 7 7 6 6 5 5 4] + vpermd m1, m7, m1 ; m1 = row[6 5 5 4 4 3 3 2] + vpermd m2, m7, m2 ; m2 = row[10 9 9 8 8 7 7 6] + vpermd m9, m0, m3 ; m9 = row[12 11 11 10 10 9 9 8] + vpermd m3, m7, m3 ; m3 = row[14 13 13 12 12 11 11 10] + + pshufb m8, m8, m5 + pshufb m1, m1, m5 + pshufb m4, m4, m5 + pshufb m9, m9, m5 + pshufb m2, m2, m5 + pshufb m3, m3, m5 + pmaddubsw m8, [r5] + pmaddubsw m1, [r5 + mmsize] + pmaddubsw m9, [r5 + 2 * mmsize] + pmaddubsw m3, [r5 + 3 * mmsize] + paddw m8, m1 + paddw m9, m3 + pmaddubsw m1, m4, [r5 + 2 * mmsize] + pmaddubsw m3, m2, [r5 + 3 * mmsize] + pmaddubsw m4, [r5] + pmaddubsw m2, [r5 + mmsize] + paddw m3, m1 + paddw m2, m4 + paddw m8, m3 ; m8 = WORD ROW[3 2 1 0] + paddw m9, m2 ; m9 = WORD ROW[7 6 5 4] + +%ifidn %3,pp + pmulhrsw m8, m6 + pmulhrsw m9, m6 + packuswb m8, m9 + vextracti128 xm1, m8, 1 + movd [r2], xm8 + pextrd [r2 + 
r3], xm8, 1 + movd [r2 + r3 * 2], xm1 + pextrd [r2 + r8], xm1, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm8, 2 + pextrd [r2 + r3], xm8, 3 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r8], xm1, 3 +%else + psubw m8, m6 + psubw m9, m6 + vextracti128 xm1, m8, 1 + vextracti128 xm2, m9, 1 + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm1 + movhps [r2 + r8], xm1 + lea r2, [r2 + r3 * 4] + movq [r2], xm9 + movhps [r2 + r3], xm9 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r8], xm2 +%endif + lea r2, [r2 + r3 * 4] + sub r0, r6 + dec r7d + jnz .loop + RET +%endif +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 4, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 8, pp +FILTER_VER_LUMA_AVX2_4xN 4, 8, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 16, pp +FILTER_VER_LUMA_AVX2_4xN 4, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) 
+;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 4, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 8, ps +FILTER_VER_LUMA_AVX2_4xN 4, 8, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_4xN 4, 16, ps +FILTER_VER_LUMA_AVX2_4xN 4, 16, ps + +%macro PROCESS_LUMA_AVX2_W8_8R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; 
m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m0, [r5 + 1 * mmsize] + paddw m4, m0 + + movq xm3, [r0 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 ; m6 = [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 12 + punpcklbw xm3, xm0 ; m3 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] + vinserti128 m6, m6, xm3, 1 ; m6 = [C7 B7 C6 B6 C5 B5 C4 B4 C3 B3 C2 B2 C1 B1 C0 B0] - [B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0] + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + pmaddubsw m6, [r5 + 2 * mmsize] + paddw m4, m6 + movq xm3, [r0 + r1] ; m3 = row 13 + punpcklbw xm0, xm3 ; m0 = [D7 
C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + movq xm6, [r0 + r1 * 2] ; m6 = row 14 + punpcklbw xm3, xm6 ; m3 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] + vinserti128 m0, m0, xm3, 1 ; m0 = [E7 D7 E6 D6 E5 D5 E4 D4 E3 D3 E2 D2 E1 D1 E0 D0] - [D7 C7 D6 C6 D5 C5 D4 C4 D3 C3 D2 C2 D1 C1 D0 C0] + pmaddubsw m0, [r5 + 3 * mmsize] + paddw m4, m0 +%endmacro + +%macro PROCESS_LUMA_AVX2_W8_4R 0 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 ; m1 = [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 ; m2 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] + vinserti128 m5, m1, xm2, 1 ; m5 = [27 17 26 16 25 15 24 14 23 13 22 12 21 11 20 10] - [17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00] + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 ; m3 = [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + lea r0, [r0 + r1 * 4] + movq xm1, [r0] ; m1 = row 4 + punpcklbw xm4, xm1 ; m4 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] + vinserti128 m2, m3, xm4, 1 ; m2 = [47 37 46 36 45 35 44 34 43 33 42 32 41 31 40 30] - [37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20] + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r0 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 ; m1 = [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + movq xm4, [r0 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 ; m3 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] + vinserti128 m1, m1, xm3, 1 ; m1 = [67 57 66 56 65 55 64 54 63 53 62 52 61 51 60 50] - [57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40] + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + movq xm3, [r0 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 ; m4 = [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + lea r0, [r0 + r1 * 4] + movq xm0, [r0] ; m0 = row 8 + punpcklbw xm3, xm0 ; m3 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] + 
vinserti128 m4, m4, xm3, 1 ; m4 = [87 77 86 76 85 75 84 74 83 73 82 72 81 71 80 70] - [77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60] + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + movq xm3, [r0 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 ; m0 = [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + movq xm6, [r0 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 ; m3 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] + vinserti128 m0, m0, xm3, 1 ; m0 = [A7 97 A6 96 A5 95 A4 94 A3 93 A2 92 A1 91 A0 90] - [97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80] + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_8xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + + %ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif + + mov r4d, %2/4 + lea r5, [4 * r1] + +.loopH: + PROCESS_LUMA_W8_4R + +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r2, [r2 + 2 * r3] + movlps [r2], m5 + movhps [r2 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r2, [r2 + 2 * r3] + movu [r2], m5 + movu [r2 + r3], m4 +%endif + + sub r0, r5 + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +%macro FILTER_VER_LUMA_AVX2_8xN 3 +INIT_YMM avx2 +cglobal 
interp_8tap_vert_%3_%1x%2, 4, 7, 8, 0-gprsize + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 + lea r6, [r1 * 4] +%ifidn %3,pp + mova m7, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m7, [pw_2000] +%endif + mov word [rsp], %2 / 8 + +.loop: + PROCESS_LUMA_AVX2_W8_8R +%ifidn %3,pp + pmulhrsw m5, m7 ; m5 = word: row 0, row 1 + pmulhrsw m2, m7 ; m2 = word: row 2, row 3 + pmulhrsw m1, m7 ; m1 = word: row 4, row 5 + pmulhrsw m4, m7 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] + movhps [r2], xm5 + movhps [r2 + r3], xm2 + lea r2, [r2 + r3 * 2] + movq [r2], xm1 + movq [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm1 + movhps [r2 + r3], xm4 +%else + psubw m5, m7 ; m5 = word: row 0, row 1 + psubw m2, m7 ; m2 = word: row 2, row 3 + psubw m1, m7 ; m1 = word: row 4, row 5 + psubw m4, m7 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + lea r2, [r2 + r3 * 2] + movu [r2], xm2 + movu [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] + movu [r2], xm1 + movu [r2 + r3], xm0 + lea r2, [r2 + r3 * 2] + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r3], xm4 +%endif + lea r2, [r2 + r3 * 2] + sub r0, r6 + dec word [rsp] + jnz .loop + RET +%endmacro + +%macro FILTER_VER_LUMA_AVX2_8x8 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_8x8, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_8R +%ifidn %1,pp + mova m3, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] +%endif + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 
2, row 3 + pmulhrsw m1, m3 ; m1 = word: row 4, row 5 + pmulhrsw m4, m3 ; m4 = word: row 6, row 7 + packuswb m5, m2 + packuswb m1, m4 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm1 + movq [r2 + r3], xm4 + movhps [r2 + r3 * 2], xm1 + movhps [r2 + r4], xm4 +%else + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + psubw m1, m3 ; m1 = word: row 4, row 5 + psubw m4, m3 ; m4 = word: row 6, row 7 + vextracti128 xm6, m5, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm0, m1, 1 + movu [r2], xm5 + movu [r2 + r3], xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm4 + vextracti128 xm4, m4, 1 + movu [r2 + r4], xm4 +%endif + RET +%endmacro + +%macro FILTER_VER_LUMA_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_8x4, 4, 6, 7 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + PROCESS_LUMA_AVX2_W8_4R +%ifidn %1,pp + mova m3, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m3, [pw_2000] +%endif + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m5, m3 ; m5 = word: row 0, row 1 + pmulhrsw m2, m3 ; m2 = word: row 2, row 3 + packuswb m5, m2 + vextracti128 xm2, m5, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r4], xm2 +%else + psubw m5, m3 ; m5 = word: row 0, row 1 + psubw m2, m3 ; m2 = word: row 2, row 3 + movu [r2], xm5 + vextracti128 xm5, m5, 1 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movu [r2 + r4], xm2 +%endif + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int 
coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 4, pp +FILTER_VER_LUMA_AVX2_8x4 pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 8, pp +FILTER_VER_LUMA_AVX2_8x8 pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 16, pp +FILTER_VER_LUMA_AVX2_8xN 8, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 32, pp +FILTER_VER_LUMA_AVX2_8xN 8, 32, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 4, ps +FILTER_VER_LUMA_AVX2_8x4 ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int 
coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 8, ps +FILTER_VER_LUMA_AVX2_8x8 ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 16, ps +FILTER_VER_LUMA_AVX2_8xN 8, 16, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_8xN 8, 32, ps +FILTER_VER_LUMA_AVX2_8xN 8, 32, ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_12xN 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + + %ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif + + mov r4d, %2/4 + +.loopH: + PROCESS_LUMA_W8_4R + +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%else + psubw m7, m3 + psubw 
m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 +%endif + + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 +%else + add r2, 16 +%endif + + PROCESS_LUMA_W4_4R + +%ifidn %3,pp + pmulhrsw m4, m3 + pmulhrsw m5, m3 + + packuswb m4, m5 + + movd [r2], m4 + pextrd [r2 + r3], m4, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m4, 2 + pextrd [r5 + r3], m4, 3 +%else + psubw m4, m3 + psubw m5, m3 + + movlps [r2], m4 + movhps [r2 + r3], m4 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%endif + + lea r5, [4 * r1 + 8] + sub r0, r5 +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - 8] +%else + lea r2, [r2 + 4 * r3 - 16] +%endif + + dec r4d + jnz .loopH + + RET +%endmacro + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_12xN 12, 16, pp + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +FILTER_VER_LUMA_12xN 12, 16, ps + +%macro FILTER_VER_LUMA_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r6, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw 
xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu 
xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movq [r2], xm0 + pextrd [r2 + 8], xm0, 2 + movq [r2 + r3], xm1 + pextrd [r2 + r3 + 8], xm1, 2 + movq [r2 + r3 * 2], xm2 + pextrd [r2 + r3 * 2 + 8], xm2, 2 + movq [r2 + r6], xm3 + pextrd [r2 + r6 + 8], xm3, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + pextrd [r2 + 8], xm4, 2 + movq [r2 + r3], xm5 + pextrd [r2 + r3 + 8], xm5, 2 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], xm0 + vextracti128 xm0, m0, 1 + movq 
[r2 + 16], xm0 + movu [r2 + r3], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r3 + 16], xm1 + movu [r2 + r3 * 2], xm2 + vextracti128 xm2, m2, 1 + movq [r2 + r3 * 2 + 16], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m3, 1 + movq [r2 + r6 + 16], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + vextracti128 xm4, m4, 1 + movq [r2 + 16], xm4 + movu [r2 + r3], xm5 + vextracti128 xm5, m5, 1 + movq [r2 + r3 + 16], xm5 +%endif + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movq [r2 + r3 * 2], xm6 + pextrd [r2 + r3 * 2 + 8], xm6, 2 + movq [r2 + r6], xm7 + pextrd [r2 + r6 + 8], xm7, 2 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], xm6 + vextracti128 xm6, m6, 1 + movq [r2 + r3 * 2 + 16], xm6 + movu [r2 + r6], xm7 + vextracti128 xm7, m7, 1 + movq [r2 + r6 + 16], xm7 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, 
xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 
11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movq [r2], xm8 + pextrd [r2 + 8], xm8, 2 + movq [r2 + r3], xm9 + pextrd [r2 + r3 + 8], xm9, 2 + movq [r2 + r3 * 2], xm10 + pextrd [r2 + r3 * 2 + 8], xm10, 2 + movq [r2 + r6], xm11 + pextrd [r2 + r6 + 8], xm11, 2 + lea r2, [r2 + r3 * 4] + movq [r2], xm12 + pextrd [r2 + 8], xm12, 2 + movq [r2 + r3], xm13 + pextrd [r2 + r3 + 8], xm13, 2 + movq [r2 + r3 * 2], xm0 + pextrd [r2 + r3 * 2 + 8], xm0, 2 + movq [r2 + r6], xm1 + pextrd [r2 + r6 + 8], xm1, 2 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], xm8 + vextracti128 xm8, m8, 1 + movq [r2 + 16], xm8 + movu [r2 + r3], xm9 + vextracti128 xm9, m9, 1 + movq [r2 + r3 + 16], xm9 + movu [r2 + r3 * 2], xm10 + vextracti128 xm10, m10, 1 + movq [r2 + r3 * 2 + 16], xm10 + movu [r2 + r6], xm11 + vextracti128 xm11, m11, 1 + movq [r2 + r6 + 16], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + vextracti128 xm12, m12, 1 + movq [r2 + 16], xm12 + movu [r2 + r3], xm13 + vextracti128 xm13, m13, 1 + movq [r2 + r3 + 16], xm13 + movu [r2 + r3 * 2], xm0 + vextracti128 xm0, m0, 1 + movq [r2 + r3 * 2 + 16], xm0 + movu [r2 + r6], xm1 + vextracti128 xm1, m1, 1 + movq [r2 + r6 + 16], xm1 +%endif + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_12x16 pp +FILTER_VER_LUMA_AVX2_12x16 ps + +%macro FILTER_VER_LUMA_AVX2_16x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x16, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 
m14, [pw_2000] +%endif + lea r6, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, 
m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] + 
movu [r2], m4 + movu [r2 + r3], m5 +%endif + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 
m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = 
word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m12 + movu [r2 + r3], m13 + movu [r2 + r3 * 2], m0 + movu [r2 + r6], m1 +%endif + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16x16 pp +FILTER_VER_LUMA_AVX2_16x16 ps + +%macro FILTER_VER_LUMA_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x12, 4, 7, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + lea r6, [r3 * 3] + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + 
punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, 
m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 +%endif + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 
16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + packuswb m8, m9 + packuswb m10, m11 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 +%endif + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16x12 pp +FILTER_VER_LUMA_AVX2_16x12 ps + +%macro FILTER_VER_LUMA_AVX2_16x8 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x8, 4, 6, 15 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m14, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%endif + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw 
xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + movu 
xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + lea r4, [r3 * 3] +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r4], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 +%endif + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 
2], xm6 + movu [r2 + r4], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r4], m7 +%endif + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16x8 pp +FILTER_VER_LUMA_AVX2_16x8 ps + +%macro FILTER_VER_LUMA_AVX2_16x4 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x4, 4, 6, 13 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,pp + mova m12, [pw_512] +%else + add r3d, r3d + vbroadcasti128 m12, [pw_2000] +%endif + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + lea r0, [r0 + r1 * 4] 
+ movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 +%ifidn %1,pp + pmulhrsw m0, m12 ; m0 = word: row 0 + pmulhrsw m1, m12 ; m1 = word: row 1 + pmulhrsw m2, m12 ; m2 = word: row 2 + pmulhrsw m3, m12 ; m3 = word: row 3 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + lea r4, [r3 * 3] + movu [r2 + r4], xm3 +%else + psubw m0, m12 ; m0 = word: row 0 + psubw m1, m12 ; m1 = word: row 1 + psubw m2, m12 ; m2 = word: row 2 + psubw m3, m12 ; m3 = word: row 3 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + lea r4, [r3 * 3] + movu [r2 + r4], m3 +%endif + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16x4 pp +FILTER_VER_LUMA_AVX2_16x4 ps +%macro FILTER_VER_LUMA_AVX2_16xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %3,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova m14, [pw_512] +%endif + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 + +.loop: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + 
punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] 
+ paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %3,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r2, [r2 + r3 * 4] + movu [r2], m4 + movu [r2 + r3], m5 +%endif + + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + 
pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %3,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r2 + r3 * 2], m6 + movu [r2 + r6], m7 +%endif + + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r0 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, 
m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r0, [r0 + r1 * 4] + movu xm6, [r0] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r0 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + +%ifidn %3,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r2], m8 + movu [r2 + r3], m9 + movu [r2 + r3 * 2], m10 + movu [r2 + r6], m11 + lea r2, [r2 + r3 * 4] + movu [r2], m12 + movu [r2 + r3], m13 + movu [r2 + r3 * 2], m0 + movu [r2 + r6], 
m1 +%endif + + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loop + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_16xN 16, 32, pp +FILTER_VER_LUMA_AVX2_16xN 16, 64, pp +FILTER_VER_LUMA_AVX2_16xN 16, 32, ps +FILTER_VER_LUMA_AVX2_16xN 16, 64, ps + +%macro PROCESS_LUMA_AVX2_W16_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] + paddw m4, m8 + pmaddubsw m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, 
m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + pmaddubsw m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddubsw m11, m9, [r5 + 3 * mmsize] + paddw m3, m11 + pmaddubsw m11, m9, [r5 + 2 * mmsize] + paddw m5, m11 + pmaddubsw m11, m9, [r5 + 1 * mmsize] + paddw m7, m11 + pmaddubsw m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhbw xm12, xm10, xm11 + punpcklbw xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddubsw m12, m10, [r5 + 3 * mmsize] + paddw m4, m12 + pmaddubsw m12, m10, [r5 + 2 * mmsize] + paddw m6, m12 + pmaddubsw m12, m10, [r5 + 1 * mmsize] + paddw m8, m12 + pmaddubsw m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhbw xm13, xm11, xm12 + punpcklbw xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddubsw m13, m11, [r5 + 3 * mmsize] + paddw m5, m13 + pmaddubsw m13, m11, [r5 + 2 * mmsize] + paddw m7, m13 + pmaddubsw m13, m11, [r5 + 1 * mmsize] + paddw m9, m13 + pmaddubsw m11, [r5] + +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw 
m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r8, [r2 + r3 * 4] + movu [r8], m4 + movu [r8 + r3], m5 +%endif + + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + pmaddubsw m0, m12, [r5 + 2 * mmsize] + paddw m8, m0 + pmaddubsw m0, m12, [r5 + 1 * mmsize] + paddw m10, m0 + pmaddubsw m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + pmaddubsw m1, m13, [r5 + 2 * mmsize] + paddw m9, m1 + pmaddubsw m1, m13, [r5 + 1 * mmsize] + paddw m11, m1 + pmaddubsw m13, [r5] + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r8 + r3 * 2], m6 + movu [r8 + r6], m7 +%endif + + lea r8, [r8 + r3 * 4] + + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m2, m0, [r5 + 3 * mmsize] + paddw m8, m2 + pmaddubsw m2, m0, [r5 + 2 * mmsize] + paddw m10, m2 + pmaddubsw m2, m0, [r5 + 1 * mmsize] + paddw m12, m2 + pmaddubsw m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 3 * mmsize] + paddw m9, m3 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m11, m3 + pmaddubsw m3, m1, [r5 + 1 * mmsize] + paddw m13, m3 + pmaddubsw m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw 
m4, m2, [r5 + 3 * mmsize] + paddw m10, m4 + pmaddubsw m4, m2, [r5 + 2 * mmsize] + paddw m12, m4 + pmaddubsw m2, [r5 + 1 * mmsize] + paddw m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 3 * mmsize] + paddw m11, m5 + pmaddubsw m5, m3, [r5 + 2 * mmsize] + paddw m13, m5 + pmaddubsw m3, [r5 + 1 * mmsize] + paddw m1, m3 + movu xm5, [r7 + r4] ; m5 = row 19 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 3 * mmsize] + paddw m12, m6 + pmaddubsw m4, [r5 + 2 * mmsize] + paddw m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 3 * mmsize] + paddw m13, m7 + pmaddubsw m5, [r5 + 2 * mmsize] + paddw m1, m5 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhbw xm2, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddubsw m6, [r5 + 3 * mmsize] + paddw m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhbw xm3, xm7, xm2 + punpcklbw xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m7, [r5 + 3 * mmsize] + paddw m1, m7 + +%ifidn %1,pp + pmulhrsw m8, m14 ; m8 = word: row 8 + pmulhrsw m9, m14 ; m9 = word: row 9 + pmulhrsw m10, m14 ; m10 = word: row 10 + pmulhrsw m11, m14 ; m11 = word: row 11 + pmulhrsw m12, m14 ; m12 = word: row 12 + pmulhrsw m13, m14 ; m13 = word: row 13 + pmulhrsw m0, m14 ; m0 = word: row 14 + pmulhrsw m1, m14 ; m1 = word: row 15 + packuswb m8, m9 + packuswb m10, m11 + packuswb m12, m13 + packuswb m0, m1 + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], 
xm0 + movu [r8 + r6], xm1 +%else + psubw m8, m14 ; m8 = word: row 8 + psubw m9, m14 ; m9 = word: row 9 + psubw m10, m14 ; m10 = word: row 10 + psubw m11, m14 ; m11 = word: row 11 + psubw m12, m14 ; m12 = word: row 12 + psubw m13, m14 ; m13 = word: row 13 + psubw m0, m14 ; m0 = word: row 14 + psubw m1, m14 ; m1 = word: row 15 + movu [r8], m8 + movu [r8 + r3], m9 + movu [r8 + r3 * 2], m10 + movu [r8 + r6], m11 + lea r8, [r8 + r3 * 4] + movu [r8], m12 + movu [r8 + r3], m13 + movu [r8 + r3 * 2], m0 + movu [r8 + r6], m1 +%endif +%endmacro + +%macro PROCESS_LUMA_AVX2_W16_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhbw xm2, xm0, xm1 + punpcklbw xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddubsw m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhbw xm3, xm1, xm2 + punpcklbw xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhbw xm4, xm2, xm3 + punpcklbw xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddubsw m4, m2, [r5 + 1 * mmsize] + paddw m0, m4 + pmaddubsw m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhbw xm5, xm3, xm4 + punpcklbw xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddubsw m5, m3, [r5 + 1 * mmsize] + paddw m1, m5 + pmaddubsw m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhbw xm6, xm4, xm5 + punpcklbw xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddubsw m6, m4, [r5 + 2 * mmsize] + paddw m0, m6 + pmaddubsw m6, m4, [r5 + 1 * mmsize] + paddw m2, m6 + pmaddubsw m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhbw xm7, xm5, xm6 + punpcklbw xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddubsw m7, m5, [r5 + 2 * mmsize] + paddw m1, m7 + pmaddubsw m7, m5, [r5 + 1 * mmsize] + paddw m3, m7 + pmaddubsw m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhbw xm8, xm6, xm7 + punpcklbw xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddubsw m8, m6, [r5 + 3 * mmsize] + paddw m0, m8 + pmaddubsw m8, m6, [r5 + 2 * mmsize] + paddw m2, m8 + pmaddubsw m8, m6, [r5 + 1 * mmsize] 
+ paddw m4, m8 + pmaddubsw m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhbw xm9, xm7, xm8 + punpcklbw xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddubsw m9, m7, [r5 + 3 * mmsize] + paddw m1, m9 + pmaddubsw m9, m7, [r5 + 2 * mmsize] + paddw m3, m9 + pmaddubsw m9, m7, [r5 + 1 * mmsize] + paddw m5, m9 + pmaddubsw m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhbw xm10, xm8, xm9 + punpcklbw xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddubsw m10, m8, [r5 + 3 * mmsize] + paddw m2, m10 + pmaddubsw m10, m8, [r5 + 2 * mmsize] + paddw m4, m10 + pmaddubsw m10, m8, [r5 + 1 * mmsize] + paddw m6, m10 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhbw xm11, xm9, xm10 + punpcklbw xm9, xm10 vinserti128 m9, m9, xm11, 1 pmaddubsw m11, m9, [r5 + 3 * mmsize] paddw m3, m11 @@ -5682,8 +11145,7 @@ paddw m5, m11 pmaddubsw m11, m9, [r5 + 1 * mmsize] paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 + movu xm11, [r7 + r4] ; m11 = row 11 punpckhbw xm12, xm10, xm11 punpcklbw xm10, xm11 vinserti128 m10, m10, xm12, 1 @@ -5691,11 +11153,8 @@ paddw m4, m12 pmaddubsw m12, m10, [r5 + 2 * mmsize] paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 punpckhbw xm13, xm11, xm12 punpcklbw xm11, xm12 vinserti128 m11, m11, xm13, 1 @@ -5703,3746 +11162,9397 @@ paddw m5, m13 pmaddubsw m13, m11, [r5 + 2 * mmsize] paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, 
m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 +%ifidn %1,pp + pmulhrsw m0, m14 ; m0 = word: row 0 + pmulhrsw m1, m14 ; m1 = word: row 1 + pmulhrsw m2, m14 ; m2 = word: row 2 + pmulhrsw m3, m14 ; m3 = word: row 3 + pmulhrsw m4, m14 ; m4 = word: row 4 + pmulhrsw m5, m14 ; m5 = word: row 5 + packuswb m0, m1 + packuswb m2, m3 + packuswb m4, m5 + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + lea r8, [r2 + r3 * 4] + movu [r8], xm4 + movu [r8 + r3], xm5 +%else + psubw m0, m14 ; m0 = word: row 0 + psubw m1, m14 ; m1 = word: row 1 + psubw m2, m14 ; m2 = word: row 2 + psubw m3, m14 ; m3 = word: row 3 + psubw m4, m14 ; m4 = word: row 4 + psubw m5, m14 ; m5 = word: row 5 + movu [r2], m0 + movu [r2 + r3], m1 + movu [r2 + r3 * 2], m2 + movu [r2 + r6], m3 + lea r8, [r2 + r3 * 4] + movu [r8], m4 + movu [r8 + r3], m5 +%endif + + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhbw xm0, xm12, xm13 + punpcklbw xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddubsw m0, m12, [r5 + 3 * mmsize] + paddw m6, m0 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhbw xm1, xm13, xm0 + punpcklbw xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddubsw m1, m13, [r5 + 3 * mmsize] + paddw m7, m1 + +%ifidn %1,pp + pmulhrsw m6, m14 ; m6 = word: row 6 + pmulhrsw m7, m14 ; m7 = word: row 7 + packuswb m6, m7 + vpermq m6, m6, 11011000b + vextracti128 xm7, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%else + psubw m6, m14 ; m6 = word: row 6 + psubw m7, m14 ; m7 = word: row 7 + movu [r8 + r3 * 2], m6 + movu [r8 + r6], m7 +%endif +%endmacro + +%macro FILTER_VER_LUMA_AVX2_24x32 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_24x32, 4, 11, 15 + mov r4d, r4m + 
shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova m14, [pw_512] +%endif + lea r6, [r3 * 3] + lea r10, [r1 * 4] + mov r9d, 2 +.loopH: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + movq xm1, [r0] ; m1 = row 0 + movq xm2, [r0 + r1] ; m2 = row 1 + punpcklbw xm1, xm2 + movq xm3, [r0 + r1 * 2] ; m3 = row 2 + punpcklbw xm2, xm3 + vinserti128 m5, m1, xm2, 1 + pmaddubsw m5, [r5] + movq xm4, [r0 + r4] ; m4 = row 3 + punpcklbw xm3, xm4 + lea r7, [r0 + r1 * 4] + movq xm1, [r7] ; m1 = row 4 + punpcklbw xm4, xm1 + vinserti128 m2, m3, xm4, 1 + pmaddubsw m0, m2, [r5 + 1 * mmsize] + paddw m5, m0 + pmaddubsw m2, [r5] + movq xm3, [r7 + r1] ; m3 = row 5 + punpcklbw xm1, xm3 + movq xm4, [r7 + r1 * 2] ; m4 = row 6 + punpcklbw xm3, xm4 + vinserti128 m1, m1, xm3, 1 + pmaddubsw m3, m1, [r5 + 2 * mmsize] + paddw m5, m3 + pmaddubsw m0, m1, [r5 + 1 * mmsize] + paddw m2, m0 + pmaddubsw m1, [r5] + movq xm3, [r7 + r4] ; m3 = row 7 + punpcklbw xm4, xm3 + lea r7, [r7 + r1 * 4] + movq xm0, [r7] ; m0 = row 8 + punpcklbw xm3, xm0 + vinserti128 m4, m4, xm3, 1 + pmaddubsw m3, m4, [r5 + 3 * mmsize] + paddw m5, m3 + pmaddubsw m3, m4, [r5 + 2 * mmsize] + paddw m2, m3 + pmaddubsw m3, m4, [r5 + 1 * mmsize] + paddw m1, m3 + pmaddubsw m4, [r5] + movq xm3, [r7 + r1] ; m3 = row 9 + punpcklbw xm0, xm3 + movq xm6, [r7 + r1 * 2] ; m6 = row 10 + punpcklbw xm3, xm6 + vinserti128 m0, m0, xm3, 1 + pmaddubsw m3, m0, [r5 + 3 * mmsize] + paddw m2, m3 + pmaddubsw m3, m0, [r5 + 2 * mmsize] + paddw m1, m3 + pmaddubsw m3, m0, [r5 + 1 * mmsize] + paddw m4, m3 + pmaddubsw m0, [r5] + + movq xm3, [r7 + r4] ; m3 = row 11 + punpcklbw xm6, xm3 + lea r7, [r7 + r1 * 4] + movq xm7, [r7] ; m7 = row 12 + punpcklbw xm3, xm7 + vinserti128 m6, m6, xm3, 1 + pmaddubsw m3, m6, [r5 + 3 * mmsize] + paddw m1, m3 + 
pmaddubsw m3, m6, [r5 + 2 * mmsize] + paddw m4, m3 + pmaddubsw m3, m6, [r5 + 1 * mmsize] + paddw m0, m3 + pmaddubsw m6, [r5] + movq xm3, [r7 + r1] ; m3 = row 13 + punpcklbw xm7, xm3 + movq xm8, [r7 + r1 * 2] ; m8 = row 14 + punpcklbw xm3, xm8 + vinserti128 m7, m7, xm3, 1 + pmaddubsw m3, m7, [r5 + 3 * mmsize] + paddw m4, m3 + pmaddubsw m3, m7, [r5 + 2 * mmsize] + paddw m0, m3 + pmaddubsw m3, m7, [r5 + 1 * mmsize] + paddw m6, m3 + pmaddubsw m7, [r5] + movq xm3, [r7 + r4] ; m3 = row 15 + punpcklbw xm8, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 16 + punpcklbw xm3, xm9 + vinserti128 m8, m8, xm3, 1 + pmaddubsw m3, m8, [r5 + 3 * mmsize] + paddw m0, m3 + pmaddubsw m3, m8, [r5 + 2 * mmsize] + paddw m6, m3 + pmaddubsw m3, m8, [r5 + 1 * mmsize] + paddw m7, m3 + pmaddubsw m8, [r5] + movq xm3, [r7 + r1] ; m3 = row 17 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 18 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m6, m3 + pmaddubsw m3, m9, [r5 + 2 * mmsize] + paddw m7, m3 + pmaddubsw m3, m9, [r5 + 1 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r4] ; m3 = row 19 + punpcklbw xm10, xm3 + lea r7, [r7 + r1 * 4] + movq xm9, [r7] ; m9 = row 20 + punpcklbw xm3, xm9 + vinserti128 m10, m10, xm3, 1 + pmaddubsw m3, m10, [r5 + 3 * mmsize] + paddw m7, m3 + pmaddubsw m3, m10, [r5 + 2 * mmsize] + paddw m8, m3 + movq xm3, [r7 + r1] ; m3 = row 21 + punpcklbw xm9, xm3 + movq xm10, [r7 + r1 * 2] ; m10 = row 22 + punpcklbw xm3, xm10 + vinserti128 m9, m9, xm3, 1 + pmaddubsw m3, m9, [r5 + 3 * mmsize] + paddw m8, m3 +%ifidn %1,pp + pmulhrsw m5, m14 ; m5 = word: row 0, row 1 + pmulhrsw m2, m14 ; m2 = word: row 2, row 3 + pmulhrsw m1, m14 ; m1 = word: row 4, row 5 + pmulhrsw m4, m14 ; m4 = word: row 6, row 7 + pmulhrsw m0, m14 ; m0 = word: row 8, row 9 + pmulhrsw m6, m14 ; m6 = word: row 10, row 11 + pmulhrsw m7, m14 ; m7 = word: row 12, row 13 + pmulhrsw m8, m14 ; m8 = word: row 14, row 15 + packuswb m5, m2 + packuswb m1, m4 + 
packuswb m0, m6 + packuswb m7, m8 + vextracti128 xm2, m5, 1 + vextracti128 xm4, m1, 1 + vextracti128 xm6, m0, 1 + vextracti128 xm8, m7, 1 + movq [r2], xm5 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm2 + lea r8, [r2 + r3 * 4] + movq [r8], xm1 + movq [r8 + r3], xm4 + movhps [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm4 + lea r8, [r8 + r3 * 4] + movq [r8], xm0 + movq [r8 + r3], xm6 + movhps [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm6 + lea r8, [r8 + r3 * 4] + movq [r8], xm7 + movq [r8 + r3], xm8 + movhps [r8 + r3 * 2], xm7 + movhps [r8 + r6], xm8 +%else + psubw m5, m14 ; m5 = word: row 0, row 1 + psubw m2, m14 ; m2 = word: row 2, row 3 + psubw m1, m14 ; m1 = word: row 4, row 5 + psubw m4, m14 ; m4 = word: row 6, row 7 + psubw m0, m14 ; m0 = word: row 8, row 9 + psubw m6, m14 ; m6 = word: row 10, row 11 + psubw m7, m14 ; m7 = word: row 12, row 13 + psubw m8, m14 ; m8 = word: row 14, row 15 + vextracti128 xm3, m5, 1 + movu [r2], xm5 + movu [r2 + r3], xm3 + vextracti128 xm3, m2, 1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + vextracti128 xm3, m1, 1 + lea r8, [r2 + r3 * 4] + movu [r8], xm1 + movu [r8 + r3], xm3 + vextracti128 xm3, m4, 1 + movu [r8 + r3 * 2], xm4 + movu [r8 + r6], xm3 + vextracti128 xm3, m0, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm0 + movu [r8 + r3], xm3 + vextracti128 xm3, m6, 1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm3 + vextracti128 xm3, m7, 1 + lea r8, [r8 + r3 * 4] + movu [r8], xm7 + movu [r8 + r3], xm3 + vextracti128 xm3, m8, 1 + movu [r8 + r3 * 2], xm8 + movu [r8 + r6], xm3 +%endif + sub r7, r10 + lea r0, [r7 - 16] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 16] +%else + lea r2, [r8 + r3 * 4 - 32] +%endif + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_24x32 pp +FILTER_VER_LUMA_AVX2_24x32 ps + +%macro FILTER_VER_LUMA_AVX2_32xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 
+%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %3,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova m14, [pw_512] +%endif + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %3 +%ifidn %3,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 16] +%ifidn %3,pp + lea r2, [r8 + r3 * 4 - 16] +%else + lea r2, [r8 + r3 * 4 - 32] +%endif + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_32xN 32, 32, pp +FILTER_VER_LUMA_AVX2_32xN 32, 64, pp +FILTER_VER_LUMA_AVX2_32xN 32, 32, ps +FILTER_VER_LUMA_AVX2_32xN 32, 64, ps + +%macro FILTER_VER_LUMA_AVX2_32x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d + vbroadcasti128 m14, [pw_2000] +%else + mova m14, [pw_512] +%endif + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_32x16 pp +FILTER_VER_LUMA_AVX2_32x16 ps + +%macro FILTER_VER_LUMA_AVX2_32x24 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d +%endif + lea r6, [r3 * 3] +%ifidn %1,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 16] +%ifidn 
%1,pp + lea r2, [r8 + r3 * 4 - 16] +%else + lea r2, [r8 + r3 * 4 - 32] +%endif + mov r9d, 2 +.loop: + PROCESS_LUMA_AVX2_W16_8R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loop + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_32x24 pp +FILTER_VER_LUMA_AVX2_32x24 ps + +%macro FILTER_VER_LUMA_AVX2_32x8 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x8, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,ps + add r3d, r3d +%endif + lea r6, [r3 * 3] +%ifidn %1,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + mov r9d, 2 +.loopW: + PROCESS_LUMA_AVX2_W16_8R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_32x8 pp +FILTER_VER_LUMA_AVX2_32x8 ps + +%macro FILTER_VER_LUMA_AVX2_48x64 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_48x64, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,ps + add r3d, r3d +%endif + + lea r6, [r3 * 3] + lea r11, [r1 * 4] + +%ifidn %1,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + + mov r9d, 4 +.loopH: + mov r10d, 3 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 32] +%ifidn %1,pp + lea r2, [r8 + r3 * 4 - 32] +%else + lea r2, [r8 + r3 * 4 - 64] +%endif + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_48x64 pp +FILTER_VER_LUMA_AVX2_48x64 ps + +%macro FILTER_VER_LUMA_AVX2_64xN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + 
lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %3,ps + add r3d, r3d +%endif + + lea r6, [r3 * 3] + lea r11, [r1 * 4] + +%ifidn %3,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 16 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %3 +%ifidn %3,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 48] +%ifidn %3,pp + lea r2, [r8 + r3 * 4 - 48] +%else + lea r2, [r8 + r3 * 4 - 96] +%endif + dec r9d + jnz .loopH + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_64xN 64, 32, pp +FILTER_VER_LUMA_AVX2_64xN 64, 48, pp +FILTER_VER_LUMA_AVX2_64xN 64, 64, pp +FILTER_VER_LUMA_AVX2_64xN 64, 32, ps +FILTER_VER_LUMA_AVX2_64xN 64, 48, ps +FILTER_VER_LUMA_AVX2_64xN 64, 64, ps + +%macro FILTER_VER_LUMA_AVX2_64x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_64x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + +%ifdef PIC + lea r5, [tab_LumaCoeffVer_32] + add r5, r4 +%else + lea r5, [tab_LumaCoeffVer_32 + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r4 + +%ifidn %1,ps + add r3d, r3d +%endif + + lea r6, [r3 * 3] + +%ifidn %1,pp + mova m14, [pw_512] +%else + vbroadcasti128 m14, [pw_2000] +%endif + + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W16_16R %1 +%ifidn %1,pp + add r2, 16 +%else + add r2, 32 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_AVX2_64x16 pp +FILTER_VER_LUMA_AVX2_64x16 ps + +;------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA 3 +INIT_XMM sse4 +cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize + lea r5, [3 * 
r1] + sub r0, r5 + shl r4d, 6 +%ifidn %3,ps + add r3d, r3d +%endif + +%ifdef PIC + lea r5, [tab_LumaCoeffVer] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffVer + r4] +%endif + +%ifidn %3,pp + mova m3, [pw_512] +%else + mova m3, [pw_2000] +%endif + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/8) +.loopW: + PROCESS_LUMA_W8_4R +%ifidn %3,pp + pmulhrsw m7, m3 + pmulhrsw m6, m3 + pmulhrsw m5, m3 + pmulhrsw m4, m3 + + packuswb m7, m6 + packuswb m5, m4 + + movlps [r2], m7 + movhps [r2 + r3], m7 + lea r5, [r2 + 2 * r3] + movlps [r5], m5 + movhps [r5 + r3], m5 +%else + psubw m7, m3 + psubw m6, m3 + psubw m5, m3 + psubw m4, m3 + + movu [r2], m7 + movu [r2 + r3], m6 + lea r5, [r2 + 2 * r3] + movu [r5], m5 + movu [r5 + r3], m4 +%endif + + lea r5, [8 * r1 - 8] + sub r0, r5 +%ifidn %3,pp + add r2, 8 +%else + add r2, 16 +%endif + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - %1] +%ifidn %3,pp + lea r2, [r2 + 4 * r3 - %1] +%else + lea r2, [r2 + 4 * r3 - 2 * %1] +%endif + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +FILTER_VER_LUMA 16, 4, pp +FILTER_VER_LUMA 16, 8, pp +FILTER_VER_LUMA 16, 12, pp +FILTER_VER_LUMA 16, 16, pp +FILTER_VER_LUMA 16, 32, pp +FILTER_VER_LUMA 16, 64, pp +FILTER_VER_LUMA 24, 32, pp +FILTER_VER_LUMA 32, 8, pp +FILTER_VER_LUMA 32, 16, pp +FILTER_VER_LUMA 32, 24, pp +FILTER_VER_LUMA 32, 32, pp +FILTER_VER_LUMA 32, 64, pp +FILTER_VER_LUMA 48, 64, pp +FILTER_VER_LUMA 64, 16, pp +FILTER_VER_LUMA 64, 32, pp +FILTER_VER_LUMA 64, 48, pp +FILTER_VER_LUMA 64, 64, pp + +FILTER_VER_LUMA 16, 4, ps +FILTER_VER_LUMA 16, 8, ps +FILTER_VER_LUMA 16, 12, ps +FILTER_VER_LUMA 16, 16, ps +FILTER_VER_LUMA 16, 32, ps +FILTER_VER_LUMA 16, 64, ps +FILTER_VER_LUMA 24, 32, ps +FILTER_VER_LUMA 32, 8, ps +FILTER_VER_LUMA 32, 16, ps +FILTER_VER_LUMA 32, 24, ps +FILTER_VER_LUMA 32, 32, ps +FILTER_VER_LUMA 32, 64, ps +FILTER_VER_LUMA 48, 64, ps +FILTER_VER_LUMA 64, 16, ps +FILTER_VER_LUMA 64, 32, ps +FILTER_VER_LUMA 64, 48, ps +FILTER_VER_LUMA 64, 64, ps + +%macro 
PROCESS_LUMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- 
+%macro FILTER_VER_LUMA_SP 2 +INIT_XMM sse4 +cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize + + add r1d, r1d + lea r5, [r1 + 2 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mova m7, [tab_c_526336] + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_LUMA_SP_W4_4R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- + FILTER_VER_LUMA_SP 4, 4 + FILTER_VER_LUMA_SP 8, 8 + FILTER_VER_LUMA_SP 8, 4 + FILTER_VER_LUMA_SP 4, 8 + FILTER_VER_LUMA_SP 16, 16 + FILTER_VER_LUMA_SP 16, 8 + FILTER_VER_LUMA_SP 8, 16 + FILTER_VER_LUMA_SP 16, 12 + FILTER_VER_LUMA_SP 12, 16 + FILTER_VER_LUMA_SP 16, 4 + FILTER_VER_LUMA_SP 4, 16 + FILTER_VER_LUMA_SP 32, 32 + FILTER_VER_LUMA_SP 32, 16 + FILTER_VER_LUMA_SP 16, 32 + FILTER_VER_LUMA_SP 32, 24 + FILTER_VER_LUMA_SP 24, 32 + FILTER_VER_LUMA_SP 32, 8 + FILTER_VER_LUMA_SP 8, 32 + FILTER_VER_LUMA_SP 64, 64 + FILTER_VER_LUMA_SP 64, 32 + FILTER_VER_LUMA_SP 32, 64 + FILTER_VER_LUMA_SP 64, 48 + FILTER_VER_LUMA_SP 48, 64 + FILTER_VER_LUMA_SP 64, 16 + FILTER_VER_LUMA_SP 16, 64 + +; TODO: combin of U and V is more performance, but need more register +; TODO: use two 
path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it +INIT_XMM ssse3 +cglobal chroma_p2s, 3, 7, 4 + + ; load width and height + mov r3d, r3m + mov r4d, r4m + + ; load constant + mova m2, [pb_128] + mova m3, [tab_c_64_n64] + +.loopH: + + xor r5d, r5d +.loopW: + lea r6, [r0 + r5] + + movh m0, [r6] + punpcklbw m0, m2 + pmaddubsw m0, m3 + + movh m1, [r6 + r1] + punpcklbw m1, m2 + pmaddubsw m1, m3 + + add r5d, 8 + cmp r5d, r3d + lea r6, [r2 + r5 * 2] + jg .width4 + movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + je .nextH + jmp .loopW + +.width4: + test r3d, 4 + jz .width2 + test r3d, 2 + movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + lea r6, [r6 + 8] + pshufd m0, m0, 2 + pshufd m1, m1, 2 + jz .nextH + +.width2: + movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 + movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + +.nextH: + lea r0, [r0 + r1 * 2] + add r2, FENC_STRIDE / 2 * 4 + + sub r4d, 2 + jnz .loopH + + RET + +%macro PROCESS_CHROMA_SP_W4_4R 0 + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m1, m4 ;m1=[1 2] + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[2 3] + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m4, [r6 + 1 * 16] + paddd m0, m4 ;m0=[0+1+2+3] Row1 done + + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[3 4] + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m5, [r6 + 1 * 16] + paddd m1, m5 ;m1 = [1+2+3+4] Row2 + + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[4 5] + pmaddwd m4, [r6 + 1 * 16] + paddd m2, m4 ;m2=[2+3+4+5] Row3 + + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[5 6] + pmaddwd m5, [r6 + 1 * 16] + paddd m3, m5 ;m3=[3+4+5+6] Row4 +%endmacro + 
+;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mova m6, [tab_c_526336] + + mov dword [rsp], %2/4 + +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SP 4, 4 + FILTER_VER_CHROMA_SP 4, 8 + FILTER_VER_CHROMA_SP 16, 16 + FILTER_VER_CHROMA_SP 16, 8 + FILTER_VER_CHROMA_SP 16, 12 + FILTER_VER_CHROMA_SP 12, 16 + FILTER_VER_CHROMA_SP 16, 4 + FILTER_VER_CHROMA_SP 4, 16 + FILTER_VER_CHROMA_SP 32, 32 + FILTER_VER_CHROMA_SP 32, 16 + FILTER_VER_CHROMA_SP 16, 32 + FILTER_VER_CHROMA_SP 32, 24 + FILTER_VER_CHROMA_SP 24, 32 + FILTER_VER_CHROMA_SP 32, 8 + + FILTER_VER_CHROMA_SP 16, 24 + FILTER_VER_CHROMA_SP 16, 64 + FILTER_VER_CHROMA_SP 12, 32 + FILTER_VER_CHROMA_SP 4, 32 + FILTER_VER_CHROMA_SP 32, 64 + FILTER_VER_CHROMA_SP 32, 48 + FILTER_VER_CHROMA_SP 24, 64 + + FILTER_VER_CHROMA_SP 64, 64 + FILTER_VER_CHROMA_SP 64, 32 + FILTER_VER_CHROMA_SP 64, 48 + FILTER_VER_CHROMA_SP 48, 64 + FILTER_VER_CHROMA_SP 64, 16 + + +%macro 
PROCESS_CHROMA_SP_W2_4R 1 + movd m0, [r0] + movd m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + + lea r0, [r0 + 2 * r1] + movd m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + punpcklqdq m0, m1 ;m0=[0 1 1 2] + pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m2, m1 ;m2=[2 3] + + lea r0, [r0 + 2 * r1] + movd m3, [r0] + punpcklwd m1, m3 ;m2=[3 4] + punpcklqdq m2, m1 ;m2=[2 3 3 4] + + pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 + pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 + paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + + movd m1, [r0 + r1] + punpcklwd m3, m1 ;m3=[4 5] + + movd m4, [r0 + 2 * r1] + punpcklwd m1, m4 ;m1=[5 6] + punpcklqdq m3, m1 ;m2=[4 5 5 6] + pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 + paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 +%endmacro + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m5, [tab_c_526336] + + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + paddd m0, m5 + paddd m2, m5 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SP_W2_4R 2, 4 +FILTER_VER_CHROMA_SP_W2_4R 2, 8 + +FILTER_VER_CHROMA_SP_W2_4R 2, 16 + 
+;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m4, [tab_c_526336] + + movq m0, [r0] + movq m1, [r0 + r1] + punpcklwd m0, m1 ;m0=[0 1] + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + + lea r0, [r0 + 2 * r1] + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + paddd m0, m4 + psrad m0, 12 + + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1 = [1+2+3+4] Row2 done + paddd m1, m4 + psrad m1, 12 + + packssdw m0, m1 + packuswb m0, m0 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + + RET + +;------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W6_H4 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_ChromaCoeffV + r4] +%endif + + mova m6, [tab_c_526336] + + mov r4d, %2/4 + +.loopH: + PROCESS_CHROMA_SP_W4_4R + + paddd m0, m6 + paddd m1, m6 + paddd m2, m6 + paddd m3, m6 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad 
m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r5, [r2 + 2 * r3] + pextrd [r5], m0, 2 + pextrd [r5 + r3], m0, 3 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 4 + + PROCESS_CHROMA_SP_W2_4R r6 + + paddd m0, m6 + paddd m2, m6 + + psrad m0, 12 + psrad m2, 12 + + packssdw m0, m2 + packuswb m0, m0 + + pextrw [r2], m0, 0 + pextrw [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrw [r2], m0, 2 + pextrw [r2 + r3], m0, 3 + + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 4] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SP_W6_H4 6, 8 + +FILTER_VER_CHROMA_SP_W6_H4 6, 16 + +%macro PROCESS_CHROMA_SP_W8_2R 0 + movu m1, [r0] + movu m3, [r0 + r1] + punpcklwd m0, m1, m3 + pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l + punpckhwd m1, m3 + pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h + + movu m4, [r0 + 2 * r1] + punpcklwd m2, m3, m4 + pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l + punpckhwd m3, m4 + pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h + + lea r0, [r0 + 2 * r1] + movu m5, [r0 + r1] + punpcklwd m6, m4, m5 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l + paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum + punpckhwd m4, m5 + pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h + paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum + + movu m4, [r0 + 2 * r1] + punpcklwd m6, m5, m4 + pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l + paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum + punpckhwd m5, m4 + pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h + paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%endmacro + +;-------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) +;-------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SP_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 
8 + + add r1d, r1d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif + + mova m7, [tab_c_526336] + + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R + + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + + packssdw m0, m1 + packssdw m2, m3 + + packuswb m0, m2 + + movlps [r2], m0 + movhps [r2 + r3], m0 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_VER_CHROMA_SP_W8_H2 8, 2 +FILTER_VER_CHROMA_SP_W8_H2 8, 4 +FILTER_VER_CHROMA_SP_W8_H2 8, 6 +FILTER_VER_CHROMA_SP_W8_H2 8, 8 +FILTER_VER_CHROMA_SP_W8_H2 8, 16 +FILTER_VER_CHROMA_SP_W8_H2 8, 32 + +FILTER_VER_CHROMA_SP_W8_H2 8, 12 +FILTER_VER_CHROMA_SP_W8_H2 8, 64 + + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_2xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride +%define coef2 m3 +%define Tm0 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t1, [pw_2000] + mova Tm0, [tab_Tm] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + movh t0, [srcq] + pshufb t0, t0, Tm0 + pmaddubsw t0, coef2 + phaddw t0, t0 + psubw t0, t1 + movd [dstq], t0 + + lea srcq, [srcq + srcstrideq] + lea dstq, [dstq + dststrideq] + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA_2xN 2, 4 
+FILTER_HORIZ_CHROMA_2xN 2, 8 + +FILTER_HORIZ_CHROMA_2xN 2, 16 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_4xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride +%define coef2 m3 +%define Tm0 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t1, [pw_2000] + mova Tm0, [tab_Tm] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + movh t0, [srcq] + pshufb t0, t0, Tm0 + pmaddubsw t0, coef2 + phaddw t0, t0 + psubw t0, t1 + movlps [dstq], t0 + + lea srcq, [srcq + srcstrideq] + lea dstq, [dstq + dststrideq] + + dec r4d + jnz .loopH + RET +%endmacro + +FILTER_HORIZ_CHROMA_4xN 4, 2 +FILTER_HORIZ_CHROMA_4xN 4, 4 +FILTER_HORIZ_CHROMA_4xN 4, 8 +FILTER_HORIZ_CHROMA_4xN 4, 16 + +FILTER_HORIZ_CHROMA_4xN 4, 32 + +%macro PROCESS_CHROMA_W6 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movh [dstq], %2 + pshufd %2, %2, 2 + movd [dstq + 8], %2 +%endmacro + +%macro PROCESS_CHROMA_W12 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq], %2 + movu %1, [srcq + 8] + pshufb %1, %1, Tm0 + pmaddubsw %1, coef2 + phaddw %1, %1 + psubw %1, %3 + movh [dstq + 16], %1 +%endmacro + 
+;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [tab_Tm] + mova Tm1, [tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W%1 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA 6, 8 +FILTER_HORIZ_CHROMA 12, 16 + +FILTER_HORIZ_CHROMA 6, 16 +FILTER_HORIZ_CHROMA 12, 32 + +%macro PROCESS_CHROMA_W8 3 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq], %2 +%endmacro + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro FILTER_HORIZ_CHROMA_8xN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride +%define coef2 m5 +%define Tm0 m4 +%define Tm1 m3 +%define t2 m2 +%define t1 m1 
+%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [tab_Tm] + mova Tm1, [tab_Tm + 16] + + mov r4d, %2 + cmp r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W8 t0, t1, t2 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA_8xN 8, 2 +FILTER_HORIZ_CHROMA_8xN 8, 4 +FILTER_HORIZ_CHROMA_8xN 8, 6 +FILTER_HORIZ_CHROMA_8xN 8, 8 +FILTER_HORIZ_CHROMA_8xN 8, 16 +FILTER_HORIZ_CHROMA_8xN 8, 32 + +FILTER_HORIZ_CHROMA_8xN 8, 12 +FILTER_HORIZ_CHROMA_8xN 8, 64 + +%macro PROCESS_CHROMA_W16 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 +%endmacro + +%macro PROCESS_CHROMA_W24 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 + movu %1, [srcq + 16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + psubw %2, %3 + movu [dstq + 32], %2 +%endmacro + +%macro PROCESS_CHROMA_W32 4 + movu %1, [srcq] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq], %2 + movu [dstq + 16], %4 + movu %1, [srcq + 
16] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + 24] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq + 32], %2 + movu [dstq + 48], %4 +%endmacro + +%macro PROCESS_CHROMA_W16o 5 + movu %1, [srcq + %5] + pshufb %2, %1, Tm0 + pmaddubsw %2, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %2, %1 + movu %1, [srcq + %5 + 8] + pshufb %4, %1, Tm0 + pmaddubsw %4, coef2 + pshufb %1, %1, Tm1 + pmaddubsw %1, coef2 + phaddw %4, %1 + psubw %2, %3 + psubw %4, %3 + movu [dstq + %5 * 2], %2 + movu [dstq + %5 * 2 + 16], %4 +%endmacro + +%macro PROCESS_CHROMA_W48 4 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 +%endmacro + +%macro PROCESS_CHROMA_W64 4 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 + PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 +%endmacro + +;------------------------------------------------------------------------------------------------------------------------------ +; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;------------------------------------------------------------------------------------------------------------------------------ +%macro FILTER_HORIZ_CHROMA_WxN 2 +INIT_XMM sse4 +cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride +%define coef2 m6 +%define Tm0 m5 +%define Tm1 m4 +%define t3 m3 +%define t2 m2 +%define t1 m1 +%define t0 m0 + + dec srcq + mov r4d, r4m + add dststrided, dststrided + +%ifdef PIC + lea r6, [tab_ChromaCoeff] + movd coef2, [r6 + r4 * 4] +%else + movd coef2, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufd coef2, coef2, 0 + mova t2, [pw_2000] + mova Tm0, [tab_Tm] + mova Tm1, [tab_Tm + 16] + + mov r4d, %2 + cmp 
r5m, byte 0 + je .loopH + sub srcq, srcstrideq + add r4d, 3 + +.loopH: + PROCESS_CHROMA_W%1 t0, t1, t2, t3 + add srcq, srcstrideq + add dstq, dststrideq + + dec r4d + jnz .loopH + + RET +%endmacro + +FILTER_HORIZ_CHROMA_WxN 16, 4 +FILTER_HORIZ_CHROMA_WxN 16, 8 +FILTER_HORIZ_CHROMA_WxN 16, 12 +FILTER_HORIZ_CHROMA_WxN 16, 16 +FILTER_HORIZ_CHROMA_WxN 16, 32 +FILTER_HORIZ_CHROMA_WxN 24, 32 +FILTER_HORIZ_CHROMA_WxN 32, 8 +FILTER_HORIZ_CHROMA_WxN 32, 16 +FILTER_HORIZ_CHROMA_WxN 32, 24 +FILTER_HORIZ_CHROMA_WxN 32, 32 + +FILTER_HORIZ_CHROMA_WxN 16, 24 +FILTER_HORIZ_CHROMA_WxN 16, 64 +FILTER_HORIZ_CHROMA_WxN 24, 64 +FILTER_HORIZ_CHROMA_WxN 32, 48 +FILTER_HORIZ_CHROMA_WxN 32, 64 + +FILTER_HORIZ_CHROMA_WxN 64, 64 +FILTER_HORIZ_CHROMA_WxN 64, 32 +FILTER_HORIZ_CHROMA_WxN 64, 48 +FILTER_HORIZ_CHROMA_WxN 48, 64 +FILTER_HORIZ_CHROMA_WxN 64, 16 + + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W16n 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m1, m0, [tab_Vm] + pshufb m0, [tab_Vm + 16] + mov r4d, %2/2 + +.loop: + + mov r6d, %1/16 + +.loopW: + + movu m2, [r0] + movu m3, [r0 + r1] + + punpcklbw m4, m2, m3 + punpckhbw m2, m3 + + pmaddubsw m4, m1 + pmaddubsw m2, m1 + + lea r5, [r0 + 2 * r1] + movu m5, [r5] + movu m7, [r5 + r1] + + punpcklbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m4, m6 + + punpckhbw m6, m5, m7 + pmaddubsw m6, m0 + paddw m2, m6 + + mova m6, [pw_2000] + + psubw m4, m6 + psubw m2, m6 + + movu [r2], m4 + movu [r2 + 16], m2 + + punpcklbw m4, m3, m5 + punpckhbw m3, m5 + + 
pmaddubsw m4, m1 + pmaddubsw m3, m1 + + movu m5, [r5 + 2 * r1] + + punpcklbw m2, m7, m5 + punpckhbw m7, m5 + + pmaddubsw m2, m0 + pmaddubsw m7, m0 + + paddw m4, m2 + paddw m3, m7 + + psubw m4, m6 + psubw m3, m6 + + movu [r2 + r3], m4 + movu [r2 + r3 + 16], m3 + + add r0, 16 + add r2, 32 + dec r6d + jnz .loopW + + lea r0, [r0 + r1 * 2 - %1] + lea r2, [r2 + r3 * 2 - %1 * 2] + + dec r4d + jnz .loop + RET +%endmacro + +FILTER_V_PS_W16n 64, 64 +FILTER_V_PS_W16n 64, 32 +FILTER_V_PS_W16n 64, 48 +FILTER_V_PS_W16n 48, 64 +FILTER_V_PS_W16n 64, 16 + + +;------------------------------------------------------------------------------------------------------------ +;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------ +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + lea r5, [3 * r1] + + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] + pmaddubsw m2, m0 - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 
- packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - lea r2, [r2 + r3 * 4] + lea r0, [r0 + 4 * r1] + movd m6, [r0] - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, 
xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 + punpcklbw m3, m4 + punpcklbw m1, m5, m6 + punpcklbw m3, m1 + + pmaddubsw m3, m0 + phaddw m2, m3 + + mova m1, [pw_2000] + + psubw m2, m1 + + movd [r2], m2 + pextrd [r2 + r3], m2, 2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + phaddw m4, m5 + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pextrd [r2 + r3], m4, 2 + + RET + +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------- +%macro FILTER_V_PS_W2 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 + + mov r4d, r4m + sub r0, r1 + add r3d, r3d + +%ifdef PIC + lea r5, [tab_ChromaCoeff] + movd m0, [r5 + r4 * 4] +%else + movd m0, [tab_ChromaCoeff + r4 * 4] +%endif + + pshufb m0, [tab_Cm] + + mova m1, [pw_2000] + lea r5, [3 * r1] + mov r4d, %2/4 +.loop: + movd m2, [r0] + movd m3, [r0 + r1] + movd m4, [r0 + 2 * r1] + movd m5, [r0 + r5] + + punpcklbw m2, m3 + punpcklbw m6, m4, m5 + punpcklbw m2, m6 + + pmaddubsw m2, m0 + + lea r0, [r0 + 4 * r1] + movd m6, [r0] + + punpcklbw m3, m4 + punpcklbw m7, m5, m6 + punpcklbw m3, m7 + + pmaddubsw m3, m0 + + phaddw m2, m3 + psubw m2, m1 + + + movd [r2], m2 + pshufd m2, m2, 2 + movd [r2 + r3], m2 + + movd m2, [r0 + r1] + + punpcklbw m4, m5 + punpcklbw m3, m6, m2 + punpcklbw m4, m3 + + pmaddubsw m4, m0 + + movd m3, [r0 + 2 * r1] + + punpcklbw m5, m6 + punpcklbw m2, m3 + punpcklbw m5, m2 + + pmaddubsw m5, m0 + + phaddw m4, m5 + + psubw m4, m1 + + lea r2, [r2 + 2 * r3] + movd [r2], m4 + pshufd m4 , m4 ,2 + movd [r2 + 
r3], m4 + + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loop + +RET +%endmacro + +FILTER_V_PS_W2 2, 8 + +FILTER_V_PS_W2 2, 16 - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 - RET -%endif +;----------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_16x12, 4, 7, 15 - mov r4d, r4m - shl r4d, 7 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 + lea r5, [tab_ChromaCoeffV] + lea r6, [r5 + r4] %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r6, [tab_ChromaCoeffV + r4] %endif - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] + mov dword [rsp], %2/4 - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - 
pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - 
punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] +.loopH: + mov r4d, (%1/4) +.loopW: + PROCESS_CHROMA_SP_W4_4R - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + + packssdw m0, m1 + packssdw m2, m3 + + movlps [r2], m0 + movhps [r2 + r3], m0 + lea r5, [r2 + 2 * r3] + movlps [r5], m2 + movhps [r5 + r3], m2 + + lea r5, [4 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 + + dec r4d + jnz .loopW + + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] + + dec dword [rsp] + jnz .loopH + + RET +%endmacro + + FILTER_VER_CHROMA_SS 4, 4 + FILTER_VER_CHROMA_SS 4, 8 + 
FILTER_VER_CHROMA_SS 16, 16 + FILTER_VER_CHROMA_SS 16, 8 + FILTER_VER_CHROMA_SS 16, 12 + FILTER_VER_CHROMA_SS 12, 16 + FILTER_VER_CHROMA_SS 16, 4 + FILTER_VER_CHROMA_SS 4, 16 + FILTER_VER_CHROMA_SS 32, 32 + FILTER_VER_CHROMA_SS 32, 16 + FILTER_VER_CHROMA_SS 16, 32 + FILTER_VER_CHROMA_SS 32, 24 + FILTER_VER_CHROMA_SS 24, 32 + FILTER_VER_CHROMA_SS 32, 8 + + FILTER_VER_CHROMA_SS 16, 24 + FILTER_VER_CHROMA_SS 12, 32 + FILTER_VER_CHROMA_SS 4, 32 + FILTER_VER_CHROMA_SS 32, 64 + FILTER_VER_CHROMA_SS 16, 64 + FILTER_VER_CHROMA_SS 32, 48 + FILTER_VER_CHROMA_SS 24, 64 + + FILTER_VER_CHROMA_SS 64, 64 + FILTER_VER_CHROMA_SS 64, 32 + FILTER_VER_CHROMA_SS 64, 48 + FILTER_VER_CHROMA_SS 48, 64 + FILTER_VER_CHROMA_SS 64, 16 + +%macro FILTER_VER_CHROMA_S_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + shl r4d, 6 + sub r0, r1 - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - lea r2, [r2 + r3 * 4] + lea r4, [r1 * 3] +%ifidn %1,sp + mova m6, [pd_526336] +%else + add r3d, r3d +%endif - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - 
paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + +%ifidn %1,sp + paddd m0, m6 + paddd m2, m6 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - packuswb m8, m9 - packuswb m10, m11 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - RET +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm0, 3 +%else + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 
* 2], xm0 + movhps [r2 + r4], xm2 %endif + RET +%endmacro +FILTER_VER_CHROMA_S_AVX2_4x4 sp +FILTER_VER_CHROMA_S_AVX2_4x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_4x8 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_16x8, 4, 7, 15 +cglobal interp_4tap_vert_%1_4x8, 4, 6, 8 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d + sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 
+ 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + 
r3], xm5 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m6, [r5 + 1 * mmsize] + paddd m1, m6 + lea r4, [r3 * 3] - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - RET +%ifidn %1,sp + paddd m0, m7 + paddd m2, m7 + paddd m4, m7 + paddd m1, m7 + psrad m0, 12 + psrad m2, 12 + psrad m4, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m2, 6 + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m0, m2 + packssdw m4, m1 +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm2 + pextrd [r2 + 
r3 * 2], xm0, 1 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r4], xm2, 3 +%else + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r4], xm1 %endif + RET +%endmacro -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_16x4, 4, 7, 13 - mov r4d, r4m - shl r4d, 7 +FILTER_VER_CHROMA_S_AVX2_4x8 sp +FILTER_VER_CHROMA_S_AVX2_4x8 ss -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 +%macro PROCESS_CHROMA_AVX2_W4_16R 1 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] + +%ifidn %1,sp + paddd m0, m7 + paddd m2, m7 + paddd m4, m7 + paddd m1, m7 + psrad m4, 12 + psrad m1, 12 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m0, m2 + packssdw m4, 
m1 +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm4, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm4 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm4, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm4, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r6], xm4, 3 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 %endif - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m12, [pw_512] - - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - 
paddw m2, m8 - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] + movq xm4, [r0 + r4] + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 + +%ifidn %1,sp + paddd m6, m7 + paddd m5, m7 + paddd m0, m7 + paddd m3, m7 + psrad m6, 12 + psrad m5, 12 + psrad m0, 12 + psrad m3, 12 +%else + psrad m6, 6 + psrad m5, 6 + psrad m0, 6 + psrad m3, 6 +%endif + packssdw m6, m5 + packssdw m0, m3 + lea r2, [r2 + r3 * 4] - pmulhrsw m0, m12 ; m0 = word: row 0 - pmulhrsw m1, m12 ; m1 = word: row 1 - pmulhrsw m2, m12 ; m2 = word: row 2 - pmulhrsw m3, m12 ; m3 = word: row 3 - packuswb m0, m1 - packuswb m2, m3 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 
1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - RET +%ifidn %1,sp + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movd [r2], xm6 + movd [r2 + r3], xm0 + pextrd [r2 + r3 * 2], xm6, 1 + pextrd [r2 + r6], xm0, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm6, 2 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm6, 3 + pextrd [r2 + r6], xm0, 3 +%else + vextracti128 xm5, m6, 1 + vextracti128 xm3, m0, 1 + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 %endif +%endmacro -%macro FILTER_VER_LUMA_AVX2_16xN 2 +%macro FILTER_VER_CHROMA_S_AVX2_4x16 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_%1x%2, 4, 9, 15 +cglobal interp_4tap_vert_%1_4x16, 4, 7, 8 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d + sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif lea r6, [r3 * 3] - lea r7, [r1 * 4] - mova m14, [pw_512] - mov r8d, %2 / 16 + PROCESS_CHROMA_AVX2_W4_16R %1 + RET +%endmacro -.loop: - movu xm0, [r0] ; m0 = row 0 - movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] - movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] - movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r0, [r0 + r1 * 4] - movu xm4, [r0] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - 
pmaddubsw m3, [r5] - movu xm5, [r0 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 - vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r0 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r0 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r0, [r0 + r1 * 4] - movu xm8, [r0] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r0 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r0 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r0 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - 
pmaddubsw m10, [r5] - lea r0, [r0 + r1 * 4] - movu xm12, [r0] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] +FILTER_VER_CHROMA_S_AVX2_4x16 sp +FILTER_VER_CHROMA_S_AVX2_4x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_4x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_4x2, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 - vpermq m0, m0, 11011000b - vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m5, [pd_526336] +%else + add r3d, r3d +%endif + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + movq xm4, [r0 + 4 * r1] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 +%ifidn %1,sp + paddd m0, m5 + psrad m0, 12 +%else + psrad m0, 6 +%endif vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 - movu [r2], xm0 - movu [r2 + r3], xm1 - movu [r2 + r3 * 2], xm2 - movu [r2 + r6], xm3 - lea r2, [r2 + r3 * 4] - movu [r2], xm4 - movu [r2 + r3], xm5 + packssdw xm0, xm1 +%ifidn %1,sp + packuswb xm0, xm0 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 +%else + movq [r2], xm0 + movhps [r2 + r3], xm0 +%endif + RET +%endmacro - movu xm13, [r0 + r1] ; m13 = row 13 - punpckhbw 
xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r0 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] +FILTER_VER_CHROMA_S_AVX2_4x2 sp +FILTER_VER_CHROMA_S_AVX2_4x2 ss - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 - vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 - movu [r2 + r3 * 2], xm6 - movu [r2 + r6], xm7 - lea r2, [r2 + r3 * 4] +%macro FILTER_VER_CHROMA_S_AVX2_2x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_2x4, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + sub r0, r1 - movu xm1, [r0 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] +%ifidn %1,sp + mova m5, [pd_526336] +%else + add r3d, r3d +%endif + movd xm0, [r0] + movd xm1, [r0 + r1] + punpcklwd xm0, xm1 + movd xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movd xm4, [r0] + punpcklwd xm3, xm4 + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 +%ifidn %1,sp + paddd m0, m5 + psrad m0, 12 +%else + psrad m0, 6 +%endif + vextracti128 xm1, m0, 1 + packssdw xm0, xm1 + lea r4, [r3 * 3] 
+%ifidn %1,sp + packuswb xm0, xm0 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm0, 2 + pextrw [r2 + r4], xm0, 3 +%else + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + pextrd [r2 + 2 * r3], xm0, 2 + pextrd [r2 + r4], xm0, 3 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_S_AVX2_2x4 sp +FILTER_VER_CHROMA_S_AVX2_2x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_8x8 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x8, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] - lea r0, [r0 + r1 * 4] - movu xm2, [r0] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r0 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r0 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = 
row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r0 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r0, [r0 + r1 * 4] - movu xm6, [r0] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r0 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r0 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m3 - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw 
m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r2], xm8 - movu [r2 + r3], xm9 - movu [r2 + r3 * 2], xm10 - movu [r2 + r6], xm11 - lea r2, [r2 + r3 * 4] - movu [r2], xm12 - movu [r2 + r3], xm13 - movu [r2 + r3 * 2], xm0 - movu [r2 + r6], xm1 + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif lea r2, [r2 + r3 * 4] - sub r0, r7 - dec r8d - jnz .loop - RET + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + 
vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm1 %endif + RET %endmacro -FILTER_VER_LUMA_AVX2_16xN 16, 32 -FILTER_VER_LUMA_AVX2_16xN 16, 64 +FILTER_VER_CHROMA_S_AVX2_8x8 sp +FILTER_VER_CHROMA_S_AVX2_8x8 ss -%macro PROCESS_LUMA_AVX2_W16_16R 0 +%macro PROCESS_CHROMA_S_AVX2_W8_16R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] + pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] + pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] lea r7, [r0 + r1 * 4] movu xm4, [r7] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] movu xm5, [r7 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - 
punpcklbw xm5, xm6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - pmaddubsw m8, [r5] - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - pmaddubsw m9, [r5] - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - pmaddubsw m12, m10, [r5 + 1 * mmsize] - paddw m8, m12 - pmaddubsw m10, [r5] - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 - 
pmaddubsw m13, m11, [r5 + 1 * mmsize] - paddw m9, m13 - pmaddubsw m11, [r5] - - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] +%ifidn %1,sp + paddd m0, m9 + paddd m1, m9 + paddd m2, m9 + paddd m3, m9 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else vpermq m0, m0, 11011000b vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b vextracti128 xm1, m0, 1 vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 movu [r2], xm0 movu [r2 + r3], xm1 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 - lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 - - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - pmaddubsw m0, m12, [r5 + 2 * mmsize] - paddw m8, m0 - pmaddubsw m0, m12, [r5 + 1 * mmsize] - paddw m10, m0 - pmaddubsw m12, [r5] - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmaddubsw m1, m13, [r5 + 2 * mmsize] - paddw m9, m1 - pmaddubsw m1, m13, [r5 + 1 * mmsize] - paddw m11, m1 - pmaddubsw m13, [r5] +%endif - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + 
pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm0, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm0, 1 + pmaddwd m0, m7, [r5 + 1 * mmsize] + paddd m5, m0 + pmaddwd m7, [r5] + movu xm0, [r7 + r1] ; m0 = row 9 + punpckhwd xm1, xm8, xm0 + punpcklwd xm8, xm0 + vinserti128 m8, m8, xm1, 1 + pmaddwd m1, m8, [r5 + 1 * mmsize] + paddd m6, m1 + pmaddwd m8, [r5] + movu xm1, [r7 + r1 * 2] ; m1 = row 10 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m7, m2 + pmaddwd m0, [r5] +%ifidn %1,sp + paddd m4, m9 + paddd m5, m9 + psrad m4, 12 + psrad m5, 12 + paddd m6, m9 + paddd m7, m9 + psrad m6, 12 + psrad m7, 12 +%else + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m4, m5 + packssdw m6, m7 + lea r8, [r2 + r3 * 4] +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 movu [r8 + r3 * 2], xm6 movu [r8 + r6], xm7 - lea r8, [r8 + r3 * 4] +%endif - movu xm1, [r7 + r4] ; m1 = row 15 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 - vinserti128 m0, m0, xm2, 1 - pmaddubsw m2, m0, [r5 + 3 * mmsize] - paddw m8, m2 - pmaddubsw m2, m0, [r5 + 2 * mmsize] - paddw m10, m2 - pmaddubsw m2, m0, [r5 + 1 * mmsize] - paddw m12, m2 - pmaddubsw m0, [r5] + movu xm2, [r7 + r4] ; m2 = row 11 + punpckhwd xm4, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm4, 1 + pmaddwd m4, m1, [r5 + 1 * mmsize] + paddd m8, m4 + pmaddwd m1, [r5] lea r7, [r7 + r1 * 4] - movu xm2, [r7] ; m2 = row 16 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 3 * mmsize] - paddw m9, m3 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw 
m11, m3 - pmaddubsw m3, m1, [r5 + 1 * mmsize] - paddw m13, m3 - pmaddubsw m1, [r5] - movu xm3, [r7 + r1] ; m3 = row 17 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 - vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 3 * mmsize] - paddw m10, m4 - pmaddubsw m4, m2, [r5 + 2 * mmsize] - paddw m12, m4 - pmaddubsw m2, [r5 + 1 * mmsize] - paddw m0, m2 - movu xm4, [r7 + r1 * 2] ; m4 = row 18 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 - vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 3 * mmsize] - paddw m11, m5 - pmaddubsw m5, m3, [r5 + 2 * mmsize] - paddw m13, m5 - pmaddubsw m3, [r5 + 1 * mmsize] - paddw m1, m3 - movu xm5, [r7 + r4] ; m5 = row 19 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 + movu xm4, [r7] ; m4 = row 12 + punpckhwd xm5, xm2, xm4 + punpcklwd xm2, xm4 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m0, m5 + pmaddwd m2, [r5] + movu xm5, [r7 + r1] ; m5 = row 13 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 3 * mmsize] - paddw m12, m6 - pmaddubsw m4, [r5 + 2 * mmsize] - paddw m0, m4 - lea r7, [r7 + r1 * 4] - movu xm6, [r7] ; m6 = row 20 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 14 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, m5, [r5 + 3 * mmsize] - paddw m13, m7 - pmaddubsw m5, [r5 + 2 * mmsize] - paddw m1, m5 - movu xm7, [r7 + r1] ; m7 = row 21 - punpckhbw xm2, xm6, xm7 - punpcklbw xm6, xm7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m2, m7 + pmaddwd m5, [r5] +%ifidn %1,sp + paddd m8, m9 + paddd m0, m9 + paddd m1, m9 + paddd m2, m9 + psrad m8, 12 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 +%else + psrad m8, 6 + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 +%endif + packssdw m8, m0 + packssdw m1, m2 + lea r8, [r8 + r3 * 4] +%ifidn %1,sp + packuswb m8, m1 + vpermd m8, m3, m8 + vextracti128 xm1, m8, 1 + movq [r8], xm8 
+ movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm1 + movhps [r8 + r6], xm1 +%else + vpermq m8, m8, 11011000b + vpermq m1, m1, 11011000b + vextracti128 xm0, m8, 1 + vextracti128 xm2, m1, 1 + movu [r8], xm8 + movu [r8 + r3], xm0 + movu [r8 + r3 * 2], xm1 + movu [r8 + r6], xm2 +%endif + lea r8, [r8 + r3 * 4] + + movu xm7, [r7 + r4] ; m7 = row 15 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 vinserti128 m6, m6, xm2, 1 - pmaddubsw m6, [r5 + 3 * mmsize] - paddw m0, m6 - movu xm2, [r7 + r1 * 2] ; m2 = row 22 - punpckhbw xm3, xm7, xm2 - punpcklbw xm7, xm2 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m7, [r5 + 3 * mmsize] - paddw m1, m7 + pmaddwd m2, m6, [r5 + 1 * mmsize] + paddd m4, m2 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm1, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm1, 1 + pmaddwd m1, m7, [r5 + 1 * mmsize] + paddd m5, m1 + pmaddwd m7, [r5] + movu xm1, [r7 + r1] ; m1 = row 17 + punpckhwd xm0, xm2, xm1 + punpcklwd xm2, xm1 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m6, m2 + movu xm0, [r7 + r1 * 2] ; m0 = row 18 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m7, m1 + +%ifidn %1,sp + paddd m4, m9 + paddd m5, m9 + paddd m6, m9 + paddd m7, m9 + psrad m4, 12 + psrad m5, 12 + psrad m6, 12 + psrad m7, 12 +%else + psrad m4, 6 + psrad m5, 6 + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m4, m5 + packssdw m6, m7 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_Nx16 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_%2x16, 4, 10, 10 + mov r4d, r4m + 
shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m9, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro - pmulhrsw m8, m14 ; m8 = word: row 8 - pmulhrsw m9, m14 ; m9 = word: row 9 - pmulhrsw m10, m14 ; m10 = word: row 10 - pmulhrsw m11, m14 ; m11 = word: row 11 - pmulhrsw m12, m14 ; m12 = word: row 12 - pmulhrsw m13, m14 ; m13 = word: row 13 - pmulhrsw m0, m14 ; m0 = word: row 14 - pmulhrsw m1, m14 ; m1 = word: row 15 - packuswb m8, m9 - packuswb m10, m11 - packuswb m12, m13 - packuswb m0, m1 - vpermq m8, m8, 11011000b - vpermq m10, m10, 11011000b - vpermq m12, m12, 11011000b - vpermq m0, m0, 11011000b - vextracti128 xm9, m8, 1 - vextracti128 xm11, m10, 1 - vextracti128 xm13, m12, 1 - vextracti128 xm1, m0, 1 - movu [r8], xm8 - movu [r8 + r3], xm9 - movu [r8 + r3 * 2], xm10 - movu [r8 + r6], xm11 - lea r8, [r8 + r3 * 4] - movu [r8], xm12 - movu [r8 + r3], xm13 - movu [r8 + r3 * 2], xm0 - movu [r8 + r6], xm1 +FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16 +FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32 +FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16 +FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32 + +%macro FILTER_VER_CHROMA_S_AVX2_NxN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%3_%1x%2, 4, 11, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %3,sp + mova m9, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 16 +.loopH: + mov r10d, %1 / 8 +.loopW: + PROCESS_CHROMA_S_AVX2_W8_16R %3 +%ifidn %3,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r10d + jnz .loopW + lea r0, [r7 - 2 * %1 + 16] +%ifidn 
%3,sp + lea r2, [r8 + r3 * 4 - %1 + 8] +%else + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] +%endif + dec r9d + jnz .loopH + RET +%endif %endmacro -%macro PROCESS_LUMA_AVX2_W16_8R 0 +FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, sp +FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, sp +FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, sp +FILTER_VER_CHROMA_S_AVX2_NxN 16, 32, ss +FILTER_VER_CHROMA_S_AVX2_NxN 24, 32, ss +FILTER_VER_CHROMA_S_AVX2_NxN 32, 32, ss + +%macro PROCESS_CHROMA_S_AVX2_W8_4R 1 movu xm0, [r0] ; m0 = row 0 movu xm1, [r0 + r1] ; m1 = row 1 - punpckhbw xm2, xm0, xm1 - punpcklbw xm0, xm1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 vinserti128 m0, m0, xm2, 1 - pmaddubsw m0, [r5] + pmaddwd m0, [r5] movu xm2, [r0 + r1 * 2] ; m2 = row 2 - punpckhbw xm3, xm1, xm2 - punpcklbw xm1, xm2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 vinserti128 m1, m1, xm3, 1 - pmaddubsw m1, [r5] + pmaddwd m1, [r5] movu xm3, [r0 + r4] ; m3 = row 3 - punpckhbw xm4, xm2, xm3 - punpcklbw xm2, xm3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 vinserti128 m2, m2, xm4, 1 - pmaddubsw m4, m2, [r5 + 1 * mmsize] - paddw m0, m4 - pmaddubsw m2, [r5] - lea r7, [r0 + r1 * 4] - movu xm4, [r7] ; m4 = row 4 - punpckhbw xm5, xm3, xm4 - punpcklbw xm3, xm4 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 vinserti128 m3, m3, xm5, 1 - pmaddubsw m5, m3, [r5 + 1 * mmsize] - paddw m1, m5 - pmaddubsw m3, [r5] - movu xm5, [r7 + r1] ; m5 = row 5 - punpckhbw xm6, xm4, xm5 - punpcklbw xm4, xm5 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 vinserti128 m4, m4, xm6, 1 - pmaddubsw m6, m4, [r5 + 2 * mmsize] - paddw m0, m6 - pmaddubsw m6, m4, [r5 + 1 * mmsize] - paddw m2, m6 - pmaddubsw m4, [r5] - movu xm6, [r7 + r1 * 2] ; m6 = row 6 - punpckhbw xm7, xm5, xm6 - punpcklbw xm5, xm6 - vinserti128 m5, m5, xm7, 1 - pmaddubsw m7, 
m5, [r5 + 2 * mmsize] - paddw m1, m7 - pmaddubsw m7, m5, [r5 + 1 * mmsize] - paddw m3, m7 - pmaddubsw m5, [r5] - movu xm7, [r7 + r4] ; m7 = row 7 - punpckhbw xm8, xm6, xm7 - punpcklbw xm6, xm7 - vinserti128 m6, m6, xm8, 1 - pmaddubsw m8, m6, [r5 + 3 * mmsize] - paddw m0, m8 - pmaddubsw m8, m6, [r5 + 2 * mmsize] - paddw m2, m8 - pmaddubsw m8, m6, [r5 + 1 * mmsize] - paddw m4, m8 - pmaddubsw m6, [r5] - lea r7, [r7 + r1 * 4] - movu xm8, [r7] ; m8 = row 8 - punpckhbw xm9, xm7, xm8 - punpcklbw xm7, xm8 - vinserti128 m7, m7, xm9, 1 - pmaddubsw m9, m7, [r5 + 3 * mmsize] - paddw m1, m9 - pmaddubsw m9, m7, [r5 + 2 * mmsize] - paddw m3, m9 - pmaddubsw m9, m7, [r5 + 1 * mmsize] - paddw m5, m9 - pmaddubsw m7, [r5] - movu xm9, [r7 + r1] ; m9 = row 9 - punpckhbw xm10, xm8, xm9 - punpcklbw xm8, xm9 - vinserti128 m8, m8, xm10, 1 - pmaddubsw m10, m8, [r5 + 3 * mmsize] - paddw m2, m10 - pmaddubsw m10, m8, [r5 + 2 * mmsize] - paddw m4, m10 - pmaddubsw m10, m8, [r5 + 1 * mmsize] - paddw m6, m10 - movu xm10, [r7 + r1 * 2] ; m10 = row 10 - punpckhbw xm11, xm9, xm10 - punpcklbw xm9, xm10 - vinserti128 m9, m9, xm11, 1 - pmaddubsw m11, m9, [r5 + 3 * mmsize] - paddw m3, m11 - pmaddubsw m11, m9, [r5 + 2 * mmsize] - paddw m5, m11 - pmaddubsw m11, m9, [r5 + 1 * mmsize] - paddw m7, m11 - movu xm11, [r7 + r4] ; m11 = row 11 - punpckhbw xm12, xm10, xm11 - punpcklbw xm10, xm11 - vinserti128 m10, m10, xm12, 1 - pmaddubsw m12, m10, [r5 + 3 * mmsize] - paddw m4, m12 - pmaddubsw m12, m10, [r5 + 2 * mmsize] - paddw m6, m12 - lea r7, [r7 + r1 * 4] - movu xm12, [r7] ; m12 = row 12 - punpckhbw xm13, xm11, xm12 - punpcklbw xm11, xm12 - vinserti128 m11, m11, xm13, 1 - pmaddubsw m13, m11, [r5 + 3 * mmsize] - paddw m5, m13 - pmaddubsw m13, m11, [r5 + 2 * mmsize] - paddw m7, m13 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 +%ifidn %1,sp + paddd m0, 
m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endif +%endmacro + +%macro FILTER_VER_CHROMA_S_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + + PROCESS_CHROMA_S_AVX2_W8_4R %1 + lea r4, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif + RET +%endmacro + +FILTER_VER_CHROMA_S_AVX2_8x4 sp +FILTER_VER_CHROMA_S_AVX2_8x4 ss + +%macro FILTER_VER_CHROMA_S_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_12x16, 4, 9, 10 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m9, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + mova m7, m9 + PROCESS_CHROMA_AVX2_W4_16R %1 + RET +%endif +%endmacro + +FILTER_VER_CHROMA_S_AVX2_12x16 sp +FILTER_VER_CHROMA_S_AVX2_12x16 ss + +%macro FILTER_VER_CHROMA_S_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_4tap_vert_%1_16x12, 4, 9, 9 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, 
r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m8, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep 2 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 - pmulhrsw m0, m14 ; m0 = word: row 0 - pmulhrsw m1, m14 ; m1 = word: row 1 - pmulhrsw m2, m14 ; m2 = word: row 2 - pmulhrsw m3, m14 ; m3 = word: row 3 - pmulhrsw m4, m14 ; m4 = word: row 4 - pmulhrsw m5, m14 ; m5 = word: row 5 - packuswb m0, m1 - packuswb m2, m3 - packuswb m4, m5 + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else vpermq m0, m0, 
11011000b vpermq m2, m2, 11011000b - vpermq m4, m4, 11011000b - vextracti128 xm1, m0, 1 - vextracti128 xm3, m2, 1 - vextracti128 xm5, m4, 1 movu [r2], xm0 - movu [r2 + r3], xm1 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 movu [r2 + r3 * 2], xm2 movu [r2 + r6], xm3 +%endif lea r8, [r2 + r3 * 4] - movu [r8], xm4 - movu [r8 + r3], xm5 - - movu xm13, [r7 + r1] ; m13 = row 13 - punpckhbw xm0, xm12, xm13 - punpcklbw xm12, xm13 - vinserti128 m12, m12, xm0, 1 - pmaddubsw m0, m12, [r5 + 3 * mmsize] - paddw m6, m0 - movu xm0, [r7 + r1 * 2] ; m0 = row 14 - punpckhbw xm1, xm13, xm0 - punpcklbw xm13, xm0 - vinserti128 m13, m13, xm1, 1 - pmaddubsw m1, m13, [r5 + 3 * mmsize] - paddw m7, m1 - pmulhrsw m6, m14 ; m6 = word: row 6 - pmulhrsw m7, m14 ; m7 = word: row 7 - packuswb m6, m7 + movu xm1, [r7 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r7 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r7 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] + +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 
+%else + vpermq m4, m4, 11011000b vpermq m6, m6, 11011000b - vextracti128 xm7, m6, 1 + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm7 movu [r8 + r3 * 2], xm6 - movu [r8 + r6], xm7 + movu [r8 + r6], xm1 +%endif + lea r8, [r8 + r3 * 4] + + movu xm7, [r7 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r7, [r7 + r1 * 4] + movu xm1, [r7] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r7 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 + movu xm2, [r7 + r1 * 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m7, m4 +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r8], xm0 + movhps [r8 + r3], xm0 + movq [r8 + r3 * 2], xm5 + movhps [r8 + r6], xm5 + add r2, 8 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r8], xm0 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm5 + movu [r8 + r6], xm6 + add r2, 16 +%endif + add r0, 16 +%endrep + RET +%endif %endmacro +FILTER_VER_CHROMA_S_AVX2_16x12 sp +FILTER_VER_CHROMA_S_AVX2_16x12 ss + +%macro FILTER_VER_CHROMA_S_AVX2_16x4 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_24x32, 4, 11, 15 +cglobal interp_4tap_vert_%1_16x4, 4, 7, 8 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d 
%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif +%rep 2 + PROCESS_CHROMA_S_AVX2_W8_4R %1 lea r6, [r3 * 3] - lea r10, [r1 * 4] - mova m14, [pw_512] - mov r9d, 2 -.loopH: - PROCESS_LUMA_AVX2_W16_16R +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + add r2, 8 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 add r2, 16 - add r0, 16 - - movq xm1, [r0] ; m1 = row 0 - movq xm2, [r0 + r1] ; m2 = row 1 - punpcklbw xm1, xm2 - movq xm3, [r0 + r1 * 2] ; m3 = row 2 - punpcklbw xm2, xm3 - vinserti128 m5, m1, xm2, 1 - pmaddubsw m5, [r5] - movq xm4, [r0 + r4] ; m4 = row 3 - punpcklbw xm3, xm4 - lea r7, [r0 + r1 * 4] - movq xm1, [r7] ; m1 = row 4 - punpcklbw xm4, xm1 - vinserti128 m2, m3, xm4, 1 - pmaddubsw m0, m2, [r5 + 1 * mmsize] - paddw m5, m0 - pmaddubsw m2, [r5] - movq xm3, [r7 + r1] ; m3 = row 5 - punpcklbw xm1, xm3 - movq xm4, [r7 + r1 * 2] ; m4 = row 6 - punpcklbw xm3, xm4 - vinserti128 m1, m1, xm3, 1 - pmaddubsw m3, m1, [r5 + 2 * mmsize] - paddw m5, m3 - pmaddubsw m0, m1, [r5 + 1 * mmsize] - paddw m2, m0 - pmaddubsw m1, [r5] - movq xm3, [r7 + r4] ; m3 = row 7 - punpcklbw xm4, xm3 - lea r7, [r7 + r1 * 4] - movq xm0, [r7] ; m0 = row 8 - punpcklbw xm3, xm0 - vinserti128 m4, m4, xm3, 1 - pmaddubsw m3, m4, [r5 + 3 * mmsize] - paddw m5, m3 - pmaddubsw m3, m4, [r5 + 2 * mmsize] - paddw m2, m3 - pmaddubsw m3, m4, [r5 + 1 * mmsize] - paddw m1, m3 - pmaddubsw m4, [r5] - movq xm3, [r7 + r1] ; m3 = row 9 - punpcklbw xm0, xm3 - movq xm6, [r7 + r1 * 2] ; m6 = row 10 - punpcklbw xm3, xm6 - vinserti128 m0, m0, xm3, 1 - pmaddubsw m3, m0, [r5 + 3 * mmsize] - paddw m2, m3 - pmaddubsw m3, m0, [r5 + 2 * mmsize] - paddw m1, m3 - pmaddubsw m3, m0, [r5 + 1 * mmsize] - 
paddw m4, m3 - pmaddubsw m0, [r5] +%endif + lea r6, [4 * r1 - 16] + sub r0, r6 +%endrep + RET +%endmacro - movq xm3, [r7 + r4] ; m3 = row 11 - punpcklbw xm6, xm3 - lea r7, [r7 + r1 * 4] - movq xm7, [r7] ; m7 = row 12 - punpcklbw xm3, xm7 - vinserti128 m6, m6, xm3, 1 - pmaddubsw m3, m6, [r5 + 3 * mmsize] - paddw m1, m3 - pmaddubsw m3, m6, [r5 + 2 * mmsize] - paddw m4, m3 - pmaddubsw m3, m6, [r5 + 1 * mmsize] - paddw m0, m3 - pmaddubsw m6, [r5] - movq xm3, [r7 + r1] ; m3 = row 13 - punpcklbw xm7, xm3 - movq xm8, [r7 + r1 * 2] ; m8 = row 14 - punpcklbw xm3, xm8 - vinserti128 m7, m7, xm3, 1 - pmaddubsw m3, m7, [r5 + 3 * mmsize] - paddw m4, m3 - pmaddubsw m3, m7, [r5 + 2 * mmsize] - paddw m0, m3 - pmaddubsw m3, m7, [r5 + 1 * mmsize] - paddw m6, m3 - pmaddubsw m7, [r5] - movq xm3, [r7 + r4] ; m3 = row 15 - punpcklbw xm8, xm3 - lea r7, [r7 + r1 * 4] - movq xm9, [r7] ; m9 = row 16 - punpcklbw xm3, xm9 - vinserti128 m8, m8, xm3, 1 - pmaddubsw m3, m8, [r5 + 3 * mmsize] - paddw m0, m3 - pmaddubsw m3, m8, [r5 + 2 * mmsize] - paddw m6, m3 - pmaddubsw m3, m8, [r5 + 1 * mmsize] - paddw m7, m3 - pmaddubsw m8, [r5] - movq xm3, [r7 + r1] ; m3 = row 17 - punpcklbw xm9, xm3 - movq xm10, [r7 + r1 * 2] ; m10 = row 18 - punpcklbw xm3, xm10 - vinserti128 m9, m9, xm3, 1 - pmaddubsw m3, m9, [r5 + 3 * mmsize] - paddw m6, m3 - pmaddubsw m3, m9, [r5 + 2 * mmsize] - paddw m7, m3 - pmaddubsw m3, m9, [r5 + 1 * mmsize] - paddw m8, m3 - movq xm3, [r7 + r4] ; m3 = row 19 - punpcklbw xm10, xm3 - lea r7, [r7 + r1 * 4] - movq xm9, [r7] ; m9 = row 20 - punpcklbw xm3, xm9 - vinserti128 m10, m10, xm3, 1 - pmaddubsw m3, m10, [r5 + 3 * mmsize] - paddw m7, m3 - pmaddubsw m3, m10, [r5 + 2 * mmsize] - paddw m8, m3 - movq xm3, [r7 + r1] ; m3 = row 21 - punpcklbw xm9, xm3 - movq xm10, [r7 + r1 * 2] ; m10 = row 22 - punpcklbw xm3, xm10 - vinserti128 m9, m9, xm3, 1 - pmaddubsw m3, m9, [r5 + 3 * mmsize] - paddw m8, m3 +FILTER_VER_CHROMA_S_AVX2_16x4 sp +FILTER_VER_CHROMA_S_AVX2_16x4 ss - pmulhrsw m5, m14 ; m5 = 
word: row 0, row 1 - pmulhrsw m2, m14 ; m2 = word: row 2, row 3 - pmulhrsw m1, m14 ; m1 = word: row 4, row 5 - pmulhrsw m4, m14 ; m4 = word: row 6, row 7 - pmulhrsw m0, m14 ; m0 = word: row 8, row 9 - pmulhrsw m6, m14 ; m6 = word: row 10, row 11 - pmulhrsw m7, m14 ; m7 = word: row 12, row 13 - pmulhrsw m8, m14 ; m8 = word: row 14, row 15 - packuswb m5, m2 - packuswb m1, m4 - packuswb m0, m6 - packuswb m7, m8 - vextracti128 xm2, m5, 1 - vextracti128 xm4, m1, 1 - vextracti128 xm6, m0, 1 - vextracti128 xm8, m7, 1 - movq [r2], xm5 - movq [r2 + r3], xm2 - movhps [r2 + r3 * 2], xm5 +%macro PROCESS_CHROMA_S_AVX2_W8_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + 
vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif lea r8, [r2 + r3 * 4] - movq [r8], xm1 - movq [r8 + r3], xm4 - movhps [r8 + r3 * 2], xm1 - movhps [r8 + r6], xm4 - lea r8, [r8 + r3 * 4] - movq [r8], xm0 - movq [r8 + r3], xm6 - movhps [r8 + r3 * 2], xm0 - movhps [r8 + r6], xm6 - lea r8, [r8 + r3 * 4] - movq [r8], xm7 - movq [r8 + r3], xm8 - movhps [r8 + r3 * 2], xm7 - movhps [r8 + r6], xm8 - sub r7, r10 - lea r0, [r7 - 16] - lea r2, [r8 + r3 * 4 - 16] - dec r9d - jnz .loopH - RET + movu xm1, [r7 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m0 + lea r7, [r7 + r1 * 4] + movu xm0, [r7] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r7 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r7 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 
xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm7 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm1 %endif +%endmacro -%macro FILTER_VER_LUMA_AVX2_32xN 2 +%macro FILTER_VER_CHROMA_S_AVX2_Nx8 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15 +cglobal interp_4tap_vert_%1_%2x8, 4, 9, 8 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif lea r6, [r3 * 3] - lea r11, [r1 * 4] - mova m14, [pw_512] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 16 -.loopW: - PROCESS_LUMA_AVX2_W16_16R +%rep %2 / 8 + PROCESS_CHROMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else add r2, 16 +%endif add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 16] - lea r2, [r8 + r3 * 4 - 16] - dec r9d - jnz .loopH +%endrep + RET +%endif +%endmacro + +FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 32 +FILTER_VER_CHROMA_S_AVX2_Nx8 sp, 16 +FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 32 +FILTER_VER_CHROMA_S_AVX2_Nx8 ss, 16 + +%macro FILTER_VER_CHROMA_S_AVX2_8x2 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x2, 4, 6, 6 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m5, [pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 1 * mmsize] 
+ paddd m0, m2 + movu xm4, [r0 + r1 * 4] ; m4 = row 4 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m1, m3 +%ifidn %1,sp + paddd m0, m5 + paddd m1, m5 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 +%ifidn %1,sp + vextracti128 xm1, m0, 1 + packuswb xm0, xm1 + pshufd xm0, xm0, 11011000b + movq [r2], xm0 + movhps [r2 + r3], xm0 +%else + vpermq m0, m0, 11011000b + vextracti128 xm1, m0, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 +%endif RET +%endmacro + +FILTER_VER_CHROMA_S_AVX2_8x2 sp +FILTER_VER_CHROMA_S_AVX2_8x2 ss + +%macro FILTER_VER_CHROMA_S_AVX2_8x6 1 +INIT_YMM avx2 +cglobal interp_4tap_vert_%1_8x6, 4, 6, 8 + mov r4d, r4m + shl r4d, 6 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_ChromaCoeffV] + add r5, r4 +%else + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; 
m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m6, [r5 + 1 * mmsize] + paddd m4, m6 + movu xm6, [r0 + r1 * 4] ; m6 = row 8 + punpckhwd xm3, xm1, xm6 + punpcklwd xm1, xm6 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m5, m1 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + vextracti128 xm5, m4, 1 + packuswb xm4, xm5 + pshufd xm4, xm4, 11011000b + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movhps [r2 + r3], xm4 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vpermq m4, m4, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + vextracti128 xm5, m4, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 + lea r2, [r2 + r3 * 4] + movu [r2], xm4 + movu [r2 + r3], xm5 %endif + RET %endmacro -FILTER_VER_LUMA_AVX2_32xN 32, 32 -FILTER_VER_LUMA_AVX2_32xN 32, 64 +FILTER_VER_CHROMA_S_AVX2_8x6 sp +FILTER_VER_CHROMA_S_AVX2_8x6 ss +%macro FILTER_VER_CHROMA_S_AVX2_8xN 2 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_32x16, 4, 10, 15 +cglobal interp_4tap_vert_%1_8x%2, 4, 7, 9 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] +%endif + + lea r4, [r1 * 3] + sub r0, r1 
+%ifidn %1,sp + mova m8, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] +%rep %2 / 16 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m1, m8 + psrad m0, 12 + psrad m1, 12 +%else + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m0, m1 + + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m1 +%ifidn %1,sp + paddd m2, m8 + paddd m3, m8 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m3, [interp8_hps_shuf] + vpermd m0, m3, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + movu [r2], xm0 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2 + r3], xm0 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif + lea r2, [r2 + r3 * 4] + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm0, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm0, 1 + pmaddwd m0, m6, [r5 + 1 * mmsize] + 
pmaddwd m6, [r5] + paddd m4, m0 + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m8 + paddd m5, m8 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m6, m5 + pmaddwd m0, [r5] + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm7, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 1 * mmsize] + paddd m1, m7 + pmaddwd m2, [r5] + +%ifidn %1,sp + paddd m6, m8 + paddd m1, m8 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m3, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm7, m4, 1 + vextracti128 xm1, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm1 +%endif + lea r2, [r2 + r3 * 4] + + movu xm7, [r0 + r4] ; m7 = row 11 + punpckhwd xm1, xm5, xm7 + punpcklwd xm5, xm7 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + paddd m0, m1 + pmaddwd m5, [r5] + lea r0, [r0 + r1 * 4] + movu xm1, [r0] ; m1 = row 12 + punpckhwd xm4, xm7, xm1 + punpcklwd xm7, xm1 + vinserti128 m7, m7, xm4, 1 + pmaddwd m4, m7, [r5 + 1 * mmsize] + paddd m2, m4 + pmaddwd m7, [r5] +%ifidn %1,sp + paddd m0, m8 + paddd m2, m8 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movu xm4, [r0 + r1] ; m4 = row 13 + punpckhwd xm2, xm1, xm4 + punpcklwd xm1, xm4 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m1, [r5] + movu xm2, [r0 + r1 
* 2] ; m2 = row 14 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m7, m6 + pmaddwd m4, [r5] +%ifidn %1,sp + paddd m5, m8 + paddd m7, m8 + psrad m5, 12 + psrad m7, 12 +%else + psrad m5, 6 + psrad m7, 6 +%endif + packssdw m5, m7 +%ifidn %1,sp + packuswb m0, m5 + vpermd m0, m3, m0 + vextracti128 xm5, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm5 + movhps [r2 + r6], xm5 +%else + vpermq m0, m0, 11011000b + vpermq m5, m5, 11011000b + vextracti128 xm7, m0, 1 + vextracti128 xm6, m5, 1 + movu [r2], xm0 + movu [r2 + r3], xm7 + movu [r2 + r3 * 2], xm5 + movu [r2 + r6], xm6 +%endif + lea r2, [r2 + r3 * 4] + + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm5, 1 + pmaddwd m5, m2, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 16 + punpckhwd xm5, xm6, xm0 + punpcklwd xm6, xm0 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m6, [r5] +%ifidn %1,sp + paddd m1, m8 + paddd m4, m8 + psrad m1, 12 + psrad m4, 12 +%else + psrad m1, 6 + psrad m4, 6 +%endif + packssdw m1, m4 + + movu xm5, [r0 + r1] ; m5 = row 17 + punpckhwd xm4, xm0, xm5 + punpcklwd xm0, xm5 + vinserti128 m0, m0, xm4, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m2, m0 + movu xm4, [r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm0, xm5, xm4 + punpcklwd xm5, xm4 + vinserti128 m5, m5, xm0, 1 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m6, m5 +%ifidn %1,sp + paddd m2, m8 + paddd m6, m8 + psrad m2, 12 + psrad m6, 12 +%else + psrad m2, 6 + psrad m6, 6 +%endif + packssdw m2, m6 +%ifidn %1,sp + packuswb m1, m2 + vpermd m1, m3, m1 + vextracti128 xm2, m1, 1 + movq [r2], xm1 + movhps [r2 + r3], xm1 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m1, m1, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm6, m1, 1 + vextracti128 xm4, m2, 1 + movu [r2], xm1 + movu [r2 + r3], 
xm6 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm4 %endif - - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_16R - add r2, 16 - add r0, 16 - dec r9d - jnz .loopW + lea r2, [r2 + r3 * 4] +%endrep RET %endif +%endmacro + +FILTER_VER_CHROMA_S_AVX2_8xN sp, 16 +FILTER_VER_CHROMA_S_AVX2_8xN sp, 32 +FILTER_VER_CHROMA_S_AVX2_8xN ss, 16 +FILTER_VER_CHROMA_S_AVX2_8xN ss, 32 +%macro FILTER_VER_CHROMA_S_AVX2_32x24 1 INIT_YMM avx2 %if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_32x24, 4, 10, 15 +cglobal interp_4tap_vert_%1_32x24, 4, 10, 10 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 + sub r0, r1 +%ifidn %1,sp + mova m9, [pd_526336] +%else + add r3d, r3d +%endif lea r6, [r3 * 3] - mova m14, [pw_512] - mov r9d, 2 + mov r9d, 4 .loopW: - PROCESS_LUMA_AVX2_W16_16R + PROCESS_CHROMA_S_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else add r2, 16 +%endif add r0, 16 dec r9d jnz .loopW - lea r9, [r1 * 4] - sub r7, r9 - lea r0, [r7 - 16] - lea r2, [r8 + r3 * 4 - 16] - mov r9d, 2 +%ifidn %1,sp + lea r2, [r8 + r3 * 4 - 24] +%else + lea r2, [r8 + r3 * 4 - 48] +%endif + lea r0, [r7 - 48] + mova m7, m9 + mov r9d, 4 .loop: - PROCESS_LUMA_AVX2_W16_8R + PROCESS_CHROMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else add r2, 16 +%endif add r0, 16 dec r9d jnz .loop RET %endif +%endmacro + +FILTER_VER_CHROMA_S_AVX2_32x24 sp +FILTER_VER_CHROMA_S_AVX2_32x24 ss +%macro FILTER_VER_CHROMA_S_AVX2_2x8 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_32x8, 4, 10, 15 +cglobal interp_4tap_vert_%1_2x8, 4, 6, 7 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d + sub r0, r1 %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, 
[pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] - mov r9d, 2 -.loopW: - PROCESS_LUMA_AVX2_W16_8R - add r2, 16 - add r0, 16 - dec r9d - jnz .loopW - RET -%endif - -INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_48x64, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 +%ifidn %1,sp + mova m6, [pd_526336] %else - lea r5, [tab_LumaCoeffVer_32 + r4] + add r3d, r3d %endif - - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - lea r11, [r1 * 4] - mova m14, [pw_512] - mov r9d, 4 -.loopH: - mov r10d, 3 -.loopW: - PROCESS_LUMA_AVX2_W16_16R - add r2, 16 - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 32] - lea r2, [r8 + r3 * 4 - 32] - dec r9d - jnz .loopH - RET + movd xm0, [r0] + movd xm1, [r0 + r1] + punpcklwd xm0, xm1 + movd xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + punpcklqdq xm0, xm1 ; m0 = [2 1 1 0] + movd xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movd xm4, [r0] + punpcklwd xm3, xm4 + punpcklqdq xm2, xm3 ; m2 = [4 3 3 2] + vinserti128 m0, m0, xm2, 1 ; m0 = [4 3 3 2 2 1 1 0] + movd xm1, [r0 + r1] + punpcklwd xm4, xm1 + movd xm3, [r0 + r1 * 2] + punpcklwd xm1, xm3 + punpcklqdq xm4, xm1 ; m4 = [6 5 5 4] + vinserti128 m2, m2, xm4, 1 ; m2 = [6 5 5 4 4 3 3 2] + pmaddwd m0, [r5] + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movd xm1, [r0 + r4] + punpcklwd xm3, xm1 + lea r0, [r0 + 4 * r1] + movd xm2, [r0] + punpcklwd xm1, xm2 + punpcklqdq xm3, xm1 ; m3 = [8 7 7 6] + vinserti128 m4, m4, xm3, 1 ; m4 = [8 7 7 6 6 5 5 4] + movd xm1, [r0 + r1] + punpcklwd xm2, xm1 + movd xm5, [r0 + r1 * 2] + punpcklwd xm1, xm5 + punpcklqdq xm2, xm1 ; m2 = [10 9 9 8] + vinserti128 m3, m3, xm2, 1 ; m3 = [10 9 9 8 8 7 7 6] + pmaddwd m4, [r5] + pmaddwd m3, [r5 + 1 * mmsize] + paddd m4, m3 +%ifidn %1,sp + paddd m0, m6 + paddd m4, m6 + psrad m0, 12 + psrad m4, 12 +%else + psrad m0, 6 + psrad m4, 6 %endif - -%macro FILTER_VER_LUMA_AVX2_64xN 2 
-INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_%1x%2, 4, 12, 15 - mov r4d, r4m - shl r4d, 7 - -%ifdef PIC - lea r5, [tab_LumaCoeffVer_32] - add r5, r4 + packssdw m0, m4 + vextracti128 xm4, m0, 1 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb xm0, xm4 + pextrw [r2], xm0, 0 + pextrw [r2 + r3], xm0, 1 + pextrw [r2 + 2 * r3], xm0, 4 + pextrw [r2 + r4], xm0, 5 + lea r2, [r2 + r3 * 4] + pextrw [r2], xm0, 2 + pextrw [r2 + r3], xm0, 3 + pextrw [r2 + 2 * r3], xm0, 6 + pextrw [r2 + r4], xm0, 7 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + movd [r2], xm0 + pextrd [r2 + r3], xm0, 1 + movd [r2 + 2 * r3], xm4 + pextrd [r2 + r4], xm4, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm0, 3 + pextrd [r2 + 2 * r3], xm4, 2 + pextrd [r2 + r4], xm4, 3 %endif - - lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - lea r11, [r1 * 4] - mova m14, [pw_512] - mov r9d, %2 / 16 -.loopH: - mov r10d, %1 / 16 -.loopW: - PROCESS_LUMA_AVX2_W16_16R - add r2, 16 - add r0, 16 - dec r10d - jnz .loopW - sub r7, r11 - lea r0, [r7 - 48] - lea r2, [r8 + r3 * 4 - 48] - dec r9d - jnz .loopH RET -%endif %endmacro -FILTER_VER_LUMA_AVX2_64xN 64, 32 -FILTER_VER_LUMA_AVX2_64xN 64, 48 -FILTER_VER_LUMA_AVX2_64xN 64, 64 +FILTER_VER_CHROMA_S_AVX2_2x8 sp +FILTER_VER_CHROMA_S_AVX2_2x8 ss +%macro FILTER_VER_CHROMA_S_AVX2_6x8 1 INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal interp_8tap_vert_pp_64x16, 4, 10, 15 +cglobal interp_4tap_vert_%1_6x8, 4, 6, 8 mov r4d, r4m - shl r4d, 7 + shl r4d, 6 + add r1d, r1d %ifdef PIC - lea r5, [tab_LumaCoeffVer_32] + lea r5, [pw_ChromaCoeffV] add r5, r4 %else - lea r5, [tab_LumaCoeffVer_32 + r4] + lea r5, [pw_ChromaCoeffV + r4] %endif lea r4, [r1 * 3] - sub r0, r4 - lea r6, [r3 * 3] - mova m14, [pw_512] - mov r9d, 4 -.loopW: - PROCESS_LUMA_AVX2_W16_16R - add r2, 16 - add r0, 16 - dec r9d - jnz .loopW - RET -%endif - -;------------------------------------------------------------------------------------------------------------- -; void 
interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA 3 -INIT_XMM sse4 -cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 -%ifidn %3,ps - add r3d, r3d -%endif - -%ifdef PIC - lea r5, [tab_LumaCoeffVer] - lea r6, [r5 + r4] + sub r0, r1 +%ifidn %1,sp + mova m7, [pd_526336] %else - lea r6, [tab_LumaCoeffVer + r4] + add r3d, r3d %endif -%ifidn %3,pp - mova m3, [pw_512] + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + psrad m0, 12 + psrad m1, 12 %else - mova m3, [pw_2000] + psrad m0, 6 + psrad m1, 6 %endif - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/8) -.loopW: - PROCESS_LUMA_W8_4R -%ifidn %3,pp - pmulhrsw m7, m3 - pmulhrsw m6, m3 - pmulhrsw m5, m3 - pmulhrsw m4, m3 + packssdw m0, m1 - packuswb m7, m6 - packuswb m5, m4 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm1, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm1, 1 + pmaddwd m1, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, 
m1 +%ifidn %1,sp + paddd m2, m7 + paddd m3, m7 + psrad m2, 12 + psrad m3, 12 +%else + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m2, m3 + + movu xm1, [r0 + r4] ; m1 = row 7 + punpckhwd xm3, xm6, xm1 + punpcklwd xm6, xm1 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m3 - movlps [r2], m7 - movhps [r2 + r3], m7 - lea r5, [r2 + 2 * r3] - movlps [r5], m5 - movhps [r5 + r3], m5 + lea r4, [r3 * 3] +%ifidn %1,sp + packuswb m0, m2 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + pextrw [r2 + 4], xm2, 0 + pextrd [r2 + r3], xm0, 1 + pextrw [r2 + r3 + 4], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 2 + pextrw [r2 + r3 * 2 + 4], xm2, 4 + pextrd [r2 + r4], xm0, 3 + pextrw [r2 + r4 + 4], xm2, 6 %else - psubw m7, m3 - psubw m6, m3 - psubw m5, m3 - psubw m4, m3 - - movu [r2], m7 - movu [r2 + r3], m6 - lea r5, [r2 + 2 * r3] - movu [r5], m5 - movu [r5 + r3], m4 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 + vextracti128 xm0, m0, 1 + vextracti128 xm3, m2, 1 + movd [r2 + 8], xm0 + pextrd [r2 + r3 + 8], xm0, 2 + movd [r2 + r3 * 2 + 8], xm3 + pextrd [r2 + r4 + 8], xm3, 2 %endif - - lea r5, [8 * r1 - 8] - sub r0, r5 -%ifidn %3,pp - add r2, 8 + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + movu xm0, [r0] ; m0 = row 8 + punpckhwd xm2, xm1, xm0 + punpcklwd xm1, xm0 + vinserti128 m1, m1, xm2, 1 + pmaddwd m2, m1, [r5 + 1 * mmsize] + pmaddwd m1, [r5] + paddd m5, m2 +%ifidn %1,sp + paddd m4, m7 + paddd m5, m7 + psrad m4, 12 + psrad m5, 12 +%else + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m4, m5 + + movu xm2, [r0 + r1] ; m2 = row 9 + punpckhwd xm5, xm0, xm2 + punpcklwd xm0, xm2 + vinserti128 m0, m0, xm5, 1 + pmaddwd m0, [r5 + 1 * mmsize] + paddd m6, m0 + movu xm5, [r0 + r1 * 2] ; m5 = row 10 + punpckhwd xm0, xm2, xm5 + punpcklwd xm2, xm5 + vinserti128 m2, m2, xm0, 1 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m6, m7 + paddd m1, m7 + psrad m6, 12 + psrad m1, 12 +%else + psrad m6, 
6 + psrad m1, 6 +%endif + packssdw m6, m1 +%ifidn %1,sp + packuswb m4, m6 + vextracti128 xm6, m4, 1 + movd [r2], xm4 + pextrw [r2 + 4], xm6, 0 + pextrd [r2 + r3], xm4, 1 + pextrw [r2 + r3 + 4], xm6, 2 + pextrd [r2 + r3 * 2], xm4, 2 + pextrw [r2 + r3 * 2 + 4], xm6, 4 + pextrd [r2 + r4], xm4, 3 + pextrw [r2 + r4 + 4], xm6, 6 %else - add r2, 16 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 + vextracti128 xm5, m4, 1 + vextracti128 xm1, m6, 1 + movd [r2 + 8], xm5 + pextrd [r2 + r3 + 8], xm5, 2 + movd [r2 + r3 * 2 + 8], xm1 + pextrd [r2 + r4 + 8], xm1, 2 %endif - dec r4d - jnz .loopW + RET +%endmacro - lea r0, [r0 + 4 * r1 - %1] -%ifidn %3,pp - lea r2, [r2 + 4 * r3 - %1] +FILTER_VER_CHROMA_S_AVX2_6x8 sp +FILTER_VER_CHROMA_S_AVX2_6x8 ss + +;--------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W2_4R 2 +INIT_XMM sse4 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] %else - lea r2, [r2 + 4 * r3 - 2 * %1] + lea r5, [tab_ChromaCoeffV + r4] %endif - dec dword [rsp] + mov r4d, (%2/4) + +.loopH: + PROCESS_CHROMA_SP_W2_4R r5 + + psrad m0, 6 + psrad m2, 6 + + packssdw m0, m2 + + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 + + lea r2, [r2 + 2 * r3] + + dec r4d jnz .loopH RET %endmacro -FILTER_VER_LUMA 16, 4, pp -FILTER_VER_LUMA 16, 8, pp -FILTER_VER_LUMA 16, 12, pp -FILTER_VER_LUMA 16, 16, pp -FILTER_VER_LUMA 16, 32, pp -FILTER_VER_LUMA 16, 64, pp -FILTER_VER_LUMA 24, 32, pp -FILTER_VER_LUMA 32, 8, pp -FILTER_VER_LUMA 32, 16, pp 
-FILTER_VER_LUMA 32, 24, pp -FILTER_VER_LUMA 32, 32, pp -FILTER_VER_LUMA 32, 64, pp -FILTER_VER_LUMA 48, 64, pp -FILTER_VER_LUMA 64, 16, pp -FILTER_VER_LUMA 64, 32, pp -FILTER_VER_LUMA 64, 48, pp -FILTER_VER_LUMA 64, 64, pp +FILTER_VER_CHROMA_SS_W2_4R 2, 4 +FILTER_VER_CHROMA_SS_W2_4R 2, 8 -FILTER_VER_LUMA 16, 4, ps -FILTER_VER_LUMA 16, 8, ps -FILTER_VER_LUMA 16, 12, ps -FILTER_VER_LUMA 16, 16, ps -FILTER_VER_LUMA 16, 32, ps -FILTER_VER_LUMA 16, 64, ps -FILTER_VER_LUMA 24, 32, ps -FILTER_VER_LUMA 32, 8, ps -FILTER_VER_LUMA 32, 16, ps -FILTER_VER_LUMA 32, 24, ps -FILTER_VER_LUMA 32, 32, ps -FILTER_VER_LUMA 32, 64, ps -FILTER_VER_LUMA 48, 64, ps -FILTER_VER_LUMA 64, 16, ps -FILTER_VER_LUMA 64, 32, ps -FILTER_VER_LUMA 64, 48, ps -FILTER_VER_LUMA 64, 64, ps +FILTER_VER_CHROMA_SS_W2_4R 2, 16 + +;--------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;--------------------------------------------------------------------------------------------------------------- +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 + + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 + +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif -%macro PROCESS_LUMA_SP_W4_4R 0 movq m0, [r0] movq m1, [r0 + r1] punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 - - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 - - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 + pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 - - 
movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m6, m4, [r6 + 1 * 16] - paddd m2, m6 ;m2=[2+3+4+5] Row3 - pmaddwd m4, [r6 + 2 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + movq m2, [r0] + punpcklwd m1, m2 ;m1=[1 2] + pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m6, m5, [r6 + 1 * 16] - paddd m3, m6 ;m3=[3+4+5+6] Row4 - pmaddwd m5, [r6 + 2 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + movq m3, [r0 + r1] + punpcklwd m2, m3 ;m4=[2 3] + pmaddwd m2, [r5 + 1 * 16] + paddd m0, m2 ;m0=[0+1+2+3] Row1 done + psrad m0, 6 - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[6 7] - pmaddwd m6, m4, [r6 + 2 * 16] - paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 - pmaddwd m4, [r6 + 3 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + movq m2, [r0 + 2 * r1] + punpcklwd m3, m2 ;m5=[3 4] + pmaddwd m3, [r5 + 1 * 16] + paddd m1, m3 ;m1=[1+2+3+4] Row2 done + psrad m1, 6 - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[7 8] - pmaddwd m6, m5, [r6 + 2 * 16] - paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 - pmaddwd m5, [r6 + 3 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + packssdw m0, m1 - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[8 9] - pmaddwd m4, [r6 + 3 * 16] - paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + movlps [r2], m0 + movhps [r2 + r3], m0 - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[9 10] - pmaddwd m5, [r6 + 3 * 16] - paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end -%endmacro + RET -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SP 2 +;------------------------------------------------------------------------------------------------------------------- +; void 
interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;------------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W6_H4 2 INIT_XMM sse4 -cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize +cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 add r1d, r1d - lea r5, [r1 + 2 * r1] - sub r0, r5 - shl r4d, 6 + add r3d, r3d + sub r0, r1 + shl r4d, 5 %ifdef PIC - lea r5, [tab_LumaCoeffV] + lea r5, [tab_ChromaCoeffV] lea r6, [r5 + r4] %else - lea r6, [tab_LumaCoeffV + r4] + lea r6, [tab_ChromaCoeffV + r4] %endif - mova m7, [tab_c_526336] + mov r4d, %2/4 - mov dword [rsp], %2/4 .loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_LUMA_SP_W4_4R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 + PROCESS_CHROMA_SP_W4_4R - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 packssdw m0, m1 packssdw m2, m3 - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 + movlps [r2], m0 + movhps [r2 + r3], m0 lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 + movlps [r5], m2 + movhps [r5 + r3], m2 - lea r5, [8 * r1 - 2 * 4] + lea r5, [4 * r1 - 2 * 4] sub r0, r5 - add r2, 4 + add r2, 2 * 4 - dec r4d - jnz .loopW + PROCESS_CHROMA_SP_W2_4R r6 - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - %1] + psrad m0, 6 + psrad m2, 6 - dec dword [rsp] - jnz .loopH + packssdw m0, m2 - RET -%endmacro + movd [r2], m0 + pextrd [r2 + r3], m0, 1 + lea r2, [r2 + 2 * r3] + pextrd [r2], m0, 2 + pextrd [r2 + r3], m0, 3 -;-------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- - FILTER_VER_LUMA_SP 4, 4 
- FILTER_VER_LUMA_SP 8, 8 - FILTER_VER_LUMA_SP 8, 4 - FILTER_VER_LUMA_SP 4, 8 - FILTER_VER_LUMA_SP 16, 16 - FILTER_VER_LUMA_SP 16, 8 - FILTER_VER_LUMA_SP 8, 16 - FILTER_VER_LUMA_SP 16, 12 - FILTER_VER_LUMA_SP 12, 16 - FILTER_VER_LUMA_SP 16, 4 - FILTER_VER_LUMA_SP 4, 16 - FILTER_VER_LUMA_SP 32, 32 - FILTER_VER_LUMA_SP 32, 16 - FILTER_VER_LUMA_SP 16, 32 - FILTER_VER_LUMA_SP 32, 24 - FILTER_VER_LUMA_SP 24, 32 - FILTER_VER_LUMA_SP 32, 8 - FILTER_VER_LUMA_SP 8, 32 - FILTER_VER_LUMA_SP 64, 64 - FILTER_VER_LUMA_SP 64, 32 - FILTER_VER_LUMA_SP 32, 64 - FILTER_VER_LUMA_SP 64, 48 - FILTER_VER_LUMA_SP 48, 64 - FILTER_VER_LUMA_SP 64, 16 - FILTER_VER_LUMA_SP 16, 64 + sub r0, 2 * 4 + lea r2, [r2 + 2 * r3 - 2 * 4] + + dec r4d + jnz .loopH -; TODO: combin of U and V is more performance, but need more register -; TODO: use two path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it -INIT_XMM ssse3 -cglobal chroma_p2s, 3, 7, 4 + RET +%endmacro - ; load width and height - mov r3d, r3m - mov r4d, r4m +FILTER_VER_CHROMA_SS_W6_H4 6, 8 - ; load constant - mova m2, [pb_128] - mova m3, [tab_c_64_n64] +FILTER_VER_CHROMA_SS_W6_H4 6, 16 -.loopH: - xor r5d, r5d -.loopW: - lea r6, [r0 + r5] +;---------------------------------------------------------------------------------------------------------------- +; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;---------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_CHROMA_SS_W8_H2 2 +INIT_XMM sse2 +cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 - movh m0, [r6] - punpcklbw m0, m2 - pmaddubsw m0, m3 + add r1d, r1d + add r3d, r3d + sub r0, r1 + shl r4d, 5 - movh m1, [r6 + r1] - punpcklbw m1, m2 - pmaddubsw m1, m3 +%ifdef PIC + lea r5, [tab_ChromaCoeffV] + lea r5, [r5 + r4] +%else + lea r5, [tab_ChromaCoeffV + r4] +%endif - add r5d, 8 - cmp r5d, r3d - lea 
r6, [r2 + r5 * 2] - jg .width4 - movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0 - movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1 - je .nextH - jmp .loopW + mov r4d, %2/2 +.loopH: + PROCESS_CHROMA_SP_W8_2R -.width4: - test r3d, 4 - jz .width2 - test r3d, 2 - movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 - movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1 - lea r6, [r6 + 8] - pshufd m0, m0, 2 - pshufd m1, m1, 2 - jz .nextH + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 -.width2: - movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 - movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1 + packssdw m0, m1 + packssdw m2, m3 -.nextH: - lea r0, [r0 + r1 * 2] - add r2, FENC_STRIDE / 2 * 4 + movu [r2], m0 + movu [r2 + r3], m2 - sub r4d, 2 - jnz .loopH + lea r2, [r2 + 2 * r3] + + dec r4d + jnz .loopH RET +%endmacro -%macro PROCESS_CHROMA_SP_W4_4R 0 +FILTER_VER_CHROMA_SS_W8_H2 8, 2 +FILTER_VER_CHROMA_SS_W8_H2 8, 4 +FILTER_VER_CHROMA_SS_W8_H2 8, 6 +FILTER_VER_CHROMA_SS_W8_H2 8, 8 +FILTER_VER_CHROMA_SS_W8_H2 8, 16 +FILTER_VER_CHROMA_SS_W8_H2 8, 32 + +FILTER_VER_CHROMA_SS_W8_H2 8, 12 +FILTER_VER_CHROMA_SS_W8_H2 8, 64 + +;----------------------------------------------------------------------------------------------------------------- +; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) +;----------------------------------------------------------------------------------------------------------------- +%macro FILTER_VER_LUMA_SS 2 +INIT_XMM sse2 +cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize + + add r1d, r1d + add r3d, r3d + lea r5, [3 * r1] + sub r0, r5 + shl r4d, 6 + +%ifdef PIC + lea r5, [tab_LumaCoeffV] + lea r6, [r5 + r4] +%else + lea r6, [tab_LumaCoeffV + r4] +%endif + + mov dword [rsp], %2/4 +.loopH: + mov r4d, (%1/4) +.loopW: movq m0, [r0] movq m1, [r0 + r1] punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, 
[r6 + 0 *16] ;m1=[1+2] Row2 + pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 + pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 done + paddd m0, m4 ;m0=[0+1+2+3] Row1 lea r0, [r0 + 2 * r1] movq m4, [r0] punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 + pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 + paddd m1, m5 ;m1 = [1+2+3+4] Row2 movq m5, [r0 + r1] punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m4, [r6 + 1 * 16] - paddd m2, m4 ;m2=[2+3+4+5] Row3 + pmaddwd m6, m4, [r6 + 1 * 16] + paddd m2, m6 ;m2=[2+3+4+5] Row3 + pmaddwd m4, [r6 + 2 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 - movq m4, [r0 + 2 * r1] + lea r0, [r0 + 2 * r1] + movq m4, [r0] punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m5, [r6 + 1 * 16] - paddd m3, m5 ;m3=[3+4+5+6] Row4 -%endmacro - -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize - - add r1d, r1d - sub r0, r1 - shl r4d, 5 - -%ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] -%else - lea r6, [tab_ChromaCoeffV + r4] -%endif - - mova m6, [tab_c_526336] - - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_CHROMA_SP_W4_4R - - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [4 * r1 - 
2 * 4] - sub r0, r5 - add r2, 4 - - dec r4d - jnz .loopW - - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - %1] + pmaddwd m6, m5, [r6 + 1 * 16] + paddd m3, m6 ;m3=[3+4+5+6] Row4 + pmaddwd m5, [r6 + 2 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 - dec dword [rsp] - jnz .loopH + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[6 7] + pmaddwd m6, m4, [r6 + 2 * 16] + paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 + pmaddwd m4, [r6 + 3 * 16] + paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end + psrad m0, 6 - RET -%endmacro + lea r0, [r0 + 2 * r1] + movq m4, [r0] + punpcklwd m5, m4 ;m5=[7 8] + pmaddwd m6, m5, [r6 + 2 * 16] + paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 + pmaddwd m5, [r6 + 3 * 16] + paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end + psrad m1, 6 - FILTER_VER_CHROMA_SP 4, 4 - FILTER_VER_CHROMA_SP 4, 8 - FILTER_VER_CHROMA_SP 16, 16 - FILTER_VER_CHROMA_SP 16, 8 - FILTER_VER_CHROMA_SP 16, 12 - FILTER_VER_CHROMA_SP 12, 16 - FILTER_VER_CHROMA_SP 16, 4 - FILTER_VER_CHROMA_SP 4, 16 - FILTER_VER_CHROMA_SP 32, 32 - FILTER_VER_CHROMA_SP 32, 16 - FILTER_VER_CHROMA_SP 16, 32 - FILTER_VER_CHROMA_SP 32, 24 - FILTER_VER_CHROMA_SP 24, 32 - FILTER_VER_CHROMA_SP 32, 8 + packssdw m0, m1 - FILTER_VER_CHROMA_SP 16, 24 - FILTER_VER_CHROMA_SP 16, 64 - FILTER_VER_CHROMA_SP 12, 32 - FILTER_VER_CHROMA_SP 4, 32 - FILTER_VER_CHROMA_SP 32, 64 - FILTER_VER_CHROMA_SP 32, 48 - FILTER_VER_CHROMA_SP 24, 64 + movlps [r2], m0 + movhps [r2 + r3], m0 - FILTER_VER_CHROMA_SP 64, 64 - FILTER_VER_CHROMA_SP 64, 32 - FILTER_VER_CHROMA_SP 64, 48 - FILTER_VER_CHROMA_SP 48, 64 - FILTER_VER_CHROMA_SP 64, 16 + movq m5, [r0 + r1] + punpcklwd m4, m5 ;m4=[8 9] + pmaddwd m4, [r6 + 3 * 16] + paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end + psrad m2, 6 + movq m4, [r0 + 2 * r1] + punpcklwd m5, m4 ;m5=[9 10] + pmaddwd m5, [r6 + 3 * 16] + paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end + psrad m3, 6 -%macro PROCESS_CHROMA_SP_W2_4R 1 - movd m0, [r0] - movd m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] + packssdw m2, m3 - lea r0, [r0 + 2 * r1] - movd 
m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - punpcklqdq m0, m1 ;m0=[0 1 1 2] - pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2 + movlps [r2 + 2 * r3], m2 + lea r5, [3 * r3] + movhps [r2 + r5], m2 - movd m1, [r0 + r1] - punpcklwd m2, m1 ;m2=[2 3] + lea r5, [8 * r1 - 2 * 4] + sub r0, r5 + add r2, 2 * 4 - lea r0, [r0 + 2 * r1] - movd m3, [r0] - punpcklwd m1, m3 ;m2=[3 4] - punpcklqdq m2, m1 ;m2=[2 3 3 4] + dec r4d + jnz .loopW - pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2 - pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4 - paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 + lea r0, [r0 + 4 * r1 - 2 * %1] + lea r2, [r2 + 4 * r3 - 2 * %1] - movd m1, [r0 + r1] - punpcklwd m3, m1 ;m3=[4 5] + dec dword [rsp] + jnz .loopH - movd m4, [r0 + 2 * r1] - punpcklwd m1, m4 ;m1=[5 6] - punpcklqdq m3, m1 ;m2=[4 5 5 6] - pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4 - paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 + RET %endmacro -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W2_4R 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6 + FILTER_VER_LUMA_SS 4, 4 + FILTER_VER_LUMA_SS 8, 8 + FILTER_VER_LUMA_SS 8, 4 + FILTER_VER_LUMA_SS 4, 8 + FILTER_VER_LUMA_SS 16, 16 + FILTER_VER_LUMA_SS 16, 8 + FILTER_VER_LUMA_SS 8, 16 + FILTER_VER_LUMA_SS 16, 12 + FILTER_VER_LUMA_SS 12, 16 + FILTER_VER_LUMA_SS 16, 4 + FILTER_VER_LUMA_SS 4, 16 + FILTER_VER_LUMA_SS 32, 32 + FILTER_VER_LUMA_SS 32, 16 + FILTER_VER_LUMA_SS 16, 32 + FILTER_VER_LUMA_SS 32, 24 + FILTER_VER_LUMA_SS 24, 32 + FILTER_VER_LUMA_SS 32, 8 + FILTER_VER_LUMA_SS 8, 32 + FILTER_VER_LUMA_SS 64, 64 + FILTER_VER_LUMA_SS 64, 32 + FILTER_VER_LUMA_SS 32, 64 + FILTER_VER_LUMA_SS 64, 48 + FILTER_VER_LUMA_SS 48, 64 + 
FILTER_VER_LUMA_SS 64, 16 + FILTER_VER_LUMA_SS 16, 64 - add r1d, r1d - sub r0, r1 - shl r4d, 5 +%macro FILTER_VER_LUMA_AVX2_4x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x4, 4, 6, 7 + mov r4d, r4m + add r1d, r1d + shl r4d, 7 %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] + lea r5, [pw_LumaCoeffVer + r4] %endif - mova m5, [tab_c_526336] + lea r4, [r1 * 3] + sub r0, r4 - mov r4d, (%2/4) +%ifidn %1,sp + mova m6, [pd_526336] +%else + add r3d, r3d +%endif -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + pmaddwd m4, [r5 + 1 * mmsize] + paddd m0, m5 + paddd m2, m4 + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + pmaddwd m1, [r5 + 2 * mmsize] + paddd m0, m5 + paddd m2, m1 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + 2 * r1] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [A 9 9 8] + pmaddwd m4, [r5 + 3 * mmsize] + paddd m2, m4 + +%ifidn %1,sp + paddd m0, m6 + paddd m2, m6 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + vextracti128 xm2, m0, 1 + lea r4, [r3 * 3] - paddd m0, m5 - paddd m2, m5 +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r4], xm0, 3 
+%else + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r4], xm2 +%endif + RET +%endmacro - psrad m0, 12 - psrad m2, 12 +FILTER_VER_LUMA_AVX2_4x4 sp +FILTER_VER_LUMA_AVX2_4x4 ss - packssdw m0, m2 - packuswb m0, m0 +%macro FILTER_VER_LUMA_AVX2_4x8 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x8, 4, 7, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d - pextrw [r2], m0, 0 - pextrw [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrw [r2], m0, 2 - pextrw [r2 + r3], m0, 3 +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [pw_LumaCoeffVer + r4] +%endif - lea r2, [r2 + 2 * r3] + lea r4, [r1 * 3] + sub r0, r4 - dec r4d - jnz .loopH +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [A 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m6, [r5 + 
1 * mmsize] + paddd m1, m6 + +%ifidn %1,sp + paddd m0, m7 + paddd m2, m7 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + + movq xm3, [r0 + r4] + punpcklwd xm5, xm3 + lea r0, [r0 + 4 * r1] + movq xm2, [r0] + punpcklwd xm3, xm2 + vinserti128 m5, m5, xm3, 1 ; m5 = [C B B A] + pmaddwd m3, m5, [r5 + 3 * mmsize] + paddd m4, m3 + pmaddwd m5, [r5 + 2 * mmsize] + paddd m1, m5 + movq xm3, [r0 + r1] + punpcklwd xm2, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m2, m2, xm3, 1 ; m2 = [E D D C] + pmaddwd m2, [r5 + 3 * mmsize] + paddd m1, m2 + +%ifidn %1,sp + paddd m4, m7 + paddd m1, m7 + psrad m4, 12 + psrad m1, 12 +%else + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m4, m1 +%ifidn %1,sp + packuswb m0, m4 + vextracti128 xm2, m0, 1 + movd [r2], xm0 + movd [r2 + r3], xm2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm0, 2 + pextrd [r2 + r3], xm2, 2 + pextrd [r2 + r3 * 2], xm0, 3 + pextrd [r2 + r6], xm2, 3 +%else + vextracti128 xm2, m0, 1 + vextracti128 xm1, m4, 1 + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 + lea r2, [r2 + r3 * 4] + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 +%endif RET %endmacro -FILTER_VER_CHROMA_SP_W2_4R 2, 4 -FILTER_VER_CHROMA_SP_W2_4R 2, 8 +FILTER_VER_LUMA_AVX2_4x8 sp +FILTER_VER_LUMA_AVX2_4x8 ss -FILTER_VER_CHROMA_SP_W2_4R 2, 16 +%macro PROCESS_LUMA_AVX2_W4_16R 1 + movq xm0, [r0] + movq xm1, [r0 + r1] + punpcklwd xm0, xm1 + movq xm2, [r0 + r1 * 2] + punpcklwd xm1, xm2 + vinserti128 m0, m0, xm1, 1 ; m0 = [2 1 1 0] + pmaddwd m0, [r5] + movq xm3, [r0 + r4] + punpcklwd xm2, xm3 + lea r0, [r0 + 4 * r1] + movq xm4, [r0] + punpcklwd xm3, xm4 + vinserti128 m2, m2, xm3, 1 ; m2 = [4 3 3 2] + pmaddwd m5, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m5 + movq xm3, [r0 + r1] + punpcklwd xm4, xm3 + movq xm1, [r0 + r1 * 2] + punpcklwd xm3, xm1 + vinserti128 
m4, m4, xm3, 1 ; m4 = [6 5 5 4] + pmaddwd m5, m4, [r5 + 2 * mmsize] + paddd m0, m5 + pmaddwd m5, m4, [r5 + 1 * mmsize] + paddd m2, m5 + pmaddwd m4, [r5] + movq xm3, [r0 + r4] + punpcklwd xm1, xm3 + lea r0, [r0 + 4 * r1] + movq xm6, [r0] + punpcklwd xm3, xm6 + vinserti128 m1, m1, xm3, 1 ; m1 = [8 7 7 6] + pmaddwd m5, m1, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m5, m1, [r5 + 2 * mmsize] + paddd m2, m5 + pmaddwd m5, m1, [r5 + 1 * mmsize] + paddd m4, m5 + pmaddwd m1, [r5] + movq xm3, [r0 + r1] + punpcklwd xm6, xm3 + movq xm5, [r0 + 2 * r1] + punpcklwd xm3, xm5 + vinserti128 m6, m6, xm3, 1 ; m6 = [10 9 9 8] + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m2, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m4, m3 + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m1, m3 + pmaddwd m6, [r5] + +%ifidn %1,sp + paddd m0, m7 + paddd m2, m7 + psrad m0, 12 + psrad m2, 12 +%else + psrad m0, 6 + psrad m2, 6 +%endif + packssdw m0, m2 + vextracti128 xm2, m0, 1 +%ifidn %1,sp + packuswb xm0, xm2 + movd [r2], xm0 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm0, 1 + pextrd [r2 + r6], xm0, 3 +%else + movq [r2], xm0 + movq [r2 + r3], xm2 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm2 +%endif -;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_4x2, 5, 6, 5 + movq xm2, [r0 + r4] + punpcklwd xm5, xm2 + lea r0, [r0 + 4 * r1] + movq xm0, [r0] + punpcklwd xm2, xm0 + vinserti128 m5, m5, xm2, 1 ; m5 = [12 11 11 10] + pmaddwd m2, m5, [r5 + 3 * mmsize] + paddd m4, m2 + pmaddwd m2, m5, [r5 + 2 * mmsize] + paddd m1, m2 + pmaddwd m2, m5, [r5 + 1 * mmsize] + paddd m6, m2 + pmaddwd m5, [r5] + movq xm2, [r0 + r1] + punpcklwd xm0, xm2 + movq xm3, [r0 + 2 * r1] + punpcklwd xm2, 
xm3 + vinserti128 m0, m0, xm2, 1 ; m0 = [14 13 13 12] + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m1, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m6, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m5, m2 + pmaddwd m0, [r5] + +%ifidn %1,sp + paddd m4, m7 + paddd m1, m7 + psrad m4, 12 + psrad m1, 12 +%else + psrad m4, 6 + psrad m1, 6 +%endif + packssdw m4, m1 + vextracti128 xm1, m4, 1 + lea r2, [r2 + r3 * 4] +%ifidn %1,sp + packuswb xm4, xm1 + movd [r2], xm4 + pextrd [r2 + r3], xm4, 2 + pextrd [r2 + r3 * 2], xm4, 1 + pextrd [r2 + r6], xm4, 3 +%else + movq [r2], xm4 + movq [r2 + r3], xm1 + movhps [r2 + r3 * 2], xm4 + movhps [r2 + r6], xm1 +%endif - add r1d, r1d - sub r0, r1 - shl r4d, 5 + movq xm4, [r0 + r4] + punpcklwd xm3, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m3, m3, xm4, 1 ; m3 = [16 15 15 14] + pmaddwd m4, m3, [r5 + 3 * mmsize] + paddd m6, m4 + pmaddwd m4, m3, [r5 + 2 * mmsize] + paddd m5, m4 + pmaddwd m4, m3, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m3, [r5] + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [18 17 17 16] + pmaddwd m4, m1, [r5 + 3 * mmsize] + paddd m5, m4 + pmaddwd m4, m1, [r5 + 2 * mmsize] + paddd m0, m4 + pmaddwd m1, [r5 + 1 * mmsize] + paddd m3, m1 + movq xm4, [r0 + r4] + punpcklwd xm2, xm4 + lea r0, [r0 + 4 * r1] + movq xm1, [r0] + punpcklwd xm4, xm1 + vinserti128 m2, m2, xm4, 1 ; m2 = [20 19 19 18] + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m3, m2 + movq xm4, [r0 + r1] + punpcklwd xm1, xm4 + movq xm2, [r0 + 2 * r1] + punpcklwd xm4, xm2 + vinserti128 m1, m1, xm4, 1 ; m1 = [22 21 21 20] + pmaddwd m1, [r5 + 3 * mmsize] + paddd m3, m1 + +%ifidn %1,sp + paddd m6, m7 + paddd m5, m7 + paddd m0, m7 + paddd m3, m7 + psrad m6, 12 + psrad m5, 12 + psrad m0, 12 + psrad m3, 12 +%else + psrad m6, 6 + psrad m5, 6 + psrad m0, 6 + psrad m3, 6 +%endif + packssdw m6, m5 + packssdw m0, m3 + 
lea r2, [r2 + r3 * 4] + +%ifidn %1,sp + packuswb m6, m0 + vextracti128 xm0, m6, 1 + movd [r2], xm6 + movd [r2 + r3], xm0 + pextrd [r2 + r3 * 2], xm6, 1 + pextrd [r2 + r6], xm0, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm6, 2 + pextrd [r2 + r3], xm0, 2 + pextrd [r2 + r3 * 2], xm6, 3 + pextrd [r2 + r6], xm0, 3 +%else + vextracti128 xm5, m6, 1 + vextracti128 xm3, m0, 1 + movq [r2], xm6 + movq [r2 + r3], xm5 + movhps [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm5 + lea r2, [r2 + r3 * 4] + movq [r2], xm0 + movq [r2 + r3], xm3 + movhps [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm3 +%endif +%endmacro + +%macro FILTER_VER_LUMA_AVX2_4x16 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_4x16, 4, 7, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] + lea r5, [pw_LumaCoeffVer + r4] %endif - mova m4, [tab_c_526336] + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W4_16R %1 + RET +%endmacro - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 +FILTER_VER_LUMA_AVX2_4x16 sp +FILTER_VER_LUMA_AVX2_4x16 ss - lea r0, [r0 + 2 * r1] - movq m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 +%macro FILTER_VER_LUMA_S_AVX2_8x8 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_8x8, 4, 6, 12 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d - movq m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 16] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - paddd m0, m4 - psrad m0, 12 +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [pw_LumaCoeffVer + r4] +%endif - movq m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 16] - paddd m1, m3 ;m1 = [1+2+3+4] Row2 done - paddd m1, m4 - psrad m1, 12 + lea r4, [r1 * 3] + sub r0, r4 - packssdw m0, m1 - 
packuswb m0, m0 +%ifidn %1,sp + mova m11, [pd_526336] +%else + add r3d, r3d +%endif + + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + pmaddwd m2, [r5] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + pmaddwd m3, [r5] + paddd m1, m5 + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + pmaddwd m5, [r5] + paddd m3, m7 + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + pmaddwd m6, [r5] + paddd m4, m8 + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + pmaddwd m7, [r5] + paddd m5, m9 + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, 
[r5 + 2 * mmsize] + pmaddwd m8, [r5 + 1 * mmsize] + paddd m4, m10 + paddd m6, m8 + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + pmaddwd m9, [r5 + 1 * mmsize] + paddd m5, m8 + paddd m7, m9 + movu xm8, [r0 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + pmaddwd m10, [r5 + 2 * mmsize] + paddd m4, m9 + paddd m6, m10 - movd [r2], m0 - pextrd [r2 + r3], m0, 1 + lea r4, [r3 * 3] +%ifidn %1,sp + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m1, [interp8_hps_shuf] + vpermd m0, m1, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif + lea r0, [r0 + r1 * 4] + movu xm9, [r0] ; m9 = row 12 + punpckhwd xm3, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm3, 1 + pmaddwd m3, m8, [r5 + 3 * mmsize] + pmaddwd m8, [r5 + 2 * mmsize] + paddd m5, m3 + paddd m7, m8 + movu xm3, [r0 + r1] ; m3 = row 13 + punpckhwd xm0, xm9, xm3 + punpcklwd xm9, xm3 + vinserti128 m9, m9, xm0, 1 + pmaddwd m9, [r5 + 3 * mmsize] + paddd m6, m9 + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm9, xm3, xm0 + punpcklwd xm3, xm0 + vinserti128 m3, m3, xm9, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m7, m3 + +%ifidn %1,sp + paddd m4, m11 + paddd m5, m11 + paddd m6, m11 + paddd m7, m11 + psrad m4, 12 + psrad m5, 12 + psrad m6, 12 + psrad m7, 12 +%else + psrad m4, 6 + psrad 
m5, 6 + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m4, m5 + packssdw m6, m7 + lea r2, [r2 + r3 * 4] +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m1, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r4], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm5 + movu [r2 + r3 * 2], xm6 + movu [r2 + r4], xm7 +%endif RET +%endif +%endmacro -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7 +FILTER_VER_LUMA_S_AVX2_8x8 sp +FILTER_VER_LUMA_S_AVX2_8x8 ss - add r1d, r1d - sub r0, r1 - shl r4d, 5 +%macro FILTER_VER_LUMA_S_AVX2_8xN 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_8x%2, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - lea r6, [tab_ChromaCoeffV + r4] + lea r5, [pw_LumaCoeffVer + r4] %endif - mova m6, [tab_c_526336] - - mov r4d, %2/4 - + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + lea r7, [r1 * 4] + mov r8d, %2 / 16 .loopH: - PROCESS_CHROMA_SP_W4_4R - - paddd m0, m6 - paddd m1, m6 - paddd m2, m6 - paddd m3, m6 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r5, [r2 + 2 * r3] - pextrd [r5], m0, 2 - pextrd [r5 + r3], m0, 3 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 4 - - 
PROCESS_CHROMA_SP_W2_4R r6 - - paddd m0, m6 - paddd m2, m6 - - psrad m0, 12 - psrad m2, 12 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r0 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r0, [r0 + r1 * 4] + movu xm8, [r0] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r0 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, 
m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r0 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r0 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r0, [r0 + r1 * 4] + movu xm12, [r0] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] + +%ifidn %1,sp + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 +%ifidn %1,sp + packuswb m0, m2 + mova m1, [interp8_hps_shuf] + vpermd m0, m1, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif - packssdw m0, m2 - packuswb m0, m0 + movu xm13, [r0 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd 
m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r0 + r1 * 2] ; m0 = row 14 + punpckhwd xm2, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm2, 1 + pmaddwd m2, m13, [r5 + 3 * mmsize] + paddd m7, m2 + pmaddwd m2, m13, [r5 + 2 * mmsize] + paddd m9, m2 + pmaddwd m2, m13, [r5 + 1 * mmsize] + paddd m11, m2 + pmaddwd m13, [r5] + +%ifidn %1,sp + paddd m6, m14 + paddd m7, m14 + psrad m6, 12 + psrad m7, 12 +%else + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m6, m7 + lea r2, [r2 + r3 * 4] - pextrw [r2], m0, 0 - pextrw [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrw [r2], m0, 2 - pextrw [r2 + r3], m0, 3 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m1, m4 + vextracti128 xm6, m4, 1 + movq [r2], xm4 + movhps [r2 + r3], xm4 + movq [r2 + r3 * 2], xm6 + movhps [r2 + r6], xm6 +%else + vpermq m6, m6, 11011000b + vpermq m4, m4, 11011000b + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r2], xm4 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm6 + movu [r2 + r6], xm7 +%endif - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 4] + movu xm6, [r0 + r4] ; m6 = row 15 + punpckhwd xm5, xm0, xm6 + punpcklwd xm0, xm6 + vinserti128 m0, m0, xm5, 1 + pmaddwd m5, m0, [r5 + 3 * mmsize] + paddd m8, m5 + pmaddwd m5, m0, [r5 + 2 * mmsize] + paddd m10, m5 + pmaddwd m5, m0, [r5 + 1 * mmsize] + paddd m12, m5 + pmaddwd m0, [r5] + lea r0, [r0 + r1 * 4] + movu xm2, [r0] ; m2 = row 16 + punpckhwd xm3, xm6, xm2 + punpcklwd xm6, xm2 + vinserti128 m6, m6, xm3, 1 + pmaddwd m3, m6, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m3, m6, [r5 + 2 * mmsize] + paddd m11, m3 + pmaddwd m3, m6, [r5 + 1 * mmsize] + paddd m13, m3 + pmaddwd m6, [r5] + movu xm3, [r0 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, 
[r0 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m2, m3, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m3, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m6, m3 + movu xm2, [r0 + r4] ; m2 = row 19 + punpckhwd xm7, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm7, 1 + pmaddwd m7, m4, [r5 + 3 * mmsize] + paddd m12, m7 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r0, [r0 + r1 * 4] + movu xm7, [r0] ; m7 = row 20 + punpckhwd xm3, xm2, xm7 + punpcklwd xm2, xm7 + vinserti128 m2, m2, xm3, 1 + pmaddwd m3, m2, [r5 + 3 * mmsize] + paddd m13, m3 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m6, m2 + movu xm3, [r0 + r1] ; m3 = row 21 + punpckhwd xm2, xm7, xm3 + punpcklwd xm7, xm3 + vinserti128 m7, m7, xm2, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m0, m7 + movu xm2, [r0 + r1 * 2] ; m2 = row 22 + punpckhwd xm7, xm3, xm2 + punpcklwd xm3, xm2 + vinserti128 m3, m3, xm7, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m6, m3 + +%ifidn %1,sp + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m6, m14 + psrad m8, 12 + psrad m9, 12 + psrad m10, 12 + psrad m11, 12 + psrad m12, 12 + psrad m13, 12 + psrad m0, 12 + psrad m6, 12 +%else + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m6, 6 +%endif + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m6 + lea r2, [r2 + r3 * 4] - dec r4d - jnz .loopH +%ifidn %1,sp + packuswb m8, m10 + packuswb m12, m0 + vpermd m8, m1, m8 + vpermd m12, m1, m12 + vextracti128 xm10, m8, 1 + vextracti128 xm0, m12, 1 + movq [r2], xm8 + movhps [r2 + r3], xm8 + movq [r2 + r3 * 2], xm10 + movhps [r2 + r6], xm10 + lea r2, [r2 + r3 * 4] + movq [r2], xm12 + movhps [r2 + r3], xm12 + movq [r2 + r3 * 2], xm0 + movhps [r2 + r6], xm0 +%else + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + 
vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm6, m0, 1 + movu [r2], xm8 + movu [r2 + r3], xm9 + movu [r2 + r3 * 2], xm10 + movu [r2 + r6], xm11 + lea r2, [r2 + r3 * 4] + movu [r2], xm12 + movu [r2 + r3], xm13 + movu [r2 + r3 * 2], xm0 + movu [r2 + r6], xm6 +%endif + lea r2, [r2 + r3 * 4] + sub r0, r7 + dec r8d + jnz .loopH RET +%endif %endmacro -FILTER_VER_CHROMA_SP_W6_H4 6, 8 - -FILTER_VER_CHROMA_SP_W6_H4 6, 16 - -%macro PROCESS_CHROMA_SP_W8_2R 0 - movu m1, [r0] - movu m3, [r0 + r1] - punpcklwd m0, m1, m3 - pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l - punpckhwd m1, m3 - pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h - - movu m4, [r0 + 2 * r1] - punpcklwd m2, m3, m4 - pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l - punpckhwd m3, m4 - pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h - - lea r0, [r0 + 2 * r1] - movu m5, [r0 + r1] - punpcklwd m6, m4, m5 - pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l - paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum - punpckhwd m4, m5 - pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h - paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum +FILTER_VER_LUMA_S_AVX2_8xN sp, 16 +FILTER_VER_LUMA_S_AVX2_8xN sp, 32 +FILTER_VER_LUMA_S_AVX2_8xN ss, 16 +FILTER_VER_LUMA_S_AVX2_8xN ss, 32 - movu m4, [r0 + 2 * r1] - punpcklwd m6, m5, m4 - pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l - paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum - punpckhwd m5, m4 - pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h - paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum +%macro PROCESS_LUMA_S_AVX2_W8_4R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, 
[r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r0, [r0 + r1 * 4] + movu xm4, [r0] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r0 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m4, [r5 + 1 * mmsize] + paddd m2, m4 + movu xm6, [r0 + r1 * 2] ; m6 = row 6 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m4, m5, [r5 + 2 * mmsize] + paddd m1, m4 + pmaddwd m5, [r5 + 1 * mmsize] + paddd m3, m5 + movu xm4, [r0 + r4] ; m4 = row 7 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m5, m6, [r5 + 3 * mmsize] + paddd m0, m5 + pmaddwd m6, [r5 + 2 * mmsize] + paddd m2, m6 + lea r0, [r0 + r1 * 4] + movu xm5, [r0] ; m5 = row 8 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m1, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m3, m4 + movu xm6, [r0 + r1] ; m6 = row 9 + punpckhwd xm4, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm4, 1 + pmaddwd m5, [r5 + 3 * mmsize] + paddd m2, m5 + movu xm4, [r0 + r1 * 2] ; m4 = row 10 + punpckhwd xm5, xm6, xm4 + punpcklwd xm6, xm4 + vinserti128 m6, m6, xm5, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m3, m6 + +%ifidn %1,sp + paddd m0, m7 + paddd m1, m7 + paddd m2, m7 + paddd m3, m7 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 +%ifidn %1,sp + packuswb m0, m2 + mova m4, [interp8_hps_shuf] + vpermd m0, m4, m0 + vextracti128 xm2, m0, 1 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 +%endif %endmacro 
-;-------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx) -;-------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SP_W8_H2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8 - - add r1d, r1d - sub r0, r1 - shl r4d, 5 +%macro FILTER_VER_LUMA_S_AVX2_8x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_8x4, 4, 6, 8 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - lea r5, [tab_ChromaCoeffV + r4] + lea r5, [pw_LumaCoeffVer + r4] %endif - mova m7, [tab_c_526336] - - mov r4d, %2/2 -.loopH: - PROCESS_CHROMA_SP_W8_2R - - paddd m0, m7 - paddd m1, m7 - paddd m2, m7 - paddd m3, m7 - - psrad m0, 12 - psrad m1, 12 - psrad m2, 12 - psrad m3, 12 - - packssdw m0, m1 - packssdw m2, m3 - - packuswb m0, m2 - - movlps [r2], m0 - movhps [r2 + r3], m0 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + PROCESS_LUMA_S_AVX2_W8_4R %1 + lea r4, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r4], xm2 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r4], xm3 +%endif RET %endmacro -FILTER_VER_CHROMA_SP_W8_H2 8, 2 -FILTER_VER_CHROMA_SP_W8_H2 8, 4 -FILTER_VER_CHROMA_SP_W8_H2 8, 6 -FILTER_VER_CHROMA_SP_W8_H2 8, 8 -FILTER_VER_CHROMA_SP_W8_H2 8, 16 -FILTER_VER_CHROMA_SP_W8_H2 8, 32 - -FILTER_VER_CHROMA_SP_W8_H2 8, 12 -FILTER_VER_CHROMA_SP_W8_H2 8, 64 - +FILTER_VER_LUMA_S_AVX2_8x4 sp +FILTER_VER_LUMA_S_AVX2_8x4 ss -;----------------------------------------------------------------------------------------------------------------------------- -; void 
interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA_2xN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride -%define coef2 m3 -%define Tm0 m2 -%define t1 m1 -%define t0 m0 +%macro PROCESS_LUMA_AVX2_W8_16R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 
+ vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd m11, [r5] + +%ifidn %1,sp + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 +%ifidn %1,sp + packuswb m0, m2 + mova m5, [interp8_hps_shuf] + vpermd m0, m5, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 
11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif - dec srcq - mov r4d, r4m - add dststrided, dststrided + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m0, m12, [r5 + 1 * mmsize] + paddd m10, m0 + pmaddwd m12, [r5] + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m1, m13, [r5 + 1 * mmsize] + paddd m11, m1 + pmaddwd m13, [r5] + +%ifidn %1,sp + paddd m6, m14 + paddd m7, m14 + psrad m6, 12 + psrad m7, 12 +%else + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m6, m7 + lea r8, [r2 + r3 * 4] -%ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m5, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 %endif - pshufd coef2, coef2, 0 - mova t1, [pw_2000] - mova Tm0, [tab_Tm] + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m2, m0, [r5 + 2 * mmsize] + paddd m10, m2 + pmaddwd m2, m0, [r5 + 1 * mmsize] + paddd m12, m2 + pmaddwd m0, [r5] + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] 
+ paddd m9, m3 + pmaddwd m3, m1, [r5 + 2 * mmsize] + paddd m11, m3 + pmaddwd m3, m1, [r5 + 1 * mmsize] + paddd m13, m3 + pmaddwd m1, [r5] + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 3 * mmsize] + paddd m10, m4 + pmaddwd m4, m2, [r5 + 2 * mmsize] + paddd m12, m4 + pmaddwd m2, [r5 + 1 * mmsize] + paddd m0, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m2, m3, [r5 + 3 * mmsize] + paddd m11, m2 + pmaddwd m2, m3, [r5 + 2 * mmsize] + paddd m13, m2 + pmaddwd m3, [r5 + 1 * mmsize] + paddd m1, m3 + movu xm2, [r7 + r4] ; m2 = row 19 + punpckhwd xm6, xm4, xm2 + punpcklwd xm4, xm2 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 3 * mmsize] + paddd m12, m6 + pmaddwd m4, [r5 + 2 * mmsize] + paddd m0, m4 + lea r7, [r7 + r1 * 4] + movu xm6, [r7] ; m6 = row 20 + punpckhwd xm7, xm2, xm6 + punpcklwd xm2, xm6 + vinserti128 m2, m2, xm7, 1 + pmaddwd m7, m2, [r5 + 3 * mmsize] + paddd m13, m7 + pmaddwd m2, [r5 + 2 * mmsize] + paddd m1, m2 + movu xm7, [r7 + r1] ; m7 = row 21 + punpckhwd xm2, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm2, 1 + pmaddwd m6, [r5 + 3 * mmsize] + paddd m0, m6 + movu xm2, [r7 + r1 * 2] ; m2 = row 22 + punpckhwd xm3, xm7, xm2 + punpcklwd xm7, xm2 + vinserti128 m7, m7, xm3, 1 + pmaddwd m7, [r5 + 3 * mmsize] + paddd m1, m7 - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 +%ifidn %1,sp + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + paddd m12, m14 + paddd m13, m14 + paddd m0, m14 + paddd m1, m14 + psrad m8, 12 + psrad m9, 12 + psrad m10, 12 + psrad m11, 12 + psrad m12, 12 + psrad m13, 12 + psrad m0, 12 + psrad m1, 12 +%else + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 + psrad m12, 6 + psrad m13, 6 + psrad m0, 6 + psrad m1, 6 +%endif + packssdw m8, m9 + packssdw m10, m11 + packssdw m12, m13 + packssdw m0, m1 + lea r8, 
[r8 + r3 * 4] -.loopH: - movh t0, [srcq] - pshufb t0, t0, Tm0 - pmaddubsw t0, coef2 - phaddw t0, t0 - psubw t0, t1 - movd [dstq], t0 +%ifidn %1,sp + packuswb m8, m10 + packuswb m12, m0 + vpermd m8, m5, m8 + vpermd m12, m5, m12 + vextracti128 xm10, m8, 1 + vextracti128 xm0, m12, 1 + movq [r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm10 + movhps [r8 + r6], xm10 + lea r8, [r8 + r3 * 4] + movq [r8], xm12 + movhps [r8 + r3], xm12 + movq [r8 + r3 * 2], xm0 + movhps [r8 + r6], xm0 +%else + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vpermq m12, m12, 11011000b + vpermq m0, m0, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + vextracti128 xm13, m12, 1 + vextracti128 xm1, m0, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + lea r8, [r8 + r3 * 4] + movu [r8], xm12 + movu [r8 + r3], xm13 + movu [r8 + r3 * 2], xm0 + movu [r8 + r6], xm1 +%endif +%endmacro - lea srcq, [srcq + srcstrideq] - lea dstq, [dstq + dststrideq] +%macro FILTER_VER_LUMA_AVX2_Nx16 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_%2x16, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d - dec r4d - jnz .loopH +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [pw_LumaCoeffVer + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro -FILTER_HORIZ_CHROMA_2xN 2, 4 -FILTER_HORIZ_CHROMA_2xN 2, 8 - -FILTER_HORIZ_CHROMA_2xN 2, 16 - -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
-;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA_4xN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride -%define coef2 m3 -%define Tm0 m2 -%define t1 m1 -%define t0 m0 - - dec srcq - mov r4d, r4m - add dststrided, dststrided +FILTER_VER_LUMA_AVX2_Nx16 sp, 16 +FILTER_VER_LUMA_AVX2_Nx16 sp, 32 +FILTER_VER_LUMA_AVX2_Nx16 sp, 64 +FILTER_VER_LUMA_AVX2_Nx16 ss, 16 +FILTER_VER_LUMA_AVX2_Nx16 ss, 32 +FILTER_VER_LUMA_AVX2_Nx16 ss, 64 + +%macro FILTER_VER_LUMA_AVX2_NxN 3 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%3_%1x%2, 4, 12, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - pshufd coef2, coef2, 0 - mova t1, [pw_2000] - mova Tm0, [tab_Tm] + lea r4, [r1 * 3] + sub r0, r4 - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 +%ifidn %3,sp + mova m14, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + lea r11, [r1 * 4] + mov r9d, %2 / 16 .loopH: - movh t0, [srcq] - pshufb t0, t0, Tm0 - pmaddubsw t0, coef2 - phaddw t0, t0 - psubw t0, t1 - movlps [dstq], t0 - - lea srcq, [srcq + srcstrideq] - lea dstq, [dstq + dststrideq] - - dec r4d - jnz .loopH + mov r10d, %1 / 8 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %3 +%ifidn %3,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r10d + jnz .loopW + sub r7, r11 + lea r0, [r7 - 2 * %1 + 16] +%ifidn %3,sp + lea r2, [r8 + r3 * 4 - %1 + 8] +%else + lea r2, [r8 + r3 * 4 - 2 * %1 + 16] +%endif + dec r9d + jnz .loopH RET +%endif %endmacro -FILTER_HORIZ_CHROMA_4xN 4, 2 -FILTER_HORIZ_CHROMA_4xN 4, 4 -FILTER_HORIZ_CHROMA_4xN 4, 8 -FILTER_HORIZ_CHROMA_4xN 4, 16 +FILTER_VER_LUMA_AVX2_NxN 16, 32, sp +FILTER_VER_LUMA_AVX2_NxN 16, 64, sp 
+FILTER_VER_LUMA_AVX2_NxN 24, 32, sp +FILTER_VER_LUMA_AVX2_NxN 32, 32, sp +FILTER_VER_LUMA_AVX2_NxN 32, 64, sp +FILTER_VER_LUMA_AVX2_NxN 48, 64, sp +FILTER_VER_LUMA_AVX2_NxN 64, 32, sp +FILTER_VER_LUMA_AVX2_NxN 64, 48, sp +FILTER_VER_LUMA_AVX2_NxN 64, 64, sp +FILTER_VER_LUMA_AVX2_NxN 16, 32, ss +FILTER_VER_LUMA_AVX2_NxN 16, 64, ss +FILTER_VER_LUMA_AVX2_NxN 24, 32, ss +FILTER_VER_LUMA_AVX2_NxN 32, 32, ss +FILTER_VER_LUMA_AVX2_NxN 32, 64, ss +FILTER_VER_LUMA_AVX2_NxN 48, 64, ss +FILTER_VER_LUMA_AVX2_NxN 64, 32, ss +FILTER_VER_LUMA_AVX2_NxN 64, 48, ss +FILTER_VER_LUMA_AVX2_NxN 64, 64, ss -FILTER_HORIZ_CHROMA_4xN 4, 32 +%macro FILTER_VER_LUMA_S_AVX2_12x16 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_12x16, 4, 9, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d -%macro PROCESS_CHROMA_W6 3 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movh [dstq], %2 - pshufd %2, %2, 2 - movd [dstq + 8], %2 -%endmacro +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [pw_LumaCoeffVer + r4] +%endif -%macro PROCESS_CHROMA_W12 3 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movu [dstq], %2 - movu %1, [srcq + 8] - pshufb %1, %1, Tm0 - pmaddubsw %1, coef2 - phaddw %1, %1 - psubw %1, %3 - movh [dstq + 16], %1 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + PROCESS_LUMA_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + mova m7, m14 + PROCESS_LUMA_AVX2_W4_16R %1 + RET +%endif %endmacro -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) 
-;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride -%define coef2 m5 -%define Tm0 m4 -%define Tm1 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 +FILTER_VER_LUMA_S_AVX2_12x16 sp +FILTER_VER_LUMA_S_AVX2_12x16 ss - dec srcq - mov r4d, r4m - add dststrided, dststrided +%macro FILTER_VER_LUMA_S_AVX2_16x12 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_16x12, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - pshufd coef2, coef2, 0 - mova t2, [pw_2000] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, 2 +.loopW: + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = 
row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m10, m8, [r5 + 1 * mmsize] + paddd m6, m10 + pmaddwd m8, [r5] + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm11, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm11, 1 + pmaddwd m11, m9, [r5 + 3 * mmsize] + paddd m3, m11 + pmaddwd m11, m9, [r5 + 2 * mmsize] + paddd m5, m11 + pmaddwd m11, m9, [r5 + 1 * mmsize] + paddd m7, m11 + pmaddwd m9, [r5] + movu xm11, [r7 + r4] ; m11 = row 11 + punpckhwd xm12, xm10, xm11 + punpcklwd xm10, xm11 + vinserti128 m10, m10, xm12, 1 + pmaddwd m12, m10, [r5 + 3 * mmsize] + paddd m4, m12 + pmaddwd m12, m10, [r5 + 2 * mmsize] + paddd m6, m12 + pmaddwd m12, m10, [r5 + 1 * mmsize] + paddd m8, m12 + pmaddwd m10, [r5] + lea r7, [r7 + r1 * 4] + movu xm12, [r7] ; m12 = row 12 + punpckhwd xm13, xm11, xm12 + punpcklwd xm11, xm12 + vinserti128 m11, m11, xm13, 1 + pmaddwd m13, m11, [r5 + 3 * mmsize] + paddd m5, m13 + pmaddwd m13, m11, [r5 + 2 * mmsize] + paddd m7, m13 + pmaddwd m13, m11, [r5 + 1 * mmsize] + paddd m9, m13 + pmaddwd 
m11, [r5] + +%ifidn %1,sp + paddd m0, m14 + paddd m1, m14 + paddd m2, m14 + paddd m3, m14 + paddd m4, m14 + paddd m5, m14 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + +%ifidn %1,sp + packuswb m0, m2 + mova m5, [interp8_hps_shuf] + vpermd m0, m5, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 + movu xm13, [r7 + r1] ; m13 = row 13 + punpckhwd xm0, xm12, xm13 + punpcklwd xm12, xm13 + vinserti128 m12, m12, xm0, 1 + pmaddwd m0, m12, [r5 + 3 * mmsize] + paddd m6, m0 + pmaddwd m0, m12, [r5 + 2 * mmsize] + paddd m8, m0 + pmaddwd m12, [r5 + 1 * mmsize] + paddd m10, m12 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm13, xm0 + punpcklwd xm13, xm0 + vinserti128 m13, m13, xm1, 1 + pmaddwd m1, m13, [r5 + 3 * mmsize] + paddd m7, m1 + pmaddwd m1, m13, [r5 + 2 * mmsize] + paddd m9, m1 + pmaddwd m13, [r5 + 1 * mmsize] + paddd m11, m13 + +%ifidn %1,sp + paddd m6, m14 + paddd m7, m14 + psrad m6, 12 + psrad m7, 12 +%else + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m6, m7 + lea r8, [r2 + r3 * 4] -.loopH: - PROCESS_CHROMA_W%1 t0, t1, t2 - add srcq, srcstrideq - add dstq, dststrideq +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m5, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm1, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm1 + movu [r8 + r3 * 2], xm6 
+ movu [r8 + r6], xm7 +%endif - dec r4d - jnz .loopH + movu xm1, [r7 + r4] ; m1 = row 15 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m2, m0, [r5 + 3 * mmsize] + paddd m8, m2 + pmaddwd m0, [r5 + 2 * mmsize] + paddd m10, m0 + lea r7, [r7 + r1 * 4] + movu xm2, [r7] ; m2 = row 16 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m3, m1, [r5 + 3 * mmsize] + paddd m9, m3 + pmaddwd m1, [r5 + 2 * mmsize] + paddd m11, m1 + movu xm3, [r7 + r1] ; m3 = row 17 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m2, [r5 + 3 * mmsize] + paddd m10, m2 + movu xm4, [r7 + r1 * 2] ; m4 = row 18 + punpckhwd xm2, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm2, 1 + pmaddwd m3, [r5 + 3 * mmsize] + paddd m11, m3 + +%ifidn %1,sp + paddd m8, m14 + paddd m9, m14 + paddd m10, m14 + paddd m11, m14 + psrad m8, 12 + psrad m9, 12 + psrad m10, 12 + psrad m11, 12 +%else + psrad m8, 6 + psrad m9, 6 + psrad m10, 6 + psrad m11, 6 +%endif + packssdw m8, m9 + packssdw m10, m11 + lea r8, [r8 + r3 * 4] + +%ifidn %1,sp + packuswb m8, m10 + vpermd m8, m5, m8 + vextracti128 xm10, m8, 1 + movq [r8], xm8 + movhps [r8 + r3], xm8 + movq [r8 + r3 * 2], xm10 + movhps [r8 + r6], xm10 + add r2, 8 +%else + vpermq m8, m8, 11011000b + vpermq m10, m10, 11011000b + vextracti128 xm9, m8, 1 + vextracti128 xm11, m10, 1 + movu [r8], xm8 + movu [r8 + r3], xm9 + movu [r8 + r3 * 2], xm10 + movu [r8 + r6], xm11 + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW + RET +%endif +%endmacro + +FILTER_VER_LUMA_S_AVX2_16x12 sp +FILTER_VER_LUMA_S_AVX2_16x12 ss + +%macro FILTER_VER_LUMA_S_AVX2_16x4 1 +INIT_YMM avx2 +cglobal interp_8tap_vert_%1_16x4, 4, 7, 8, 0 - gprsize + mov r4d, r4m + shl r4d, 7 + add r1d, r1d + +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [pw_LumaCoeffVer + r4] +%endif + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m7, [pd_526336] +%else + add r3d, r3d +%endif + 
mov dword [rsp], 2 +.loopW: + PROCESS_LUMA_S_AVX2_W8_4R %1 + lea r6, [r3 * 3] +%ifidn %1,sp + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 + add r2, 8 +%else + movu [r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 + add r2, 16 +%endif + lea r6, [8 * r1 - 16] + sub r0, r6 + dec dword [rsp] + jnz .loopW RET %endmacro -FILTER_HORIZ_CHROMA 6, 8 -FILTER_HORIZ_CHROMA 12, 16 +FILTER_VER_LUMA_S_AVX2_16x4 sp +FILTER_VER_LUMA_S_AVX2_16x4 ss -FILTER_HORIZ_CHROMA 6, 16 -FILTER_HORIZ_CHROMA 12, 32 +%macro PROCESS_LUMA_S_AVX2_W8_8R 1 + movu xm0, [r0] ; m0 = row 0 + movu xm1, [r0 + r1] ; m1 = row 1 + punpckhwd xm2, xm0, xm1 + punpcklwd xm0, xm1 + vinserti128 m0, m0, xm2, 1 + pmaddwd m0, [r5] + movu xm2, [r0 + r1 * 2] ; m2 = row 2 + punpckhwd xm3, xm1, xm2 + punpcklwd xm1, xm2 + vinserti128 m1, m1, xm3, 1 + pmaddwd m1, [r5] + movu xm3, [r0 + r4] ; m3 = row 3 + punpckhwd xm4, xm2, xm3 + punpcklwd xm2, xm3 + vinserti128 m2, m2, xm4, 1 + pmaddwd m4, m2, [r5 + 1 * mmsize] + paddd m0, m4 + pmaddwd m2, [r5] + lea r7, [r0 + r1 * 4] + movu xm4, [r7] ; m4 = row 4 + punpckhwd xm5, xm3, xm4 + punpcklwd xm3, xm4 + vinserti128 m3, m3, xm5, 1 + pmaddwd m5, m3, [r5 + 1 * mmsize] + paddd m1, m5 + pmaddwd m3, [r5] + movu xm5, [r7 + r1] ; m5 = row 5 + punpckhwd xm6, xm4, xm5 + punpcklwd xm4, xm5 + vinserti128 m4, m4, xm6, 1 + pmaddwd m6, m4, [r5 + 2 * mmsize] + paddd m0, m6 + pmaddwd m6, m4, [r5 + 1 * mmsize] + paddd m2, m6 + pmaddwd m4, [r5] + movu xm6, [r7 + r1 * 2] ; m6 = row 6 + punpckhwd xm7, xm5, xm6 + punpcklwd xm5, xm6 + vinserti128 m5, m5, xm7, 1 + pmaddwd m7, m5, [r5 + 2 * mmsize] + paddd m1, m7 + pmaddwd m7, m5, [r5 + 1 * mmsize] + paddd m3, m7 + pmaddwd m5, [r5] + movu xm7, [r7 + r4] ; m7 = row 7 + punpckhwd xm8, xm6, xm7 + punpcklwd xm6, xm7 + vinserti128 m6, m6, xm8, 1 + pmaddwd m8, m6, [r5 + 3 * mmsize] + paddd m0, m8 + pmaddwd m8, m6, [r5 + 2 * mmsize] + paddd m2, m8 + pmaddwd m8, m6, [r5 + 1 * mmsize] + 
paddd m4, m8 + pmaddwd m6, [r5] + lea r7, [r7 + r1 * 4] + movu xm8, [r7] ; m8 = row 8 + punpckhwd xm9, xm7, xm8 + punpcklwd xm7, xm8 + vinserti128 m7, m7, xm9, 1 + pmaddwd m9, m7, [r5 + 3 * mmsize] + paddd m1, m9 + pmaddwd m9, m7, [r5 + 2 * mmsize] + paddd m3, m9 + pmaddwd m9, m7, [r5 + 1 * mmsize] + paddd m5, m9 + pmaddwd m7, [r5] + movu xm9, [r7 + r1] ; m9 = row 9 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m2, m10 + pmaddwd m10, m8, [r5 + 2 * mmsize] + paddd m4, m10 + pmaddwd m8, [r5 + 1 * mmsize] + paddd m6, m8 + movu xm10, [r7 + r1 * 2] ; m10 = row 10 + punpckhwd xm8, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm8, 1 + pmaddwd m8, m9, [r5 + 3 * mmsize] + paddd m3, m8 + pmaddwd m8, m9, [r5 + 2 * mmsize] + paddd m5, m8 + pmaddwd m9, [r5 + 1 * mmsize] + paddd m7, m9 + movu xm8, [r7 + r4] ; m8 = row 11 + punpckhwd xm9, xm10, xm8 + punpcklwd xm10, xm8 + vinserti128 m10, m10, xm9, 1 + pmaddwd m9, m10, [r5 + 3 * mmsize] + paddd m4, m9 + pmaddwd m10, [r5 + 2 * mmsize] + paddd m6, m10 + lea r7, [r7 + r1 * 4] + movu xm9, [r7] ; m9 = row 12 + punpckhwd xm10, xm8, xm9 + punpcklwd xm8, xm9 + vinserti128 m8, m8, xm10, 1 + pmaddwd m10, m8, [r5 + 3 * mmsize] + paddd m5, m10 + pmaddwd m8, [r5 + 2 * mmsize] + paddd m7, m8 + +%ifidn %1,sp + paddd m0, m11 + paddd m1, m11 + paddd m2, m11 + paddd m3, m11 + paddd m4, m11 + paddd m5, m11 + psrad m0, 12 + psrad m1, 12 + psrad m2, 12 + psrad m3, 12 + psrad m4, 12 + psrad m5, 12 +%else + psrad m0, 6 + psrad m1, 6 + psrad m2, 6 + psrad m3, 6 + psrad m4, 6 + psrad m5, 6 +%endif + packssdw m0, m1 + packssdw m2, m3 + packssdw m4, m5 + +%ifidn %1,sp + packuswb m0, m2 + mova m5, [interp8_hps_shuf] + vpermd m0, m5, m0 + vextracti128 xm2, m0, 1 + movq [r2], xm0 + movhps [r2 + r3], xm0 + movq [r2 + r3 * 2], xm2 + movhps [r2 + r6], xm2 +%else + vpermq m0, m0, 11011000b + vpermq m2, m2, 11011000b + vextracti128 xm1, m0, 1 + vextracti128 xm3, m2, 1 + movu 
[r2], xm0 + movu [r2 + r3], xm1 + movu [r2 + r3 * 2], xm2 + movu [r2 + r6], xm3 +%endif -%macro PROCESS_CHROMA_W8 3 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movu [dstq], %2 -%endmacro + movu xm10, [r7 + r1] ; m10 = row 13 + punpckhwd xm0, xm9, xm10 + punpcklwd xm9, xm10 + vinserti128 m9, m9, xm0, 1 + pmaddwd m9, [r5 + 3 * mmsize] + paddd m6, m9 + movu xm0, [r7 + r1 * 2] ; m0 = row 14 + punpckhwd xm1, xm10, xm0 + punpcklwd xm10, xm0 + vinserti128 m10, m10, xm1, 1 + pmaddwd m10, [r5 + 3 * mmsize] + paddd m7, m10 + +%ifidn %1,sp + paddd m6, m11 + paddd m7, m11 + psrad m6, 12 + psrad m7, 12 +%else + psrad m6, 6 + psrad m7, 6 +%endif + packssdw m6, m7 + lea r8, [r2 + r3 * 4] -;----------------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;----------------------------------------------------------------------------------------------------------------------------- -%macro FILTER_HORIZ_CHROMA_8xN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride -%define coef2 m5 -%define Tm0 m4 -%define Tm1 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 +%ifidn %1,sp + packuswb m4, m6 + vpermd m4, m5, m4 + vextracti128 xm6, m4, 1 + movq [r8], xm4 + movhps [r8 + r3], xm4 + movq [r8 + r3 * 2], xm6 + movhps [r8 + r6], xm6 +%else + vpermq m4, m4, 11011000b + vpermq m6, m6, 11011000b + vextracti128 xm5, m4, 1 + vextracti128 xm7, m6, 1 + movu [r8], xm4 + movu [r8 + r3], xm5 + movu [r8 + r3 * 2], xm6 + movu [r8 + r6], xm7 +%endif +%endmacro - dec srcq - mov r4d, r4m - add dststrided, dststrided +%macro FILTER_VER_LUMA_AVX2_Nx8 2 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_%2x8, 4, 10, 12 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d %ifdef PIC - lea 
r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] + lea r5, [pw_LumaCoeffVer] + add r5, r4 %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + lea r5, [pw_LumaCoeffVer + r4] %endif - pshufd coef2, coef2, 0 - mova t2, [pw_2000] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] - - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 - -.loopH: - PROCESS_CHROMA_W8 t0, t1, t2 - add srcq, srcstrideq - add dstq, dststrideq - - dec r4d - jnz .loopH - + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m11, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, %2 / 8 +.loopW: + PROCESS_LUMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW RET +%endif %endmacro -FILTER_HORIZ_CHROMA_8xN 8, 2 -FILTER_HORIZ_CHROMA_8xN 8, 4 -FILTER_HORIZ_CHROMA_8xN 8, 6 -FILTER_HORIZ_CHROMA_8xN 8, 8 -FILTER_HORIZ_CHROMA_8xN 8, 16 -FILTER_HORIZ_CHROMA_8xN 8, 32 - -FILTER_HORIZ_CHROMA_8xN 8, 12 -FILTER_HORIZ_CHROMA_8xN 8, 64 +FILTER_VER_LUMA_AVX2_Nx8 sp, 32 +FILTER_VER_LUMA_AVX2_Nx8 sp, 16 +FILTER_VER_LUMA_AVX2_Nx8 ss, 32 +FILTER_VER_LUMA_AVX2_Nx8 ss, 16 -%macro PROCESS_CHROMA_W16 4 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq], %2 - movu [dstq + 16], %4 -%endmacro +%macro FILTER_VER_LUMA_S_AVX2_32x24 1 +INIT_YMM avx2 +%if ARCH_X86_64 == 1 +cglobal interp_8tap_vert_%1_32x24, 4, 10, 15 + mov r4d, r4m + shl r4d, 7 + add r1d, r1d -%macro PROCESS_CHROMA_W24 4 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq], %2 - movu [dstq + 16], %4 
- movu %1, [srcq + 16] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - psubw %2, %3 - movu [dstq + 32], %2 -%endmacro +%ifdef PIC + lea r5, [pw_LumaCoeffVer] + add r5, r4 +%else + lea r5, [pw_LumaCoeffVer + r4] +%endif -%macro PROCESS_CHROMA_W32 4 - movu %1, [srcq] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq], %2 - movu [dstq + 16], %4 - movu %1, [srcq + 16] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + 24] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq + 32], %2 - movu [dstq + 48], %4 + lea r4, [r1 * 3] + sub r0, r4 +%ifidn %1,sp + mova m14, [pd_526336] +%else + add r3d, r3d +%endif + lea r6, [r3 * 3] + mov r9d, 4 +.loopW: + PROCESS_LUMA_AVX2_W8_16R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loopW + lea r9, [r1 * 4] + sub r7, r9 + lea r0, [r7 - 48] +%ifidn %1,sp + lea r2, [r8 + r3 * 4 - 24] +%else + lea r2, [r8 + r3 * 4 - 48] +%endif + mova m11, m14 + mov r9d, 4 +.loop: + PROCESS_LUMA_S_AVX2_W8_8R %1 +%ifidn %1,sp + add r2, 8 +%else + add r2, 16 +%endif + add r0, 16 + dec r9d + jnz .loop + RET +%endif %endmacro -%macro PROCESS_CHROMA_W16o 5 - movu %1, [srcq + %5] - pshufb %2, %1, Tm0 - pmaddubsw %2, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %2, %1 - movu %1, [srcq + %5 + 8] - pshufb %4, %1, Tm0 - pmaddubsw %4, coef2 - pshufb %1, %1, Tm1 - pmaddubsw %1, coef2 - phaddw %4, %1 - psubw %2, %3 - psubw %4, %3 - movu [dstq + %5 * 2], %2 - movu [dstq + %5 * 2 + 16], %4 -%endmacro +FILTER_VER_LUMA_S_AVX2_32x24 sp +FILTER_VER_LUMA_S_AVX2_32x24 ss -%macro 
PROCESS_CHROMA_W48 4 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 -%endmacro +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_32x32, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d -%macro PROCESS_CHROMA_W64 4 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 0 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 16 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 32 - PROCESS_CHROMA_W16o %1, %2, %3, %4, 48 -%endmacro +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif -;------------------------------------------------------------------------------------------------------------------------------ -; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) -;------------------------------------------------------------------------------------------------------------------------------ -%macro FILTER_HORIZ_CHROMA_WxN 2 -INIT_XMM sse4 -cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride -%define coef2 m6 -%define Tm0 m5 -%define Tm1 m4 -%define t3 m3 -%define t2 m2 -%define t1 m1 -%define t0 m0 + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] - dec srcq - mov r4d, r4m - add dststrided, dststrided + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, 32 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 + +.loop + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + 
pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 24] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2 + 32], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_16x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_16x16, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d %ifdef PIC - lea r6, [tab_ChromaCoeff] - movd coef2, [r6 + r4 * 4] + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] %else - movd coef2, [tab_ChromaCoeff + r4 * 4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufd coef2, coef2, 0 - mova t2, [pw_2000] - mova Tm0, [tab_Tm] - mova Tm1, [tab_Tm + 16] + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] - mov r4d, %2 - cmp r5m, byte 0 - je .loopH - sub srcq, srcstrideq - add r4d, 3 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, 16 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 -.loopH: - PROCESS_CHROMA_W%1 t0, t1, t2, t3 - add srcq, srcstrideq - add dstq, dststrideq +.loop + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + 
vbroadcasti128 m4, [r0 + 8] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - dec r4d - jnz .loopH + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], m3 - RET -%endmacro + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET -FILTER_HORIZ_CHROMA_WxN 16, 4 -FILTER_HORIZ_CHROMA_WxN 16, 8 -FILTER_HORIZ_CHROMA_WxN 16, 12 -FILTER_HORIZ_CHROMA_WxN 16, 16 -FILTER_HORIZ_CHROMA_WxN 16, 32 -FILTER_HORIZ_CHROMA_WxN 24, 32 -FILTER_HORIZ_CHROMA_WxN 32, 8 -FILTER_HORIZ_CHROMA_WxN 32, 16 -FILTER_HORIZ_CHROMA_WxN 32, 24 -FILTER_HORIZ_CHROMA_WxN 32, 32 +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_16xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d -FILTER_HORIZ_CHROMA_WxN 16, 24 -FILTER_HORIZ_CHROMA_WxN 16, 64 -FILTER_HORIZ_CHROMA_WxN 24, 64 -FILTER_HORIZ_CHROMA_WxN 32, 48 -FILTER_HORIZ_CHROMA_WxN 32, 64 +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif -FILTER_HORIZ_CHROMA_WxN 64, 64 -FILTER_HORIZ_CHROMA_WxN 64, 32 -FILTER_HORIZ_CHROMA_WxN 64, 48 -FILTER_HORIZ_CHROMA_WxN 48, 64 -FILTER_HORIZ_CHROMA_WxN 64, 16 + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, %2 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 -;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, 
intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W16n 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8 +.loop + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + movu [r2], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET +%endmacro - mov r4d, r4m - sub r0, r1 - add r3d, r3d + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 32 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 12 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 8 + IPFILTER_CHROMA_PS_16xN_AVX2 16 , 4 + +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_32xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m1, m0, [tab_Vm] - pshufb m0, [tab_Vm + 16] - mov r4d, %2/2 + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] -.loop: + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 + mov r6d, %2 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 3 - mov r6d, %1/16 +.loop + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 8] + pshufb m4, m1 + 
pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + movu [r2], m3 + + vbroadcasti128 m3, [r0 + 16] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 24] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + movu [r2 + 32], m3 + + add r2, r3 + add r0, r1 + dec r6d + jnz .loop + RET +%endmacro -.loopW: +IPFILTER_CHROMA_PS_32xN_AVX2 32 , 16 +IPFILTER_CHROMA_PS_32xN_AVX2 32 , 24 +IPFILTER_CHROMA_PS_32xN_AVX2 32 , 8 +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_4x4, 4,7,5 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d - movu m2, [r0] - movu m3, [r0 + r1] +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif - punpcklbw m4, m2, m3 - punpckhbw m2, m3 + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m1, [tab_Tm] - pmaddubsw m4, m1 - pmaddubsw m2, m1 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - lea r5, [r0 + 2 * r1] - movu m5, [r5] - movu m7, [r5 + r1] + dec r0 + test r5d, r5d + je .label + sub r0 , r1 - punpcklbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m4, m6 +.label + ; Row 0-1 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - punpckhbw m6, m5, m7 - pmaddubsw m6, m0 - paddw m2, m6 + ; Row 2-3 + lea r0, [r0 + r1 * 2] + movu xm4, [r0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - mova m6, [pw_2000] + packssdw m3, m4 + psubw m3, [pw_2000] + 
vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 + movhps [r2 + r3], xm4 - psubw m4, m6 - psubw m2, m6 + test r5d, r5d + jz .end + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] - movu [r2], m4 - movu [r2 + 16], m2 + ;Row 5-6 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - punpcklbw m4, m3, m5 - punpckhbw m3, m5 + ; Row 7 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - pmaddubsw m4, m1 - pmaddubsw m3, m1 + packssdw m3, m4 + psubw m3, [pw_2000] + + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 +.end + RET - movu m5, [r5 + 2 * r1] +cglobal interp_4tap_horiz_ps_4x2, 4,7,5 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d - punpcklbw m2, m7, m5 - punpckhbw m7, m5 +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif - pmaddubsw m2, m0 - pmaddubsw m7, m0 + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m1, [tab_Tm] - paddw m4, m2 - paddw m3, m7 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - psubw m4, m6 - psubw m3, m6 + dec r0 + test r5d, r5d + je .label + sub r0 , r1 - movu [r2 + r3], m4 - movu [r2 + r3 + 16], m3 +.label + ; Row 0-1 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - add r0, 16 - add r2, 32 - dec r6d - jnz .loopW + packssdw m3, m3 + psubw m3, [pw_2000] + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 - lea r0, [r0 + r1 * 2 - %1] - lea r2, [r2 + r3 * 2 - %1 * 2] + test r5d, r5d + jz .end + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] - dec r4d - jnz .loop - RET -%endmacro + ;Row 2-3 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 -FILTER_V_PS_W16n 64, 
64 -FILTER_V_PS_W16n 64, 32 -FILTER_V_PS_W16n 64, 48 -FILTER_V_PS_W16n 48, 64 -FILTER_V_PS_W16n 64, 16 + ; Row 5 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, [pw_2000] -;------------------------------------------------------------------------------------------------------------ -;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------ -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_2x4, 4, 6, 7 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 +.end + RET - mov r4d, r4m - sub r0, r1 - add r3d, r3d +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +%macro IPFILTER_CHROMA_PS_4xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m0, [tab_Cm] + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m1, [tab_Tm] - lea r5, [3 * r1] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, [r0 + 2 * r1] - movd m5, [r0 + r5] + mov r4, %2 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 - 
pmaddubsw m2, m0 +.loop + sub r4d, 4 + ; Row 0-1 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - lea r0, [r0 + 4 * r1] - movd m6, [r0] + ; Row 2-3 + lea r0, [r0 + r1 * 2] + movu xm4, [r0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - punpcklbw m3, m4 - punpcklbw m1, m5, m6 - punpcklbw m3, m1 + packssdw m3, m4 + psubw m3, [pw_2000] + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 + movhps [r2 + r3], xm4 - pmaddubsw m3, m0 - phaddw m2, m3 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] - mova m1, [pw_2000] + test r4d, r4d + jnz .loop + test r5d, r5d + jz .end - psubw m2, m1 + ;Row 5-6 + movu xm3, [r0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - movd [r2], m2 - pextrd [r2 + r3], m2, 2 + ; Row 7 + lea r0, [r0 + r1 * 2] + vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - movd m2, [r0 + r1] + packssdw m3, m4 + psubw m3, [pw_2000] - punpcklbw m4, m5 - punpcklbw m3, m6, m2 - punpcklbw m4, m3 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movq [r2+r3], xm4 + lea r2, [r2 + r3 * 2] + movhps [r2], xm3 +.end +RET +%endmacro - pmaddubsw m4, m0 + IPFILTER_CHROMA_PS_4xN_AVX2 4 , 8 + IPFILTER_CHROMA_PS_4xN_AVX2 4 , 16 +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_8x8, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d - movd m3, [r0 + 2 * r1] +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, 
[tab_ChromaCoeff + r4 * 4] +%endif - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] - pmaddubsw m5, m0 - phaddw m4, m5 - psubw m4, m1 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - lea r2, [r2 + 2 * r3] - movd [r2], m4 - pextrd [r2 + r3], m4, 2 + mov r6d, 4 + dec r0 + test r5d, r5d + je .loop + sub r0 , r1 + add r6d , 1 - RET +.loop + dec r6d + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 -;------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------- -%macro FILTER_V_PS_W2 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - mov r4d, r4m - sub r0, r1 - add r3d, r3d + packssdw m3, m4 + psubw m3, m5 + + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + test r6d, r6d + jnz .loop + test r5d, r5d + je .end + + ;Row 11 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + packssdw m3, m3 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], xm3 +.end + RET +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_4x2, 4,6,4 + mov r4d, r4m %ifdef PIC - lea r5, [tab_ChromaCoeff] - movd m0, [r5 + r4 * 4] + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] %else - movd m0, [tab_ChromaCoeff + r4 * 4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - pshufb m0, [tab_Cm] + vbroadcasti128 m1, [tab_Tm] - mova m1, [pw_2000] - lea r5, [3 * r1] - mov r4d, %2/4 -.loop: - movd m2, [r0] - movd m3, [r0 + r1] - movd m4, 
[r0 + 2 * r1] - movd m5, [r0 + r5] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table - punpcklbw m2, m3 - punpcklbw m6, m4, m5 - punpcklbw m2, m6 + ; Row 0-1 + movu xm2, [r0 - 1] + vinserti128 m2, m2, [r0 + r1 - 1], 1 + pshufb m2, m1 + pmaddubsw m2, m0 + pmaddwd m2, [pw_1] + + packssdw m2, m2 + pmulhrsw m2, [pw_512] + vextracti128 xm3, m2, 1 + packuswb xm2, xm3 - pmaddubsw m2, m0 + movd [r2], xm2 + pextrd [r2+r3], xm2, 2 + RET - lea r0, [r0 + 4 * r1] - movd m6, [r0] +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_32xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4,6,7 + mov r4d, r4m - punpcklbw m3, m4 - punpcklbw m7, m5, m6 - punpcklbw m3, m7 +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif - pmaddubsw m3, m0 + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - phaddw m2, m3 - psubw m2, m1 + dec r0 + mov r4d, %2 +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 - movd [r2], m2 - pshufd m2, m2, 2 - movd [r2 + r3], m2 + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 - movd m2, [r0 + r1] + packuswb m3, m4 + vpermq m3, m3, 11011000b - punpcklbw m4, m5 - punpcklbw m3, 
m6, m2 - punpcklbw m4, m3 + movu [r2], m3 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET +%endmacro - pmaddubsw m4, m0 +IPFILTER_CHROMA_PP_32xN_AVX2 32, 16 +IPFILTER_CHROMA_PP_32xN_AVX2 32, 24 +IPFILTER_CHROMA_PP_32xN_AVX2 32, 8 - movd m3, [r0 + 2 * r1] +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_8xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 + mov r4d, r4m - punpcklbw m5, m6 - punpcklbw m2, m3 - punpcklbw m5, m2 +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif - pmaddubsw m5, m0 + movu m1, [tab_Tm] + vpbroadcastd m2, [pw_1] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - phaddw m4, m5 + sub r0, 1 + mov r4d, %2 - psubw m4, m1 +.loop: + sub r4d, 4 + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - lea r2, [r2 + 2 * r3] - movd [r2], m4 - pshufd m4 , m4 ,2 - movd [r2 + r3], m4 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, [pw_512] + lea r0, [r0 + r1 * 2] - lea r2, [r2 + 2 * r3] + ; Row 2 + vbroadcasti128 m4, [r0 ] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - dec r4d - jnz .loop + ; Row 3 + vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, [pw_512] -RET + packuswb m3, m4 + mova m5, [interp_4tap_8x8_horiz_shuf] + vpermd m3, m5, m3 + vextracti128 xm4, m3, 1 + movq 
[r2], xm3 + movhps [r2 + r3], xm3 + lea r2, [r2 + r3 * 2] + movq [r2], xm4 + movhps [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1*2] + test r4d, r4d + jnz .loop + RET %endmacro -FILTER_V_PS_W2 2, 8 - -FILTER_V_PS_W2 2, 16 - -;----------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize +IPFILTER_CHROMA_PP_8xN_AVX2 8 , 16 +IPFILTER_CHROMA_PP_8xN_AVX2 8 , 32 +IPFILTER_CHROMA_PP_8xN_AVX2 8 , 4 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_4xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6 + mov r4d, r4m %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r6, [r5 + r4] + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] %else - lea r6, [tab_ChromaCoeffV + r4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - mov dword [rsp], %2/4 - -.loopH: - mov r4d, (%1/4) -.loopW: - PROCESS_CHROMA_SP_W4_4R - - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 + vpbroadcastd m2, [pw_1] + vbroadcasti128 m1, [tab_Tm] + mov r4d, %2 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - movlps [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movlps [r5], m2 - movhps [r5 + r3], m2 + dec r0 - lea r5, [4 * r1 - 2 * 
4] - sub r0, r5 - add r2, 2 * 4 +.loop + sub r4d, 4 + ; Row 0-1 + movu xm3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m3, m3, [r0 + r1], 1 + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - dec r4d - jnz .loopW + ; Row 2-3 + lea r0, [r0 + r1 * 2] + movu xm4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + vinserti128 m4, m4, [r0 + r1], 1 + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] + packssdw m3, m4 + pmulhrsw m3, [pw_512] + vextracti128 xm4, m3, 1 + packuswb xm3, xm4 - dec dword [rsp] - jnz .loopH + movd [r2], xm3 + pextrd [r2+r3], xm3, 2 + lea r2, [r2 + r3 * 2] + pextrd [r2], xm3, 1 + pextrd [r2+r3], xm3, 3 + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + test r4d, r4d + jnz .loop RET %endmacro - FILTER_VER_CHROMA_SS 4, 4 - FILTER_VER_CHROMA_SS 4, 8 - FILTER_VER_CHROMA_SS 16, 16 - FILTER_VER_CHROMA_SS 16, 8 - FILTER_VER_CHROMA_SS 16, 12 - FILTER_VER_CHROMA_SS 12, 16 - FILTER_VER_CHROMA_SS 16, 4 - FILTER_VER_CHROMA_SS 4, 16 - FILTER_VER_CHROMA_SS 32, 32 - FILTER_VER_CHROMA_SS 32, 16 - FILTER_VER_CHROMA_SS 16, 32 - FILTER_VER_CHROMA_SS 32, 24 - FILTER_VER_CHROMA_SS 24, 32 - FILTER_VER_CHROMA_SS 32, 8 +IPFILTER_CHROMA_PP_4xN_AVX2 4 , 8 +IPFILTER_CHROMA_PP_4xN_AVX2 4 , 16 - FILTER_VER_CHROMA_SS 16, 24 - FILTER_VER_CHROMA_SS 12, 32 - FILTER_VER_CHROMA_SS 4, 32 - FILTER_VER_CHROMA_SS 32, 64 - FILTER_VER_CHROMA_SS 16, 64 - FILTER_VER_CHROMA_SS 32, 48 - FILTER_VER_CHROMA_SS 24, 64 +%macro IPFILTER_LUMA_PS_32xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_%1x%2, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + mova m6, [tab_Lm + 32] + mova m1, [tab_Lm] + mov r4d, %2 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_1] + mova m7, [interp8_hps_shuf] - FILTER_VER_CHROMA_SS 64, 64 - FILTER_VER_CHROMA_SS 64, 32 - FILTER_VER_CHROMA_SS 64, 48 - FILTER_VER_CHROMA_SS 48, 64 
- FILTER_VER_CHROMA_SS 64, 16 + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_1 -;--------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W2_4R 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5 + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride + sub r0, r6 + add r4d, 7 + +.label + lea r6, [pw_2000] +.loop + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + + vbroadcasti128 m4, [r0 + 8] + pshufb m5, m4, m6 ;row 0 (col 12 to 15) + pshufb m4, m1 ;row 0 (col 8 to 11) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 ; row 0 (col 20 to 23) + pshufb m3, m1 ; row 0 (col 16 to 19) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 24] + pshufb m5, m4, m6 ;row 0 (col 28 to 31) + pshufb m4, m1 ;row 0 (col 24 to 27) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + + movu [r2 + 32], m3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endmacro - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 
+IPFILTER_LUMA_PS_32xN_AVX2 32 , 32 +IPFILTER_LUMA_PS_32xN_AVX2 32 , 16 +IPFILTER_LUMA_PS_32xN_AVX2 32 , 24 +IPFILTER_LUMA_PS_32xN_AVX2 32 , 8 +IPFILTER_LUMA_PS_32xN_AVX2 32 , 64 +INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_48x64, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + mova m6, [tab_Lm + 32] + mova m1, [tab_Lm] + mov r4d, 64 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_2000] + mova m7, [pw_1] + + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride + sub r0, r6 ; r0(src)-r6 + add r4d, 7 ; blkheight += N - 1 (7 - 1 = 6 ; since the last one row not in loop) + +.label + lea r6, [interp8_hps_shuf] +.loop + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 8] + pshufb m5, m4, m6 ;row 0 (col 12 to 15) + pshufb m4, m1 ;row 0 (col 8 to 11) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 ; row 0 (col 20 to 23) + pshufb m3, m1 ; row 0 (col 16 to 19) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 24] + pshufb m5, m4, m6 ;row 0 (col 28 to 31) + pshufb m4, m1 ;row 0 (col 24 to 27) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu 
[r2 + 32], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 32] + pshufb m4, m3, m6 ; row 0 (col 36 to 39) + pshufb m3, m1 ; row 0 (col 32 to 35) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 40] + pshufb m5, m4, m6 ;row 0 (col 44 to 47) + pshufb m4, m1 ;row 0 (col 40 to 43) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m7 + pmaddwd m5, m7 + packssdw m4, m5 + pmaddwd m3, m7 + pmaddwd m4, m7 + packssdw m3, m4 + mova m5, [r6] + vpermd m3, m5, m3 + psubw m3, m2 + movu [r2 + 64], m3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET + +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_24x32, 4,6,8 + sub r0, 3 + mov r4d, r4m %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] %else - lea r5, [tab_ChromaCoeffV + r4] + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif + movu m3, [tab_Tm + 16] + vpbroadcastd m7, [pw_1] + lea r5, [tab_Tm] - mov r4d, (%2/4) - -.loopH: - PROCESS_CHROMA_SP_W2_4R r5 - - psrad m0, 6 - psrad m2, 6 - - packssdw m0, m2 - - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 - - lea r2, [r2 + 2 * r3] - - dec r4d - jnz .loopH - - RET -%endmacro + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 -FILTER_VER_CHROMA_SS_W2_4R 2, 4 -FILTER_VER_CHROMA_SS_W2_4R 2, 8 + mov r4d, 32 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m4, m3 + pshufb m4, [r5] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 -FILTER_VER_CHROMA_SS_W2_4R 2, 16 + vbroadcasti128 m5, [r0 + 8] + pshufb m6, m5, m3 + pshufb m5, [r5] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] 
-;--------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;--------------------------------------------------------------------------------------------------------------- -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_4x2, 5, 6, 4 + vbroadcasti128 m2, [r0 + 16] + pshufb m5, m2, m3 + pshufb m2, [r5] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 + packssdw m2, m2 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b + + movu [r2], xm4 + movq [r2 + 16], xm5 + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +INIT_YMM avx2 +cglobal interp_8tap_horiz_pp_12x16, 4,6,8 + sub r0, 3 + mov r4d, r4m %ifdef PIC - lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r5, [tab_LumaCoeff] + vpbroadcastd m0, [r5 + r4 * 8] + vpbroadcastd m1, [r5 + r4 * 8 + 4] %else - lea r5, [tab_ChromaCoeffV + r4] + vpbroadcastd m0, [tab_LumaCoeff + r4 * 8] + vpbroadcastd m1, [tab_LumaCoeff + r4 * 8 + 4] %endif + movu m3, [tab_Tm + 16] + vpbroadcastd m7, [pw_1] + lea r5, [tab_Tm] - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1 + ; register map + ; m0 , m1 interpolate coeff + ; m2 , m2 shuffle order table + ; m7 - pw_1 - lea r0, [r0 + 2 * r1] - movq m2, [r0] - punpcklwd m1, m2 ;m1=[1 2] - pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2 + mov r4d, 8 +.loop: + ; Row 0 + vbroadcasti128 m4, [r0] ;first 8 element + pshufb m5, m4, m3 + pshufb m4, [r5] + pmaddubsw m4, m0 + pmaddubsw m5, m1 + paddw m4, m5 + pmaddwd m4, m7 - movq m3, [r0 + r1] - punpcklwd m2, m3 ;m4=[2 3] - pmaddwd m2, [r5 + 1 * 16] - paddd m0, m2 ;m0=[0+1+2+3] Row1 done - psrad m0, 6 + vbroadcasti128 m5, [r0 + 8] ; element 8 to 11 + pshufb m6, m5, 
m3 + pshufb m5, [r5] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 - movq m2, [r0 + 2 * r1] - punpcklwd m3, m2 ;m5=[3 4] - pmaddwd m3, [r5 + 1 * 16] - paddd m1, m3 ;m1=[1+2+3+4] Row2 done - psrad m1, 6 + packssdw m4, m5 ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00] + pmulhrsw m4, [pw_512] - packssdw m0, m1 + ;Row 1 + vbroadcasti128 m2, [r0 + r1] + pshufb m5, m2, m3 + pshufb m2, [r5] + pmaddubsw m2, m0 + pmaddubsw m5, m1 + paddw m2, m5 + pmaddwd m2, m7 - movlps [r2], m0 - movhps [r2 + r3], m0 + vbroadcasti128 m5, [r0 + r1 + 8] + pshufb m6, m5, m3 + pshufb m5, [r5] + pmaddubsw m5, m0 + pmaddubsw m6, m1 + paddw m5, m6 + pmaddwd m5, m7 - RET + packssdw m2, m5 + pmulhrsw m2, [pw_512] + packuswb m4, m2 + vpermq m4, m4, 11011000b + vextracti128 xm5, m4, 1 + pshufd xm4, xm4, 11011000b + pshufd xm5, xm5, 11011000b -;------------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;------------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W6_H4 2 -INIT_XMM sse4 -cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6 + movq [r2], xm4 + pextrd [r2+8], xm4, 2 + movq [r2 + r3], xm5 + pextrd [r2+r3+8], xm5, 2 + lea r0, [r0 + r1 * 2] + lea r2, [r2 + r3 * 2] + dec r4d + jnz .loop + RET - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 +;------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_pp_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PP_16xN_AVX2 2 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7 + mov r4d, r4m %ifdef PIC - lea r5, 
[tab_ChromaCoeffV] - lea r6, [r5 + r4] + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] %else - lea r6, [tab_ChromaCoeffV + r4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - mov r4d, %2/4 - -.loopH: - PROCESS_CHROMA_SP_W4_4R - - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 - - packssdw m0, m1 - packssdw m2, m3 - - movlps [r2], m0 - movhps [r2 + r3], m0 - lea r5, [r2 + 2 * r3] - movlps [r5], m2 - movhps [r5 + r3], m2 - - lea r5, [4 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 - - PROCESS_CHROMA_SP_W2_4R r6 + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] - psrad m0, 6 - psrad m2, 6 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - packssdw m0, m2 + dec r0 + mov r4d, %2/2 - movd [r2], m0 - pextrd [r2 + r3], m0, 1 - lea r2, [r2 + 2 * r3] - pextrd [r2], m0, 2 - pextrd [r2 + r3], m0, 3 +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 - sub r0, 2 * 4 - lea r2, [r2 + 2 * r3 - 2 * 4] + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 - dec r4d - jnz .loopH + packuswb m3, m4 + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop RET %endmacro -FILTER_VER_CHROMA_SS_W6_H4 6, 8 +IPFILTER_CHROMA_PP_16xN_AVX2 16 , 8 +IPFILTER_CHROMA_PP_16xN_AVX2 16 , 32 +IPFILTER_CHROMA_PP_16xN_AVX2 16 , 12 +IPFILTER_CHROMA_PP_16xN_AVX2 16 , 4 -FILTER_VER_CHROMA_SS_W6_H4 6, 16 +%macro IPFILTER_LUMA_PS_64xN_AVX2 1 
+INIT_YMM avx2 +cglobal interp_8tap_horiz_ps_64x%1, 4, 7, 8 + mov r5d, r5m + mov r4d, r4m +%ifdef PIC + lea r6, [tab_LumaCoeff] + vpbroadcastq m0, [r6 + r4 * 8] +%else + vpbroadcastq m0, [tab_LumaCoeff + r4 * 8] +%endif + mova m6, [tab_Lm + 32] + mova m1, [tab_Lm] + mov r4d, %1 ;height + add r3d, r3d + vbroadcasti128 m2, [pw_1] + mova m7, [interp8_hps_shuf] + ; register map + ; m0 - interpolate coeff + ; m1 , m6 - shuffle order table + ; m2 - pw_2000 + + sub r0, 3 + test r5d, r5d + jz .label + lea r6, [r1 * 3] + sub r0, r6 ; r0(src)-r6 + add r4d, 7 ; blkheight += N - 1 + +.label + lea r6, [pw_2000] +.loop + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m3, m6 ; row 0 (col 4 to 7) + pshufb m3, m1 ; shuffled based on the col order tab_Lm row 0 (col 0 to 3) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 8] + pshufb m5, m4, m6 ;row 0 (col 12 to 15) + pshufb m4, m1 ;row 0 (col 8 to 11) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 16] + pshufb m4, m3, m6 ; row 0 (col 20 to 23) + pshufb m3, m1 ; row 0 (col 16 to 19) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 24] + pshufb m5, m4, m6 ;row 0 (col 28 to 31) + pshufb m4, m1 ;row 0 (col 24 to 27) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2 + 32], m3 ;row 0 + + vbroadcasti128 m3, [r0 + 32] + pshufb m4, m3, m6 ; row 0 (col 36 to 39) + pshufb m3, m1 ; row 0 (col 32 to 35) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 40] + pshufb m5, m4, m6 ;row 0 (col 
44 to 47) + pshufb m4, m1 ;row 0 (col 40 to 43) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2 + 64], m3 ;row 0 + vbroadcasti128 m3, [r0 + 48] + pshufb m4, m3, m6 ; row 0 (col 52 to 55) + pshufb m3, m1 ; row 0 (col 48 to 51) + pmaddubsw m3, m0 + pmaddubsw m4, m0 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + + vbroadcasti128 m4, [r0 + 56] + pshufb m5, m4, m6 ;row 0 (col 60 to 63) + pshufb m4, m1 ;row 0 (col 56 to 59) + pmaddubsw m4, m0 + pmaddubsw m5, m0 + pmaddwd m4, m2 + pmaddwd m5, m2 + packssdw m4, m5 + pmaddwd m3, m2 + pmaddwd m4, m2 + packssdw m3, m4 + vpermd m3, m7, m3 + psubw m3, [r6] + movu [r2 + 96], m3 ;row 0 + + add r0, r1 + add r2, r3 + dec r4d + jnz .loop + RET +%endmacro -;---------------------------------------------------------------------------------------------------------------- -; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;---------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_CHROMA_SS_W8_H2 2 -INIT_XMM sse2 -cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7 +IPFILTER_LUMA_PS_64xN_AVX2 64 +IPFILTER_LUMA_PS_64xN_AVX2 48 +IPFILTER_LUMA_PS_64xN_AVX2 32 +IPFILTER_LUMA_PS_64xN_AVX2 16 - add r1d, r1d - add r3d, r3d - sub r0, r1 - shl r4d, 5 +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;----------------------------------------------------------------------------------------------------------------------------- +%macro IPFILTER_CHROMA_PS_8xN_AVX2 1 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_8x%1, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d %ifdef PIC 
- lea r5, [tab_ChromaCoeffV] - lea r5, [r5 + r4] + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] %else - lea r5, [tab_ChromaCoeffV + r4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - mov r4d, %2/2 -.loopH: - PROCESS_CHROMA_SP_W8_2R + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] - psrad m0, 6 - psrad m1, 6 - psrad m2, 6 - psrad m3, 6 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - packssdw m0, m1 - packssdw m2, m3 + mov r6d, %1/2 + dec r0 + test r5d, r5d + jz .loop + sub r0 , r1 + inc r6d - movu [r2], m0 - movu [r2 + r3], m2 +.loop + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 - lea r2, [r2 + 2 * r3] + ; Row 1 + vbroadcasti128 m4, [r0 + r1] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movu [r2 + r3], xm4 + + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r6d + jnz .loop + test r5d, r5d + jz .end + + ;Row 11 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + packssdw m3, m3 + psubw m3, m5 + vpermq m3, m3, 11011000b + movu [r2], xm3 +.end + RET +%endmacro + + IPFILTER_CHROMA_PS_8xN_AVX2 2 + IPFILTER_CHROMA_PS_8xN_AVX2 32 + IPFILTER_CHROMA_PS_8xN_AVX2 16 + IPFILTER_CHROMA_PS_8xN_AVX2 6 + IPFILTER_CHROMA_PS_8xN_AVX2 4 - dec r4d - jnz .loopH +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_2x4, 4, 7, 3 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + mova xm3, [pw_2000] + dec r0 + test r5d, r5d + jz .label + sub r0, r1 + +.label + lea r6, [r1 * 3] + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r6] + + vinserti128 m1, m1, xm2, 1 + pshufb m1, [interp4_hps_shuf] + pmaddubsw m1, m0 + pmaddwd m1, [pw_1] + vextracti128 xm2, 
m1, 1 + packssdw xm1, xm2 + psubw xm1, xm3 + + lea r4, [r3 * 3] + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 + pextrd [r2 + r4], xm1, 3 + + test r5d, r5d + jz .end + lea r2, [r2 + r3 * 4] + lea r0, [r0 + r1 * 4] + + movq xm1, [r0] + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + vinserti128 m1, m1, xm2, 1 + pshufb m1, [interp4_hps_shuf] + pmaddubsw m1, m0 + pmaddwd m1, [pw_1] + vextracti128 xm2, m1, 1 + packssdw xm1, xm2 + psubw xm1, xm3 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + pextrd [r2 + r3 * 2], xm1, 2 +.end RET -%endmacro -FILTER_VER_CHROMA_SS_W8_H2 8, 2 -FILTER_VER_CHROMA_SS_W8_H2 8, 4 -FILTER_VER_CHROMA_SS_W8_H2 8, 6 -FILTER_VER_CHROMA_SS_W8_H2 8, 8 -FILTER_VER_CHROMA_SS_W8_H2 8, 16 -FILTER_VER_CHROMA_SS_W8_H2 8, 32 +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d -FILTER_VER_CHROMA_SS_W8_H2 8, 12 -FILTER_VER_CHROMA_SS_W8_H2 8, 64 +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + vbroadcasti128 m6, [pw_2000] + test r5d, r5d + jz .label + sub r0, r1 -;----------------------------------------------------------------------------------------------------------------- -; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) -;----------------------------------------------------------------------------------------------------------------- -%macro FILTER_VER_LUMA_SS 2 -INIT_XMM sse2 -cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize +.label + mova m4, [interp4_hps_shuf] + mova m5, [pw_1] + dec r0 + lea r4, [r1 * 3] + movq xm1, [r0] ;row 0 + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m1, m1, xm2, 1 + lea r0, [r0 + r1 * 4] + movq xm3, [r0] + movhps xm3, [r0 + r1] + movq xm2, [r0 + r1 * 2] + movhps xm2, [r0 + r4] + vinserti128 m3, m3, xm2, 1 + + pshufb m1, m4 + pshufb m3, m4 + pmaddubsw 
m1, m0 + pmaddubsw m3, m0 + pmaddwd m1, m5 + pmaddwd m3, m5 + packssdw m1, m3 + psubw m1, m6 + + lea r4, [r3 * 3] + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 + pextrd [r2 + r4], xm2, 1 + lea r2, [r2 + r3 * 4] + pextrd [r2], xm1, 2 + pextrd [r2 + r3], xm1, 3 + pextrd [r2 + r3 * 2], xm2, 2 + pextrd [r2 + r4], xm2, 3 + test r5d, r5d + jz .end + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r3 * 4] + movq xm1, [r0] ;row 0 + movhps xm1, [r0 + r1] + movq xm2, [r0 + r1 * 2] + vinserti128 m1, m1, xm2, 1 + pshufb m1, m4 + pmaddubsw m1, m0 + pmaddwd m1, m5 + packssdw m1, m1 + psubw m1, m6 + vextracti128 xm2, m1, 1 + + movd [r2], xm1 + pextrd [r2 + r3], xm1, 1 + movd [r2 + r3 * 2], xm2 +.end + RET - add r1d, r1d - add r3d, r3d - lea r5, [3 * r1] - sub r0, r5 - shl r4d, 6 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_12x16, 4, 6, 7 + mov r4d, r4m %ifdef PIC - lea r5, [tab_LumaCoeffV] - lea r6, [r5 + r4] + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] %else - lea r6, [tab_LumaCoeffV + r4] + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] %endif - mov dword [rsp], %2/4 -.loopH: - mov r4d, (%1/4) -.loopW: - movq m0, [r0] - movq m1, [r0 + r1] - punpcklwd m0, m1 ;m0=[0 1] - pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1 + mova m6, [pw_512] + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m1, m4 ;m1=[1 2] - pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2 + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[2 3] - pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3 - pmaddwd m4, [r6 + 1 * 16] - paddd m0, m4 ;m0=[0+1+2+3] Row1 + dec r0 + mov r4d, 8 - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[3 4] - pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4 - pmaddwd m5, [r6 + 1 * 16] - paddd m1, m5 ;m1 = [1+2+3+4] Row2 +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] 
+ pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[4 5] - pmaddwd m6, m4, [r6 + 1 * 16] - paddd m2, m6 ;m2=[2+3+4+5] Row3 - pmaddwd m4, [r6 + 2 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1 + ; Row 1 + vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + r1 + 4] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[5 6] - pmaddwd m6, m5, [r6 + 1 * 16] - paddd m3, m6 ;m3=[3+4+5+6] Row4 - pmaddwd m5, [r6 + 2 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2 + packuswb m3, m4 + vpermq m3, m3, 11011000b - movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[6 7] - pmaddwd m6, m4, [r6 + 2 * 16] - paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3 - pmaddwd m4, [r6 + 3 * 16] - paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end - psrad m0, 6 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + pextrd [r2+8], xm3, 2 + movq [r2 + r3], xm4 + pextrd [r2 + r3 + 8],xm4, 2 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r4d + jnz .loop + RET - lea r0, [r0 + 2 * r1] - movq m4, [r0] - punpcklwd m5, m4 ;m5=[7 8] - pmaddwd m6, m5, [r6 + 2 * 16] - paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4 - pmaddwd m5, [r6 + 3 * 16] - paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end - psrad m1, 6 +INIT_YMM avx2 +cglobal interp_4tap_horiz_pp_24x32, 4,6,7 + mov r4d, r4m - packssdw m0, m1 +%ifdef PIC + lea r5, [tab_ChromaCoeff] + vpbroadcastd m0, [r5 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif - movlps [r2], m0 - movhps [r2 + r3], m0 + mova m1, [interp4_horiz_shuf1] + vpbroadcastd m2, [pw_1] + mova m6, [pw_512] + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - 
movq m5, [r0 + r1] - punpcklwd m4, m5 ;m4=[8 9] - pmaddwd m4, [r6 + 3 * 16] - paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end - psrad m2, 6 + dec r0 + mov r4d, 32 - movq m4, [r0 + 2 * r1] - punpcklwd m5, m4 ;m5=[9 10] - pmaddwd m5, [r6 + 3 * 16] - paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end - psrad m3, 6 +.loop: + ; Row 0 + vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + vbroadcasti128 m4, [r0 + 4] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + pmulhrsw m3, m6 - packssdw m2, m3 + vbroadcasti128 m4, [r0 + 16] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + vbroadcasti128 m5, [r0 + 20] + pshufb m5, m1 + pmaddubsw m5, m0 + pmaddwd m5, m2 + packssdw m4, m5 + pmulhrsw m4, m6 - movlps [r2 + 2 * r3], m2 - lea r5, [3 * r3] - movhps [r2 + r5], m2 + packuswb m3, m4 + vpermq m3, m3, 11011000b - lea r5, [8 * r1 - 2 * 4] - sub r0, r5 - add r2, 2 * 4 + vextracti128 xm4, m3, 1 + movu [r2], xm3 + movq [r2 + 16], xm4 + add r2, r3 + add r0, r1 + dec r4d + jnz .loop + RET - dec r4d - jnz .loopW +;----------------------------------------------------------------------------------------------------------------------------- +; void interp_4tap_horiz_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;-----------------------------------------------------------------------------------------------------------------------------; +INIT_YMM avx2 +cglobal interp_4tap_horiz_ps_6x8, 4,7,6 + mov r4d, r4m + mov r5d, r5m + add r3d, r3d - lea r0, [r0 + 4 * r1 - 2 * %1] - lea r2, [r2 + 4 * r3 - 2 * %1] +%ifdef PIC + lea r6, [tab_ChromaCoeff] + vpbroadcastd m0, [r6 + r4 * 4] +%else + vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif - dec dword [rsp] - jnz .loopH + vbroadcasti128 m2, [pw_1] + vbroadcasti128 m5, [pw_2000] + mova m1, [tab_Tm] - RET -%endmacro + ; register map + ; m0 - interpolate coeff + ; m1 - shuffle order table + ; m2 - constant word 1 - 
FILTER_VER_LUMA_SS 4, 4 - FILTER_VER_LUMA_SS 8, 8 - FILTER_VER_LUMA_SS 8, 4 - FILTER_VER_LUMA_SS 4, 8 - FILTER_VER_LUMA_SS 16, 16 - FILTER_VER_LUMA_SS 16, 8 - FILTER_VER_LUMA_SS 8, 16 - FILTER_VER_LUMA_SS 16, 12 - FILTER_VER_LUMA_SS 12, 16 - FILTER_VER_LUMA_SS 16, 4 - FILTER_VER_LUMA_SS 4, 16 - FILTER_VER_LUMA_SS 32, 32 - FILTER_VER_LUMA_SS 32, 16 - FILTER_VER_LUMA_SS 16, 32 - FILTER_VER_LUMA_SS 32, 24 - FILTER_VER_LUMA_SS 24, 32 - FILTER_VER_LUMA_SS 32, 8 - FILTER_VER_LUMA_SS 8, 32 - FILTER_VER_LUMA_SS 64, 64 - FILTER_VER_LUMA_SS 64, 32 - FILTER_VER_LUMA_SS 32, 64 - FILTER_VER_LUMA_SS 64, 48 - FILTER_VER_LUMA_SS 48, 64 - FILTER_VER_LUMA_SS 64, 16 - FILTER_VER_LUMA_SS 16, 64 + mov r6d, 8/2 + dec r0 + test r5d, r5d + jz .loop + sub r0 , r1 + inc r6d + +.loop + ; Row 0 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + + ; Row 1 + vbroadcasti128 m4, [r0 + r1] + pshufb m4, m1 + pmaddubsw m4, m0 + pmaddwd m4, m2 + packssdw m3, m4 + psubw m3, m5 + vpermq m3, m3, 11011000b + vextracti128 xm4, m3, 1 + movq [r2], xm3 + pextrd [r2 + 8], xm3, 2 + movq [r2 + r3], xm4 + pextrd [r2 + r3 + 8], xm4, 2 + lea r2, [r2 + r3 * 2] + lea r0, [r0 + r1 * 2] + dec r6d + jnz .loop + test r5d, r5d + jz .end + + ;Row 11 + vbroadcasti128 m3, [r0] + pshufb m3, m1 + pmaddubsw m3, m0 + pmaddwd m3, m2 + packssdw m3, m3 + psubw m3, m5 + vextracti128 xm4, m3, 1 + movq [r2], xm3 + movd [r2+8], xm4 +.end + RET diff -Nru x265-1.5/source/common/x86/ipfilter8.h x265-1.6/source/common/x86/ipfilter8.h --- x265-1.5/source/common/x86/ipfilter8.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/ipfilter8.h 2015-04-02 16:46:36.000000000 +0000 @@ -576,8 +576,12 @@ CHROMA_420_FILTERS(_avx2); CHROMA_420_SP_FILTERS(_sse2); CHROMA_420_SP_FILTERS_SSE4(_sse4); +CHROMA_420_SP_FILTERS(_avx2); +CHROMA_420_SP_FILTERS_SSE4(_avx2); CHROMA_420_SS_FILTERS(_sse2); CHROMA_420_SS_FILTERS_SSE4(_sse4); +CHROMA_420_SS_FILTERS(_avx2); +CHROMA_420_SS_FILTERS_SSE4(_avx2); 
CHROMA_422_FILTERS(_sse4); CHROMA_422_FILTERS(_avx2); @@ -617,10 +621,31 @@ LUMA_SP_FILTERS(_sse4); LUMA_SS_FILTERS(_sse2); LUMA_FILTERS(_avx2); - +LUMA_SP_FILTERS(_avx2); +LUMA_SS_FILTERS(_avx2); void x265_interp_8tap_hv_pp_8x8_sse4(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY); -void x265_luma_p2s_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height); - +void x265_pixelToShort_4x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_4x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_4x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_8x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_8x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_8x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_8x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_16x4_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_16x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_16x12_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_16x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_16x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_16x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_32x8_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_32x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_32x24_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_32x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_32x64_ssse3(const pixel* src, intptr_t 
srcStride, int16_t* dst); +void x265_pixelToShort_64x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); +void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst); #undef LUMA_FILTERS #undef LUMA_SP_FILTERS #undef LUMA_SS_FILTERS diff -Nru x265-1.5/source/common/x86/mc-a.asm x265-1.6/source/common/x86/mc-a.asm --- x265-1.5/source/common/x86/mc-a.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/mc-a.asm 2015-04-02 16:46:36.000000000 +0000 @@ -1759,7 +1759,570 @@ ADDAVG_W16_H4 24 ;----------------------------------------------------------------------------- +; addAvg avx2 code start +;----------------------------------------------------------------------------- + +INIT_YMM avx2 +cglobal addAvg_8x2, 6,6,4, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + movu xm0, [r0] + vinserti128 m0, m0, [r0 + 2 * r3], 1 + + movu xm2, [r1] + vinserti128 m2, m2, [r1 + 2 * r4], 1 + + paddw m0, m2 + pmulhrsw m0, [pw_256] + paddw m0, [pw_128] + + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movq [r2 + r5], xm1 + RET + +cglobal addAvg_8x6, 6,6,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r3], 1 + + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r4], 1 + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movq [r2 + r5], xm1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu xm0, [r0] + vinserti128 m0, m0, [r0+ r3], 1 + + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r4], 1 + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movq [r2 + r5], xm1 + + lea 
r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r3], 1 + + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r4], 1 + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movq [r2 + r5], xm1 + RET + +%macro ADDAVG_W8_H4_AVX2 1 +INIT_YMM avx2 +cglobal addAvg_8x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/4 + +.loop: + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r3], 1 + + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r4], 1 + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movq [r2 + r5], xm1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu xm0, [r0] + vinserti128 m0, m0, [r0 + r3], 1 + + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r4], 1 + + paddw m0, m2 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movq [r2 + r5], xm1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro +ADDAVG_W8_H4_AVX2 4 +ADDAVG_W8_H4_AVX2 8 +ADDAVG_W8_H4_AVX2 16 +ADDAVG_W8_H4_AVX2 32 + +%macro ADDAVG_W12_H4_AVX2 1 +INIT_YMM avx2 +cglobal addAvg_12x%1, 6,7,7, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/4 + +.loop: + movu xm0, [r0] + movu xm1, [r1] + movq xm2, [r0 + 16] + movq xm3, [r1 + 16] + vinserti128 m0, m0, xm2, 1 + vinserti128 m1, m1, xm3, 1 + + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu xm1, [r0 + r3] + movu xm2, [r1 + r4] + movq xm3, [r0 + r3 + 16] + movq xm6, [r1 + r4 + 16] + vinserti128 m1, m1, xm3, 1 + vinserti128 m2, m2, xm6, 1 + + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movd [r2 + 8], xm1
+ vpshufd m1, m1, 2 + movhps [r2 + r5], xm0 + movd [r2 + r5 + 8], xm1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu xm0, [r0] + movu xm1, [r1] + movq xm2, [r0 + 16] + movq xm3, [r1 + 16] + vinserti128 m0, m0, xm2, 1 + vinserti128 m1, m1, xm3, 1 + + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu xm1, [r0 + r3] + movu xm2, [r1 + r4] + movq xm3, [r0 + r3 + 16] + movq xm6, [r1 + r4 + 16] + vinserti128 m1, m1, xm3, 1 + vinserti128 m2, m2, xm6, 1 + + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vextracti128 xm1, m0, 1 + movq [r2], xm0 + movd [r2 + 8], xm1 + vpshufd m1, m1, 2 + movhps [r2 + r5], xm0 + movd [r2 + r5 + 8], xm1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W12_H4_AVX2 16 + +%macro ADDAVG_W16_H4_AVX2 1 +INIT_YMM avx2 +cglobal addAvg_16x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/4 + +.loop: + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + r3] + movu m2, [r1 + r4] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + vextracti128 [r2], m0, 0 + vextracti128 [r2 + r5], m0, 1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + r3] + movu m2, [r1 + r4] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + vextracti128 [r2], m0, 0 + vextracti128 [r2 + r5], m0, 1 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W16_H4_AVX2 4 +ADDAVG_W16_H4_AVX2 8 +ADDAVG_W16_H4_AVX2 12 +ADDAVG_W16_H4_AVX2 16 +ADDAVG_W16_H4_AVX2 32 +ADDAVG_W16_H4_AVX2 64 + +%macro ADDAVG_W24_H2_AVX2 1 +INIT_YMM avx2 +cglobal
addAvg_24x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu xm1, [r0 + 32] + movu xm2, [r1 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 10001101b + vextracti128 [r2], m0, 1 + movq [r2 + 16], xm0 + + movu m0, [r0 + r3] + movu m1, [r1 + r4] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu xm1, [r0 + r3 + 32] + movu xm2, [r1 + r4 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 10001101b + vextracti128 [r2 + r5], m0, 1 + movq [r2 + r5 + 16], xm0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W24_H2_AVX2 32 + +%macro ADDAVG_W32_H2_AVX2 1 +INIT_YMM avx2 +cglobal addAvg_32x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 32] + movu m2, [r1 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2], m0 + + movu m0, [r0 + r3] + movu m1, [r1 + r4] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + r3 + 32] + movu m2, [r1 + r4 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2 + r5], m0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W32_H2_AVX2 8 +ADDAVG_W32_H2_AVX2 16 +ADDAVG_W32_H2_AVX2 24 +ADDAVG_W32_H2_AVX2 32 +ADDAVG_W32_H2_AVX2 64 + +%macro ADDAVG_W64_H2_AVX2 1 +INIT_YMM avx2 +cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, 
[pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 32] + movu m2, [r1 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2], m0 + + movu m0, [r0 + 64] + movu m1, [r1 + 64] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 96] + movu m2, [r1 + 96] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2 + 32], m0 + + movu m0, [r0 + r3] + movu m1, [r1 + r4] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + r3 + 32] + movu m2, [r1 + r4 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2 + r5], m0 + + movu m0, [r0 + r3 + 64] + movu m1, [r1 + r4 + 64] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + r3 + 96] + movu m2, [r1 + r4 + 96] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2 + r5 + 32], m0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W64_H2_AVX2 16 +ADDAVG_W64_H2_AVX2 32 +ADDAVG_W64_H2_AVX2 48 +ADDAVG_W64_H2_AVX2 64 + +%macro ADDAVG_W48_H2_AVX2 1 +INIT_YMM avx2 +cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride + mova m4, [pw_256] + mova m5, [pw_128] + add r3, r3 + add r4, r4 + mov r6d, %1/2 + +.loop: + movu m0, [r0] + movu m1, [r1] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + movu m1, [r0 + 32] + movu m2, [r1 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2], m0 + + movu m0, [r0 + 64] + movu m1, [r1 + 64] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vpermq m0, m0, 11011000b + vextracti128 [r2 + 32], m0, 0 + + movu m0, [r0 + r3] + movu m1, [r1 + r4] + paddw m0, m1 + pmulhrsw 
m0, m4 + paddw m0, m5 + + movu m1, [r0 + r3 + 32] + movu m2, [r1 + r4 + 32] + paddw m1, m2 + pmulhrsw m1, m4 + paddw m1, m5 + + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r2 + r5], m0 + + movu m0, [r0 + r3 + 64] + movu m1, [r1 + r4 + 64] + paddw m0, m1 + pmulhrsw m0, m4 + paddw m0, m5 + + packuswb m0, m0 + vpermq m0, m0, 11011000b + vextracti128 [r2 + r5 + 32], m0, 0 + + lea r2, [r2 + 2 * r5] + lea r0, [r0 + 2 * r3] + lea r1, [r1 + 2 * r4] + + dec r6d + jnz .loop + RET +%endmacro + +ADDAVG_W48_H2_AVX2 64 + +;----------------------------------------------------------------------------- +; addAvg avx2 code end +;----------------------------------------------------------------------------- ;----------------------------------------------------------------------------- %macro ADDAVG_W24_H2 2 @@ -2377,6 +2940,46 @@ mova [t0], xm0 vextracti128 [t0+t1], m0, 1 AVG_END + +cglobal pixel_avg_weight_w32 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu m0, [t2] + movu m1, [t4] + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], m0 + AVG_END + +cglobal pixel_avg_weight_w64 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu m0, [t2] + movu m1, [t4] + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], m0 + movu m0, [t2 + 32] + movu m1, [t4 + 32] + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0 + 32], m0 + AVG_END + %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -2982,20 +3585,26 @@ ;AVG_FUNC 24, movdqu, movdqa ;AVGH 24, 32 -;AVG_FUNC 64, movdqu, movdqa -;AVGH 64, 64 -;AVGH 64, 48 -;AVGH 64, 16 - -;AVG_FUNC 32, movdqu, movdqa -;AVGH 32, 64 -;AVGH 32, 32 -;AVGH 32, 24 -;AVGH 32, 16 -;AVGH 32, 8 +AVG_FUNC 64, movdqu, movdqa +AVGH 64, 64 +AVGH 64, 48 +AVGH 64, 32 +AVGH 64, 16 + 
+AVG_FUNC 32, movdqu, movdqa +AVGH 32, 64 +AVGH 32, 32 +AVGH 32, 24 +AVGH 32, 16 +AVGH 32, 8 + AVG_FUNC 16, movdqu, movdqa +AVGH 16, 64 +AVGH 16, 32 AVGH 16, 16 -AVGH 16, 8 +AVGH 16, 12 +AVGH 16, 8 +AVGH 16, 4 %endif ;HIGH_BIT_DEPTH diff -Nru x265-1.5/source/common/x86/pixel-a.asm x265-1.6/source/common/x86/pixel-a.asm --- x265-1.5/source/common/x86/pixel-a.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/pixel-a.asm 2015-04-02 16:46:36.000000000 +0000 @@ -38,13 +38,15 @@ times 4 db 1, -1 times 8 db 1 times 4 db 1, -1 -hmul_4p: times 2 db 1, 1, 1, 1, 1, -1, 1, -1 +hmul_4p: times 4 db 1, 1, 1, 1, 1, -1, 1, -1 mask_10: times 4 dw 0, -1 mask_1100: times 2 dd 0, -1 hmul_8w: times 4 dw 1 times 2 dw 1, -1 + times 4 dw 1 + times 2 dw 1, -1 ALIGN 32 -hmul_w: dw 1, -1, 1, -1, 1, -1, 1, -1 +hmul_w: times 2 dw 1, -1, 1, -1, 1, -1, 1, -1 ALIGN 32 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15 @@ -1235,21 +1237,18 @@ RET %else - %if WIN64 -cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx) +cglobal pixel_satd_16x24, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 pxor m7, m7 movhlps m7, m6 @@ -1259,22 +1258,18 @@ movd eax, m6 RET %else -cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64 +cglobal pixel_satd_16x24, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 - lea r0, [r6 
+ 16*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 pxor m7, m7 movhlps m7, m6 @@ -1284,84 +1279,38 @@ movd eax, m6 RET %endif - %if WIN64 -cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx) +cglobal pixel_satd_32x48, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 8*SIZEOF_PIXEL] - lea r2, [r7 + 8*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - lea r2, [r7 + 16*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - pxor m7, m7 - movhlps m7, m6 - paddd m6, m7 - pshufd m7, m6, 1 - paddd m6, m7 - movd eax, m6 - RET -%else -cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64 - SATD_START_SSE2 m6, m7 - mov r6, r0 - mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 8*SIZEOF_PIXEL + lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - pxor m7, m7 - movhlps m7, m6 - paddd m6, m7 - pshufd m7, m6, 1 - paddd m6, m7 - movd eax, m6 - RET -%endif - -%if WIN64 -cglobal pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx) - SATD_START_SSE2 m6, m7 - mov r6, r0 - mov r7, r2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, 
[r6 + 8*SIZEOF_PIXEL] - lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - lea r2, [r7 + 16*SIZEOF_PIXEL] + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 @@ -1373,31 +1322,43 @@ movd eax, m6 RET %else -cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64 +cglobal pixel_satd_32x48, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 24*SIZEOF_PIXEL] mov r2, [rsp] add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 pxor m7, m7 movhlps m7, m6 paddd m6, m7 @@ -1408,7 +1369,7 @@ %endif %if WIN64 -cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx) +cglobal pixel_satd_24x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 @@ -1416,20 +1377,26 @@ call pixel_satd_8x8_internal2 call 
pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] lea r2, [r7 + 8*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] lea r2, [r7 + 16*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - lea r2, [r7 + 24*SIZEOF_PIXEL] call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 @@ -1442,7 +1409,7 @@ movd eax, m6 RET %else -cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64 +cglobal pixel_satd_24x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 @@ -1450,6 +1417,10 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 8*SIZEOF_PIXEL] mov r2, [rsp] add r2, 8*SIZEOF_PIXEL @@ -1457,6 +1428,10 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 lea r0, [r6 + 16*SIZEOF_PIXEL] mov r2, [rsp] add r2, 16*SIZEOF_PIXEL @@ -1464,9 +1439,6 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 24*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 @@ -1481,7 +1453,7 @@ %endif %if WIN64 -cglobal pixel_satd_32x64, 4,8,14 
;if WIN64 && cpuflag(avx) +cglobal pixel_satd_8x64, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 @@ -1493,36 +1465,6 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 8*SIZEOF_PIXEL] - lea r2, [r7 + 8*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - lea r2, [r7 + 16*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - lea r2, [r7 + 24*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 pxor m7, m7 movhlps m7, m6 paddd m6, m7 @@ -1531,7 +1473,7 @@ movd eax, m6 RET %else -cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64 +cglobal pixel_satd_8x64, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 @@ -1543,39 +1485,6 @@ call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 - lea r0, [r6 + 8*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 8*SIZEOF_PIXEL - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 16*SIZEOF_PIXEL - call pixel_satd_8x8_internal2 - 
call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - mov r2, [rsp] - add r2, 24*SIZEOF_PIXEL - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 pxor m7, m7 movhlps m7, m6 paddd m6, m7 @@ -1586,68 +1495,12 @@ %endif %if WIN64 -cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx) +cglobal pixel_satd_8x12, 4,8,14 ;if WIN64 && cpuflag(avx) SATD_START_SSE2 m6, m7 mov r6, r0 mov r7, r2 call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 8*SIZEOF_PIXEL] - lea r2, [r7 + 8*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - lea r2, [r7 + 16*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 24*SIZEOF_PIXEL] - lea r2, [r7 + 24*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call 
pixel_satd_8x8_internal2 - lea r0, [r6 + 32*SIZEOF_PIXEL] - lea r2, [r7 + 32*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 40*SIZEOF_PIXEL] - lea r2, [r7 + 40*SIZEOF_PIXEL] - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 + call %%pixel_satd_8x4_internal2 pxor m7, m7 movhlps m7, m6 paddd m6, m7 @@ -1656,31 +1509,754 @@ movd eax, m6 RET %else -cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64 +cglobal pixel_satd_8x12, 4,7,8,0-gprsize ;if !WIN64 SATD_START_SSE2 m6, m7 mov r6, r0 mov [rsp], r2 call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 8*SIZEOF_PIXEL] - mov r2, [rsp] - add r2,8*SIZEOF_PIXEL - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - call pixel_satd_8x8_internal2 - lea r0, [r6 + 16*SIZEOF_PIXEL] - mov r2, [rsp] + call %%pixel_satd_8x4_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if HIGH_BIT_DEPTH +%if WIN64 +cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) + SATD_START_MMX + mov r6, r0 + mov r7, r2 + pxor m7, m7 + SATD_4x8_SSE vertical, 0, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 
4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 4*SIZEOF_PIXEL] + lea r2, [r7 + 4*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + pxor m1, m1 + movhlps m1, m7 + paddd m7, m1 + pshufd m1, m7, 1 + paddd m7, m1 + movd eax, m7 + RET +%else +cglobal pixel_satd_12x32, 4,7,8,0-gprsize + SATD_START_MMX + mov r6, r0 + mov [rsp], r2 + pxor m7, m7 + SATD_4x8_SSE vertical, 0, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r6 + 4*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 4*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea 
r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + pxor m1, m1 + movhlps m1, m7 + paddd m7, m1 + pshufd m1, m7, 1 + paddd m7, m1 + movd eax, m7 + RET +%endif +%else ;HIGH_BIT_DEPTH +%if WIN64 +cglobal pixel_satd_12x32, 4,8,8 ;if WIN64 && cpuflag(avx) + SATD_START_MMX + mov r6, r0 + mov r7, r2 +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 4*SIZEOF_PIXEL] + lea r2, [r7 + 4*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET +%else +cglobal pixel_satd_12x32, 4,7,8,0-gprsize + SATD_START_MMX + mov r6, r0 + mov [rsp], r2 +%if vertical==0 + mova 
m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 4*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 4*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET +%endif +%endif + +%if HIGH_BIT_DEPTH +%if WIN64 +cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) + SATD_START_MMX + mov r6, r0 + mov r7, r2 + pxor m7, m7 + SATD_4x8_SSE vertical, 0, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + pxor m1, m1 + movhlps m1, m7 + paddd m7, m1 + pshufd m1, m7, 1 + paddd m7, m1 + movd eax, m7 + RET +%else +cglobal pixel_satd_4x32, 4,7,8,0-gprsize + SATD_START_MMX + mov r6, r0 + mov [rsp], r2 + pxor m7, m7 + SATD_4x8_SSE vertical, 0, 4, 5 + lea r0, [r0 + 
r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, 4, 5 + pxor m1, m1 + movhlps m1, m7 + paddd m7, m1 + pshufd m1, m7, 1 + paddd m7, m1 + movd eax, m7 + RET +%endif +%else +%if WIN64 +cglobal pixel_satd_4x32, 4,8,8 ;if WIN64 && cpuflag(avx) + SATD_START_MMX + mov r6, r0 + mov r7, r2 +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET +%else +cglobal pixel_satd_4x32, 4,7,8,0-gprsize + SATD_START_MMX + mov r6, r0 + mov [rsp], r2 +%if vertical==0 + mova m7, [hmul_4p] +%endif + SATD_4x8_SSE vertical, 0, swap + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + lea r0, [r0 + r1*2*SIZEOF_PIXEL] + lea r2, [r2 + r3*2*SIZEOF_PIXEL] + SATD_4x8_SSE vertical, 1, add + HADDW m7, m1 + movd eax, m7 + RET +%endif +%endif + +%if WIN64 +cglobal pixel_satd_32x8, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, 
m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x8, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x16, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x16, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal 
pixel_satd_32x24, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x24, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x32, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + 
call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x32, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_32x64, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_32x64, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 16*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea 
r0, [r6 + 24*SIZEOF_PIXEL] + mov r2, [rsp] + add r2, 24*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%endif + +%if WIN64 +cglobal pixel_satd_48x64, 4,8,14 ;if WIN64 && cpuflag(avx) + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov r7, r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + lea r2, [r7 + 8*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + lea r2, [r7 + 16*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 24*SIZEOF_PIXEL] + lea r2, [r7 + 24*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 32*SIZEOF_PIXEL] + lea r2, [r7 + 32*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call 
pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 40*SIZEOF_PIXEL] + lea r2, [r7 + 40*SIZEOF_PIXEL] + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + pxor m7, m7 + movhlps m7, m6 + paddd m6, m7 + pshufd m7, m6, 1 + paddd m6, m7 + movd eax, m6 + RET +%else +cglobal pixel_satd_48x64, 4,7,8,0-gprsize ;if !WIN64 + SATD_START_SSE2 m6, m7 + mov r6, r0 + mov [rsp], r2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 8*SIZEOF_PIXEL] + mov r2, [rsp] + add r2,8*SIZEOF_PIXEL + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + call pixel_satd_8x8_internal2 + lea r0, [r6 + 16*SIZEOF_PIXEL] + mov r2, [rsp] add r2,16*SIZEOF_PIXEL call pixel_satd_8x8_internal2 call pixel_satd_8x8_internal2 @@ -7546,30 +8122,360 @@ paddd m11, m9 psrld m11, 2 - HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 + HADAMARD8_2D_HMUL 0, 1, 2, 3, 4, 5, 6, 7, 9, 9 + + paddw m0, m1 + paddw m0, m2 + paddw m0, m3 + HADDW m0, m1 + + paddd m0, m14 + psrld m0, 1 + psubd m0, m11 + psubd m12, m0 + pabsd m0, m12 + paddd m13, m0 + add r0, 8 + add r2, 8 + dec r6d + jnz .loopW + lea r0, [r0 + r1 * 8 - 64] + lea r2, [r2 + r3 * 8 - 64] + dec r7d + jnz .loopH + movd eax, m13 + RET +%endif ; HIGH_BIT_DEPTH +%endif + +INIT_YMM avx2 +cglobal psyCost_pp_4x4, 4, 5, 6 + lea r4, [3 * r1] + movd xm0, [r0] + movd xm1, [r0 + r1] + movd xm2, [r0 + r1 * 2] 
+ movd xm3, [r0 + r4] + vshufps xm0, xm1, 0 + vshufps xm2, xm3, 0 + + lea r4, [3 * r3] + movd xm1, [r2] + movd xm3, [r2 + r3] + movd xm4, [r2 + r3 * 2] + movd xm5, [r2 + r4] + vshufps xm1, xm3, 0 + vshufps xm4, xm5, 0 + + vinserti128 m0, m0, xm1, 1 + vinserti128 m2, m2, xm4, 1 + + mova m4, [hmul_4p] + pmaddubsw m0, m4 + pmaddubsw m2, m4 + + paddw m5, m0, m2 + mova m1, m5 + psrldq m4, m5, 8 + paddw m5, m4 + pmaddwd m5, [pw_1] + psrld m5, 2 + + vpsubw m2, m2, m0 + vpunpckhqdq m0, m1, m2 + vpunpcklqdq m1, m1, m2 + vpaddw m2, m1, m0 + vpsubw m0, m0, m1 + vpblendw m1, m2, m0, 10101010b + vpslld m0, m0, 10h + vpsrld m2, m2, 10h + vpor m0, m0, m2 + vpabsw m1, m1 + vpabsw m0, m0 + vpmaxsw m1, m1, m0 + vpmaddwd m1, m1, [pw_1] + psrldq m2, m1, 8 + paddd m1, m2 + psrldq m3, m1, 4 + paddd m1, m3 + psubd m1, m5 + vextracti128 xm2, m1, 1 + psubd m1, m2 + pabsd m1, m1 + movd eax, xm1 + RET + +%macro PSY_PP_8x8 0 + movddup m0, [r0 + r1 * 0] + movddup m1, [r0 + r1 * 1] + movddup m2, [r0 + r1 * 2] + movddup m3, [r0 + r4 * 1] + + lea r5, [r0 + r1 * 4] + + movddup m4, [r2 + r3 * 0] + movddup m5, [r2 + r3 * 1] + movddup m6, [r2 + r3 * 2] + movddup m7, [r2 + r7 * 1] + + lea r6, [r2 + r3 * 4] + + vinserti128 m0, m0, xm4, 1 + vinserti128 m1, m1, xm5, 1 + vinserti128 m2, m2, xm6, 1 + vinserti128 m3, m3, xm7, 1 + + movddup m4, [r5 + r1 * 0] + movddup m5, [r5 + r1 * 1] + movddup m6, [r5 + r1 * 2] + movddup m7, [r5 + r4 * 1] + + movddup m9, [r6 + r3 * 0] + movddup m10, [r6 + r3 * 1] + movddup m11, [r6 + r3 * 2] + movddup m12, [r6 + r7 * 1] + + vinserti128 m4, m4, xm9, 1 + vinserti128 m5, m5, xm10, 1 + vinserti128 m6, m6, xm11, 1 + vinserti128 m7, m7, xm12, 1 + + pmaddubsw m0, m8 + pmaddubsw m1, m8 + pmaddubsw m2, m8 + pmaddubsw m3, m8 + pmaddubsw m4, m8 + pmaddubsw m5, m8 + pmaddubsw m6, m8 + pmaddubsw m7, m8 + + paddw m11, m0, m1 + paddw m11, m2 + paddw m11, m3 + paddw m11, m4 + paddw m11, m5 + paddw m11, m6 + paddw m11, m7 + + pmaddwd m11, [pw_1] + psrldq m10, m11, 4 + paddd m11, m10 + 
psrld m11, 2 + + mova m9, m0 + paddw m0, m1 ; m0+m1 + psubw m1, m9 ; m1-m0 + mova m9, m2 + paddw m2, m3 ; m2+m3 + psubw m3, m9 ; m3-m2 + mova m9, m0 + paddw m0, m2 ; m0+m1+m2+m3 + psubw m2, m9 ; m2+m3-m0+m1 + mova m9, m1 + paddw m1, m3 ; m1-m0+m3-m2 + psubw m3, m9 ; m3-m2-m1-m0 + + movdqa m9, m4 + paddw m4, m5 ; m4+m5 + psubw m5, m9 ; m5-m4 + movdqa m9, m6 + paddw m6, m7 ; m6+m7 + psubw m7, m9 ; m7-m6 + movdqa m9, m4 + paddw m4, m6 ; m4+m5+m6+m7 + psubw m6, m9 ; m6+m7-m4+m5 + movdqa m9, m5 + paddw m5, m7 ; m5-m4+m7-m6 + psubw m7, m9 ; m7-m6-m5-m4 + + movdqa m9, m0 + paddw m0, m4 ; (m0+m1+m2+m3)+(m4+m5+m6+m7) + psubw m4, m9 ; (m4+m5+m6+m7)-(m0+m1+m2+m3) + movdqa m9, m1 + paddw m1, m5 ; (m1-m0+m3-m2)+(m5-m4+m7-m6) + psubw m5, m9 ; (m5-m4+m7-m6)-(m1-m0+m3-m2) + + mova m9, m0 + vshufps m9, m9, m4, 11011101b + vshufps m0, m0, m4, 10001000b + + movdqa m4, m0 + paddw m0, m9 ; (a0 + a4) + (a4 - a0) + psubw m9, m4 ; (a0 + a4) - (a4 - a0) == (a0 + a4) + (a0 - a4) + + movaps m4, m1 + vshufps m4, m4, m5, 11011101b + vshufps m1, m1, m5, 10001000b + + movdqa m5, m1 + paddw m1, m4 + psubw m4, m5 + movdqa m5, m2 + paddw m2, m6 + psubw m6, m5 + movdqa m5, m3 + paddw m3, m7 + psubw m7, m5 + + movaps m5, m2 + vshufps m5, m5, m6, 11011101b + vshufps m2, m2, m6, 10001000b + + movdqa m6, m2 + paddw m2, m5 + psubw m5, m6 + movaps m6, m3 + + vshufps m6, m6, m7, 11011101b + vshufps m3, m3, m7, 10001000b + + movdqa m7, m3 + paddw m3, m6 + psubw m6, m7 + movdqa m7, m0 + + pblendw m0, m9, 10101010b + pslld m9, 10h + psrld m7, 10h + por m9, m7 + pabsw m0, m0 + pabsw m9, m9 + pmaxsw m0, m9 + movdqa m7, m1 + pblendw m1, m4, 10101010b + pslld m4, 10h + psrld m7, 10h + por m4, m7 + pabsw m1, m1 + pabsw m4, m4 + pmaxsw m1, m4 + movdqa m7, m2 + pblendw m2, m5, 10101010b + pslld m5, 10h + psrld m7, 10h + por m5, m7 + pabsw m2, m2 + pabsw m5, m5 + pmaxsw m2, m5 + mova m7, m3 + + pblendw m3, m6, 10101010b + pslld m6, 10h + psrld m7, 10h + por m6, m7 + pabsw m3, m3 + pabsw m6, m6 + pmaxsw m3, m6 + paddw 
m0, m1 + paddw m0, m2 + paddw m0, m3 + pmaddwd m0, [pw_1] + psrldq m1, m0, 8 + paddd m0, m1 + + pshuflw m1, m0, 00001110b + paddd m0, m1 + paddd m0, [pd_1] + psrld m0, 1 + + psubd m0, m11 + + vextracti128 xm1, m0, 1 + psubd m0, m1 + pabsd m0, m0 +%endmacro + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal psyCost_pp_8x8, 4, 8, 13 + lea r4, [3 * r1] + lea r7, [3 * r3] + mova m8, [hmul_8p] + + PSY_PP_8x8 + + movd eax, xm0 + RET +%endif + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal psyCost_pp_16x16, 4, 10, 14 + lea r4, [3 * r1] + lea r7, [3 * r3] + mova m8, [hmul_8p] + pxor m13, m13 + + mov r8d, 2 +.loopH: + mov r9d, 2 +.loopW: + PSY_PP_8x8 + + paddd m13, m0 + add r0, 8 + add r2, 8 + dec r9d + jnz .loopW + lea r0, [r0 + r1 * 8 - 16] + lea r2, [r2 + r3 * 8 - 16] + dec r8d + jnz .loopH + movd eax, xm13 + RET +%endif + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal psyCost_pp_32x32, 4, 10, 14 + lea r4, [3 * r1] + lea r7, [3 * r3] + mova m8, [hmul_8p] + pxor m13, m13 + + mov r8d, 4 +.loopH: + mov r9d, 4 +.loopW: + PSY_PP_8x8 + + paddd m13, m0 + add r0, 8 + add r2, 8 + dec r9d + jnz .loopW + lea r0, [r0 + r1 * 8 - 32] + lea r2, [r2 + r3 * 8 - 32] + dec r8d + jnz .loopH + movd eax, xm13 + RET +%endif + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal psyCost_pp_64x64, 4, 10, 14 + lea r4, [3 * r1] + lea r7, [3 * r3] + mova m8, [hmul_8p] + pxor m13, m13 - paddw m0, m1 - paddw m0, m2 - paddw m0, m3 - HADDW m0, m1 + mov r8d, 8 +.loopH: + mov r9d, 8 +.loopW: + PSY_PP_8x8 - paddd m0, m14 - psrld m0, 1 - psubd m0, m11 - psubd m12, m0 - pabsd m0, m12 paddd m13, m0 add r0, 8 add r2, 8 - dec r6d + dec r9d jnz .loopW lea r0, [r0 + r1 * 8 - 64] lea r2, [r2 + r3 * 8 - 64] - dec r7d + dec r8d jnz .loopH - movd eax, m13 + movd eax, xm13 RET -%endif ; HIGH_BIT_DEPTH %endif ;--------------------------------------------------------------------------------------------------------------------- @@ -7613,136 +8519,419 @@ paddd m1, m4 shufps m1, m5, 10001000b - psrldq m4, m2, 4 - psubd m5, m2, m4 - paddd m2, m4 - shufps 
m2, m5, 10001000b + psrldq m4, m2, 4 + psubd m5, m2, m4 + paddd m2, m4 + shufps m2, m5, 10001000b + + psrldq m4, m3, 4 + psubd m5, m3, m4 + paddd m3, m4 + shufps m3, m5, 10001000b + + mova m4, m0 + paddd m0, m1 + psubd m1, m4 + mova m4, m2 + paddd m2, m3 + psubd m3, m4 + mova m4, m0 + paddd m0, m2 + psubd m2, m4 + mova m4, m1 + paddd m1, m3 + psubd m3, m4 + + pabsd m0, m0 + pabsd m2, m2 + pabsd m1, m1 + pabsd m3, m3 + paddd m0, m2 + paddd m1, m3 + paddd m0, m1 + movhlps m1, m0 + paddd m0, m1 + psrldq m1, m0, 4 + paddd m0, m1 + psrld m0, 1 + psubd m7, m0, m6 + + add r3, r3 + lea r4, [3 * r3] + movddup m0, [r2] + movddup m1, [r2 + r3] + movddup m2, [r2 + r3 * 2] + movddup m3, [r2 + r4] + + pabsw m4, m0 + pabsw m5, m1 + paddw m5, m4 + pabsw m4, m2 + paddw m5, m4 + pabsw m4, m3 + paddw m5, m4 + pmaddwd m5, [pw_1] + psrldq m4, m5, 4 + paddd m5, m4 + psrld m6, m5, 2 + + mova m4, [hmul_8w] + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m4 + pmaddwd m3, m4 + + psrldq m4, m0, 4 + psubd m5, m0, m4 + paddd m0, m4 + shufps m0, m5, 10001000b + + psrldq m4, m1, 4 + psubd m5, m1, m4 + paddd m1, m4 + shufps m1, m5, 10001000b + + psrldq m4, m2, 4 + psubd m5, m2, m4 + paddd m2, m4 + shufps m2, m5, 10001000b + + psrldq m4, m3, 4 + psubd m5, m3, m4 + paddd m3, m4 + shufps m3, m5, 10001000b + + mova m4, m0 + paddd m0, m1 + psubd m1, m4 + mova m4, m2 + paddd m2, m3 + psubd m3, m4 + mova m4, m0 + paddd m0, m2 + psubd m2, m4 + mova m4, m1 + paddd m1, m3 + psubd m3, m4 + + pabsd m0, m0 + pabsd m2, m2 + pabsd m1, m1 + pabsd m3, m3 + paddd m0, m2 + paddd m1, m3 + paddd m0, m1 + movhlps m1, m0 + paddd m0, m1 + psrldq m1, m0, 4 + paddd m0, m1 + psrld m0, 1 + psubd m0, m6 + psubd m7, m0 + pabsd m0, m7 + movd eax, m0 + RET + +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal psyCost_ss_8x8, 4, 6, 15 + + mova m13, [hmul_w] + mova m14, [pw_1] + add r1, r1 + add r3, r3 + lea r4, [3 * r1] + movu m0, [r0] + movu m1, [r0 + r1] + movu m2, [r0 + r1 * 2] + movu m3, [r0 + r4] + lea r5, [r0 + r1 * 4] + movu m4, 
[r5] + movu m5, [r5 + r1] + movu m6, [r5 + r1 * 2] + movu m7, [r5 + r4] + + pabsw m8, m0 + pabsw m9, m1 + paddw m8, m9 + pabsw m10, m2 + pabsw m11, m3 + paddw m10, m11 + paddw m8, m10 + pabsw m9, m4 + pabsw m10, m5 + paddw m9, m10 + pabsw m11, m6 + pabsw m12, m7 + paddw m11, m12 + paddw m9, m11 + paddw m8, m9 + movhlps m9, m8 + pmovzxwd m8, m8 + pmovzxwd m9, m9 + paddd m8, m9 + movhlps m9, m8 + paddd m8, m9 + psrldq m9, m8, 4 + paddd m8, m9 + psrld m8, 2 + + pmaddwd m0, m13 + pmaddwd m1, m13 + pmaddwd m2, m13 + pmaddwd m3, m13 + + psrldq m9, m0, 4 + psubd m10, m0, m9 + paddd m0, m9 + shufps m0, m10, 10001000b + psrldq m9, m0, 4 + psubd m10, m0, m9 + paddd m0, m9 + shufps m0, m10, 10001000b + + psrldq m9, m1, 4 + psubd m10, m1, m9 + paddd m1, m9 + shufps m1, m10, 10001000b + psrldq m9, m1, 4 + psubd m10, m1, m9 + paddd m1, m9 + shufps m1, m10, 10001000b + + psrldq m9, m2, 4 + psubd m10, m2, m9 + paddd m2, m9 + shufps m2, m10, 10001000b + psrldq m9, m2, 4 + psubd m10, m2, m9 + paddd m2, m9 + shufps m2, m10, 10001000b + + psrldq m9, m3, 4 + psubd m10, m3, m9 + paddd m3, m9 + shufps m3, m10, 10001000b + psrldq m9, m3, 4 + psubd m10, m3, m9 + paddd m3, m9 + shufps m3, m10, 10001000b + + SUMSUB_BA d, 0, 1, 9 + SUMSUB_BA d, 2, 3, 9 + SUMSUB_BA d, 0, 2, 9 + SUMSUB_BA d, 1, 3, 9 + + pmaddwd m4, m13 + pmaddwd m5, m13 + pmaddwd m6, m13 + pmaddwd m7, m13 + + psrldq m9, m4, 4 + psubd m10, m4, m9 + paddd m4, m9 + shufps m4, m10, 10001000b + psrldq m9, m4, 4 + psubd m10, m4, m9 + paddd m4, m9 + shufps m4, m10, 10001000b + + psrldq m9, m5, 4 + psubd m10, m5, m9 + paddd m5, m9 + shufps m5, m10, 10001000b + psrldq m9, m5, 4 + psubd m10, m5, m9 + paddd m5, m9 + shufps m5, m10, 10001000b + + psrldq m9, m6, 4 + psubd m10, m6, m9 + paddd m6, m9 + shufps m6, m10, 10001000b + psrldq m9, m6, 4 + psubd m10, m6, m9 + paddd m6, m9 + shufps m6, m10, 10001000b - psrldq m4, m3, 4 - psubd m5, m3, m4 - paddd m3, m4 - shufps m3, m5, 10001000b + psrldq m9, m7, 4 + psubd m10, m7, m9 + paddd m7, m9 + 
shufps m7, m10, 10001000b + psrldq m9, m7, 4 + psubd m10, m7, m9 + paddd m7, m9 + shufps m7, m10, 10001000b - mova m4, m0 - paddd m0, m1 - psubd m1, m4 - mova m4, m2 - paddd m2, m3 - psubd m3, m4 - mova m4, m0 - paddd m0, m2 - psubd m2, m4 - mova m4, m1 - paddd m1, m3 - psubd m3, m4 + SUMSUB_BA d, 4, 5, 9 + SUMSUB_BA d, 6, 7, 9 + SUMSUB_BA d, 4, 6, 9 + SUMSUB_BA d, 5, 7, 9 + + SUMSUB_BA d, 0, 4, 9 + SUMSUB_BA d, 1, 5, 9 + SUMSUB_BA d, 2, 6, 9 + SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 + pabsd m4, m4 + pabsd m5, m5 + pabsd m6, m6 + pabsd m7, m7 + paddd m0, m2 paddd m1, m3 paddd m0, m1 - movhlps m1, m0 - paddd m0, m1 - psrldq m1, m0, 4 - paddd m0, m1 - psrld m0, 1 - psubd m7, m0, m6 + paddd m5, m4 + paddd m0, m5 + paddd m7, m6 + paddd m11, m0, m7 - add r3, r3 - lea r4, [3 * r3] - movddup m0, [r2] - movddup m1, [r2 + r3] - movddup m2, [r2 + r3 * 2] - movddup m3, [r2 + r4] + movu m0, [r0] + movu m1, [r0 + r1] + movu m2, [r0 + r1 * 2] + movu m3, [r0 + r4] - pabsw m4, m0 - pabsw m5, m1 - paddw m5, m4 - pabsw m4, m2 - paddw m5, m4 - pabsw m4, m3 - paddw m5, m4 - pmaddwd m5, [pw_1] - psrldq m4, m5, 4 - paddd m5, m4 - psrld m6, m5, 2 + pmaddwd m0, m14 + pmaddwd m1, m14 + pmaddwd m2, m14 + pmaddwd m3, m14 - mova m4, [hmul_8w] - pmaddwd m0, m4 - pmaddwd m1, m4 - pmaddwd m2, m4 - pmaddwd m3, m4 + psrldq m9, m0, 4 + psubd m10, m0, m9 + paddd m0, m9 + shufps m0, m10, 10001000b + psrldq m9, m0, 4 + psubd m10, m0, m9 + paddd m0, m9 + shufps m0, m10, 10001000b - psrldq m4, m0, 4 - psubd m5, m0, m4 - paddd m0, m4 - shufps m0, m5, 10001000b + psrldq m9, m1, 4 + psubd m10, m1, m9 + paddd m1, m9 + shufps m1, m10, 10001000b + psrldq m9, m1, 4 + psubd m10, m1, m9 + paddd m1, m9 + shufps m1, m10, 10001000b - psrldq m4, m1, 4 - psubd m5, m1, m4 - paddd m1, m4 - shufps m1, m5, 10001000b + psrldq m9, m2, 4 + psubd m10, m2, m9 + paddd m2, m9 + shufps m2, m10, 10001000b + psrldq m9, m2, 4 + psubd m10, m2, m9 + paddd m2, m9 + shufps m2, m10, 10001000b - psrldq 
m4, m2, 4 - psubd m5, m2, m4 - paddd m2, m4 - shufps m2, m5, 10001000b + psrldq m9, m3, 4 + psubd m10, m3, m9 + paddd m3, m9 + shufps m3, m10, 10001000b + psrldq m9, m3, 4 + psubd m10, m3, m9 + paddd m3, m9 + shufps m3, m10, 10001000b - psrldq m4, m3, 4 - psubd m5, m3, m4 - paddd m3, m4 - shufps m3, m5, 10001000b + SUMSUB_BA d, 0, 1, 9 + SUMSUB_BA d, 2, 3, 9 + SUMSUB_BA d, 0, 2, 9 + SUMSUB_BA d, 1, 3, 9 - mova m4, m0 - paddd m0, m1 - psubd m1, m4 - mova m4, m2 - paddd m2, m3 - psubd m3, m4 - mova m4, m0 - paddd m0, m2 - psubd m2, m4 - mova m4, m1 - paddd m1, m3 - psubd m3, m4 + movu m4, [r5] + movu m5, [r5 + r1] + movu m6, [r5 + r1 * 2] + movu m7, [r5 + r4] + + pmaddwd m4, m14 + pmaddwd m5, m14 + pmaddwd m6, m14 + pmaddwd m7, m14 + + psrldq m9, m4, 4 + psubd m10, m4, m9 + paddd m4, m9 + shufps m4, m10, 10001000b + psrldq m9, m4, 4 + psubd m10, m4, m9 + paddd m4, m9 + shufps m4, m10, 10001000b + + psrldq m9, m5, 4 + psubd m10, m5, m9 + paddd m5, m9 + shufps m5, m10, 10001000b + psrldq m9, m5, 4 + psubd m10, m5, m9 + paddd m5, m9 + shufps m5, m10, 10001000b + + psrldq m9, m6, 4 + psubd m10, m6, m9 + paddd m6, m9 + shufps m6, m10, 10001000b + psrldq m9, m6, 4 + psubd m10, m6, m9 + paddd m6, m9 + shufps m6, m10, 10001000b + + psrldq m9, m7, 4 + psubd m10, m7, m9 + paddd m7, m9 + shufps m7, m10, 10001000b + psrldq m9, m7, 4 + psubd m10, m7, m9 + paddd m7, m9 + shufps m7, m10, 10001000b + + SUMSUB_BA d, 4, 5, 9 + SUMSUB_BA d, 6, 7, 9 + SUMSUB_BA d, 4, 6, 9 + SUMSUB_BA d, 5, 7, 9 + + SUMSUB_BA d, 0, 4, 9 + SUMSUB_BA d, 1, 5, 9 + SUMSUB_BA d, 2, 6, 9 + SUMSUB_BA d, 3, 7, 9 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 + pabsd m4, m4 + pabsd m5, m5 + pabsd m6, m6 + pabsd m7, m7 + paddd m0, m2 paddd m1, m3 paddd m0, m1 + paddd m5, m4 + paddd m0, m5 + paddd m7, m6 + paddd m0, m7 + paddd m0, m11 + movhlps m1, m0 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 - psrld m0, 1 - psubd m0, m6 - psubd m7, m0 - pabsd m0, m7 - movd eax, m0 - RET - -%if ARCH_X86_64 -INIT_XMM sse4 
-cglobal psyCost_ss_8x8, 4, 6, 15 + paddd m0, [pd_2] + psrld m0, 2 + psubd m12, m0, m8 - mova m13, [hmul_w] - mova m14, [pw_1] - add r1, r1 - add r3, r3 - lea r4, [3 * r1] - movu m0, [r0] - movu m1, [r0 + r1] - movu m2, [r0 + r1 * 2] - movu m3, [r0 + r4] - lea r5, [r0 + r1 * 4] + lea r4, [3 * r3] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + r3 * 2] + movu m3, [r2 + r4] + lea r5, [r2 + r3 * 4] movu m4, [r5] - movu m5, [r5 + r1] - movu m6, [r5 + r1 * 2] + movu m5, [r5 + r3] + movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] pabsw m8, m0 @@ -7756,8 +8945,8 @@ pabsw m10, m5 paddw m9, m10 pabsw m11, m6 - pabsw m12, m7 - paddw m11, m12 + pabsw m10, m7 + paddw m11, m10 paddw m9, m11 paddw m8, m9 movhlps m9, m8 @@ -7884,10 +9073,10 @@ paddd m7, m6 paddd m11, m0, m7 - movu m0, [r0] - movu m1, [r0 + r1] - movu m2, [r0 + r1 * 2] - movu m3, [r0 + r4] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + r3 * 2] + movu m3, [r2 + r4] pmaddwd m0, m14 pmaddwd m1, m14 @@ -7936,8 +9125,8 @@ SUMSUB_BA d, 1, 3, 9 movu m4, [r5] - movu m5, [r5 + r1] - movu m6, [r5 + r1 * 2] + movu m5, [r5 + r3] + movu m6, [r5 + r3 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 @@ -8015,17 +9204,23 @@ paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 - psubd m12, m0, m8 + psubd m0, m8 - lea r4, [3 * r3] - movu m0, [r2] - movu m1, [r2 + r3] - movu m2, [r2 + r3 * 2] - movu m3, [r2 + r4] - lea r5, [r2 + r3 * 4] + psubd m12, m0 + pabsd m0, m12 + movd eax, m0 + RET +%endif + +%macro psy_cost_ss 0 + movu m0, [r0] + movu m1, [r0 + r1] + movu m2, [r0 + r1 * 2] + movu m3, [r0 + r4] + lea r5, [r0 + r1 * 4] movu m4, [r5] - movu m5, [r5 + r3] - movu m6, [r5 + r3 * 2] + movu m5, [r5 + r1] + movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pabsw m8, m0 @@ -8039,8 +9234,8 @@ pabsw m10, m5 paddw m9, m10 pabsw m11, m6 - pabsw m10, m7 - paddw m11, m10 + pabsw m12, m7 + paddw m11, m12 paddw m9, m11 paddw m8, m9 movhlps m9, m8 @@ -8167,10 +9362,10 @@ paddd m7, m6 paddd m11, m0, m7 - movu m0, [r2] - movu m1, [r2 + r3] - movu m2, [r2 + r3 * 2] - 
movu m3, [r2 + r4] + movu m0, [r0] + movu m1, [r0 + r1] + movu m2, [r0 + r1 * 2] + movu m3, [r0 + r4] pmaddwd m0, m14 pmaddwd m1, m14 @@ -8219,8 +9414,8 @@ SUMSUB_BA d, 1, 3, 9 movu m4, [r5] - movu m5, [r5 + r3] - movu m6, [r5 + r3 * 2] + movu m5, [r5 + r1] + movu m6, [r5 + r1 * 2] movu m7, [r5 + r4] pmaddwd m4, m14 @@ -8298,24 +9493,17 @@ paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 - psubd m0, m8 - - psubd m12, m0 - pabsd m0, m12 - movd eax, m0 - RET -%endif + psubd m12, m0, m8 -%macro psy_cost_ss 0 - movu m0, [r0] - movu m1, [r0 + r1] - movu m2, [r0 + r1 * 2] - movu m3, [r0 + r4] - lea r5, [r0 + r1 * 4] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + r3 * 2] + movu m3, [r2 + r6] + lea r5, [r2 + r3 * 4] movu m4, [r5] - movu m5, [r5 + r1] - movu m6, [r5 + r1 * 2] - movu m7, [r5 + r4] + movu m5, [r5 + r3] + movu m6, [r5 + r3 * 2] + movu m7, [r5 + r6] pabsw m8, m0 pabsw m9, m1 @@ -8328,8 +9516,8 @@ pabsw m10, m5 paddw m9, m10 pabsw m11, m6 - pabsw m12, m7 - paddw m11, m12 + pabsw m10, m7 + paddw m11, m10 paddw m9, m11 paddw m8, m9 movhlps m9, m8 @@ -8456,10 +9644,10 @@ paddd m7, m6 paddd m11, m0, m7 - movu m0, [r0] - movu m1, [r0 + r1] - movu m2, [r0 + r1 * 2] - movu m3, [r0 + r4] + movu m0, [r2] + movu m1, [r2 + r3] + movu m2, [r2 + r3 * 2] + movu m3, [r2 + r6] pmaddwd m0, m14 pmaddwd m1, m14 @@ -8502,102 +9690,313 @@ paddd m3, m9 shufps m3, m10, 10001000b - SUMSUB_BA d, 0, 1, 9 - SUMSUB_BA d, 2, 3, 9 - SUMSUB_BA d, 0, 2, 9 - SUMSUB_BA d, 1, 3, 9 + SUMSUB_BA d, 0, 1, 9 + SUMSUB_BA d, 2, 3, 9 + SUMSUB_BA d, 0, 2, 9 + SUMSUB_BA d, 1, 3, 9 + + movu m4, [r5] + movu m5, [r5 + r3] + movu m6, [r5 + r3 * 2] + movu m7, [r5 + r6] + + pmaddwd m4, m14 + pmaddwd m5, m14 + pmaddwd m6, m14 + pmaddwd m7, m14 + + psrldq m9, m4, 4 + psubd m10, m4, m9 + paddd m4, m9 + shufps m4, m10, 10001000b + psrldq m9, m4, 4 + psubd m10, m4, m9 + paddd m4, m9 + shufps m4, m10, 10001000b + + psrldq m9, m5, 4 + psubd m10, m5, m9 + paddd m5, m9 + shufps m5, m10, 10001000b + psrldq m9, m5, 4 + 
psubd m10, m5, m9 + paddd m5, m9 + shufps m5, m10, 10001000b + + psrldq m9, m6, 4 + psubd m10, m6, m9 + paddd m6, m9 + shufps m6, m10, 10001000b + psrldq m9, m6, 4 + psubd m10, m6, m9 + paddd m6, m9 + shufps m6, m10, 10001000b + + psrldq m9, m7, 4 + psubd m10, m7, m9 + paddd m7, m9 + shufps m7, m10, 10001000b + psrldq m9, m7, 4 + psubd m10, m7, m9 + paddd m7, m9 + shufps m7, m10, 10001000b + + SUMSUB_BA d, 4, 5, 9 + SUMSUB_BA d, 6, 7, 9 + SUMSUB_BA d, 4, 6, 9 + SUMSUB_BA d, 5, 7, 9 + + SUMSUB_BA d, 0, 4, 9 + SUMSUB_BA d, 1, 5, 9 + SUMSUB_BA d, 2, 6, 9 + SUMSUB_BA d, 3, 7, 9 + + pabsd m0, m0 + pabsd m2, m2 + pabsd m1, m1 + pabsd m3, m3 + pabsd m4, m4 + pabsd m5, m5 + pabsd m6, m6 + pabsd m7, m7 + + paddd m0, m2 + paddd m1, m3 + paddd m0, m1 + paddd m5, m4 + paddd m0, m5 + paddd m7, m6 + paddd m0, m7 + paddd m0, m11 + + movhlps m1, m0 + paddd m0, m1 + psrldq m1, m0, 4 + paddd m0, m1 + paddd m0, [pd_2] + psrld m0, 2 + psubd m0, m8 + + psubd m12, m0 + pabsd m0, m12 + paddd m15, m0 +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal psyCost_ss_16x16, 4, 9, 16 + + mova m13, [hmul_w] + mova m14, [pw_1] + add r1, r1 + add r3, r3 + lea r4, [3 * r1] + lea r6, [3 * r3] + pxor m15, m15 + mov r7d, 2 +.loopH: + mov r8d, 2 +.loopW: + psy_cost_ss + add r0, 16 + add r2, 16 + dec r8d + jnz .loopW + lea r0, [r0 + r1 * 8 - 32] + lea r2, [r2 + r3 * 8 - 32] + dec r7d + jnz .loopH + movd eax, m15 + RET +%endif + +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal psyCost_ss_32x32, 4, 9, 16 + + mova m13, [hmul_w] + mova m14, [pw_1] + add r1, r1 + add r3, r3 + lea r4, [3 * r1] + lea r6, [3 * r3] + pxor m15, m15 + mov r7d, 4 +.loopH: + mov r8d, 4 +.loopW: + psy_cost_ss + add r0, 16 + add r2, 16 + dec r8d + jnz .loopW + lea r0, [r0 + r1 * 8 - 64] + lea r2, [r2 + r3 * 8 - 64] + dec r7d + jnz .loopH + movd eax, m15 + RET +%endif + +%if ARCH_X86_64 +INIT_XMM sse4 +cglobal psyCost_ss_64x64, 4, 9, 16 + + mova m13, [hmul_w] + mova m14, [pw_1] + add r1, r1 + add r3, r3 + lea r4, [3 * r1] + lea r6, [3 * r3] 
+ pxor m15, m15 + mov r7d, 8 +.loopH: + mov r8d, 8 +.loopW: + psy_cost_ss + add r0, 16 + add r2, 16 + dec r8d + jnz .loopW + lea r0, [r0 + r1 * 8 - 128] + lea r2, [r2 + r3 * 8 - 128] + dec r7d + jnz .loopH + movd eax, m15 + RET +%endif + +INIT_YMM avx2 +cglobal psyCost_ss_4x4, 4, 5, 8 + add r1, r1 + add r3, r3 + lea r4, [3 * r1] + movddup m0, [r0] + movddup m1, [r0 + r1] + movddup m2, [r0 + r1 * 2] + movddup m3, [r0 + r4] - movu m4, [r5] - movu m5, [r5 + r1] - movu m6, [r5 + r1 * 2] - movu m7, [r5 + r4] + lea r4, [3 * r3] + movddup m4, [r2] + movddup m5, [r2 + r3] + movddup m6, [r2 + r3 * 2] + movddup m7, [r2 + r4] + + vinserti128 m0, m0, xm4, 1 + vinserti128 m1, m1, xm5, 1 + vinserti128 m2, m2, xm6, 1 + vinserti128 m3, m3, xm7, 1 - pmaddwd m4, m14 - pmaddwd m5, m14 - pmaddwd m6, m14 - pmaddwd m7, m14 + pabsw m4, m0 + pabsw m5, m1 + paddw m5, m4 + pabsw m4, m2 + paddw m5, m4 + pabsw m4, m3 + paddw m5, m4 + pmaddwd m5, [pw_1] + psrldq m4, m5, 4 + paddd m5, m4 + psrld m6, m5, 2 - psrldq m9, m4, 4 - psubd m10, m4, m9 - paddd m4, m9 - shufps m4, m10, 10001000b - psrldq m9, m4, 4 - psubd m10, m4, m9 - paddd m4, m9 - shufps m4, m10, 10001000b + mova m4, [hmul_8w] + pmaddwd m0, m4 + pmaddwd m1, m4 + pmaddwd m2, m4 + pmaddwd m3, m4 - psrldq m9, m5, 4 - psubd m10, m5, m9 - paddd m5, m9 - shufps m5, m10, 10001000b - psrldq m9, m5, 4 - psubd m10, m5, m9 - paddd m5, m9 - shufps m5, m10, 10001000b + psrldq m4, m0, 4 + psubd m5, m0, m4 + paddd m0, m4 + shufps m0, m0, m5, 10001000b - psrldq m9, m6, 4 - psubd m10, m6, m9 - paddd m6, m9 - shufps m6, m10, 10001000b - psrldq m9, m6, 4 - psubd m10, m6, m9 - paddd m6, m9 - shufps m6, m10, 10001000b + psrldq m4, m1, 4 + psubd m5, m1, m4 + paddd m1, m4 + shufps m1, m1, m5, 10001000b - psrldq m9, m7, 4 - psubd m10, m7, m9 - paddd m7, m9 - shufps m7, m10, 10001000b - psrldq m9, m7, 4 - psubd m10, m7, m9 - paddd m7, m9 - shufps m7, m10, 10001000b + psrldq m4, m2, 4 + psubd m5, m2, m4 + paddd m2, m4 + shufps m2, m2, m5, 10001000b - SUMSUB_BA 
d, 4, 5, 9 - SUMSUB_BA d, 6, 7, 9 - SUMSUB_BA d, 4, 6, 9 - SUMSUB_BA d, 5, 7, 9 + psrldq m4, m3, 4 + psubd m5, m3, m4 + paddd m3, m4 + shufps m3, m3, m5, 10001000b - SUMSUB_BA d, 0, 4, 9 - SUMSUB_BA d, 1, 5, 9 - SUMSUB_BA d, 2, 6, 9 - SUMSUB_BA d, 3, 7, 9 + mova m4, m0 + paddd m0, m1 + psubd m1, m4 + mova m4, m2 + paddd m2, m3 + psubd m3, m4 + mova m4, m0 + paddd m0, m2 + psubd m2, m4 + mova m4, m1 + paddd m1, m3 + psubd m3, m4 pabsd m0, m0 pabsd m2, m2 pabsd m1, m1 pabsd m3, m3 - pabsd m4, m4 - pabsd m5, m5 - pabsd m6, m6 - pabsd m7, m7 - paddd m0, m2 paddd m1, m3 paddd m0, m1 - paddd m5, m4 - paddd m0, m5 - paddd m7, m6 - paddd m0, m7 - paddd m0, m11 - - movhlps m1, m0 + psrldq m1, m0, 8 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 - paddd m0, [pd_2] - psrld m0, 2 - psubd m12, m0, m8 + psrld m0, 1 + psubd m0, m6 + vextracti128 xm1, m0, 1 + psubd m0, m1 + pabsd m0, m0 + movd eax, xm0 + RET - movu m0, [r2] - movu m1, [r2 + r3] - movu m2, [r2 + r3 * 2] - movu m3, [r2 + r6] - lea r5, [r2 + r3 * 4] - movu m4, [r5] - movu m5, [r5 + r3] - movu m6, [r5 + r3 * 2] - movu m7, [r5 + r6] +%macro PSY_SS_8x8 0 + lea r4, [3 * r1] + lea r6, [r0 + r1 * 4] + movu xm0, [r0] + movu xm1, [r0 + r1] + movu xm2, [r0 + r1 * 2] + movu xm3, [r0 + r4] + movu xm4, [r6] + movu xm5, [r6 + r1] + movu xm6, [r6 + r1 * 2] + movu xm7, [r6 + r4] + + lea r4, [3 * r3] + lea r6, [r2 + r3 * 4] + movu xm8, [r2] + movu xm9, [r2 + r3] + movu xm10, [r2 + r3 * 2] + movu xm11, [r2 + r4] + vinserti128 m0, m0, xm8, 1 + vinserti128 m1, m1, xm9, 1 + vinserti128 m2, m2, xm10, 1 + vinserti128 m3, m3, xm11, 1 + movu xm8, [r6] + movu xm9, [r6 + r3] + movu xm10, [r6 + r3 * 2] + movu xm11, [r6 + r4] + vinserti128 m4, m4, xm8, 1 + vinserti128 m5, m5, xm9, 1 + vinserti128 m6, m6, xm10, 1 + vinserti128 m7, m7, xm11, 1 + + ;; store on stack to use later + mova [rsp + 0 * mmsize], m0 + mova [rsp + 1 * mmsize], m1 + mova [rsp + 2 * mmsize], m2 + mova [rsp + 3 * mmsize], m3 + mova [rsp + 4 * mmsize], m4 + mova [rsp + 5 * 
mmsize], m5 + mova [rsp + 6 * mmsize], m6 + mova [rsp + 7 * mmsize], m7 pabsw m8, m0 pabsw m9, m1 @@ -8614,15 +10013,25 @@ paddw m11, m10 paddw m9, m11 paddw m8, m9 - movhlps m9, m8 - pmovzxwd m8, m8 - pmovzxwd m9, m9 + psrldq m9, m8, 8 + + vextracti128 xm10, m8, 1 + vextracti128 xm11, m9, 1 + + vpmovzxwd m8, xm8 + vpmovzxwd m9, xm9 + vpmovzxwd m10, xm10 + vpmovzxwd m11, xm11 + + vinserti128 m8, m8, xm10, 1 + vinserti128 m9, m9, xm11, 1 + paddd m8, m9 - movhlps m9, m8 + psrldq m9, m8, 8 paddd m8, m9 psrldq m9, m8, 4 paddd m8, m9 - psrld m8, 2 + psrld m8, 2 ; sad_4x4 pmaddwd m0, m13 pmaddwd m1, m13 @@ -8632,38 +10041,38 @@ psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 - shufps m0, m10, 10001000b + vshufps m0, m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 - shufps m0, m10, 10001000b + vshufps m0, m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 - shufps m1, m10, 10001000b + vshufps m1, m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 - shufps m1, m10, 10001000b + vshufps m1, m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 - shufps m2, m10, 10001000b + vshufps m2, m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 - shufps m2, m10, 10001000b + vshufps m2, m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 - shufps m3, m10, 10001000b + vshufps m3, m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 - shufps m3, m10, 10001000b + vshufps m3, m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 @@ -8678,38 +10087,38 @@ psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 - shufps m4, m10, 10001000b + vshufps m4, m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 - shufps m4, m10, 10001000b + vshufps m4, m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 - shufps m5, m10, 10001000b + vshufps m5, m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 - shufps m5, m10, 10001000b + vshufps m5, m5, m10, 
10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 - shufps m6, m10, 10001000b + vshufps m6, m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 - shufps m6, m10, 10001000b + vshufps m6, m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 - shufps m7, m10, 10001000b + vshufps m7, m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 - shufps m7, m10, 10001000b + vshufps m7, m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 @@ -8738,102 +10147,92 @@ paddd m7, m6 paddd m11, m0, m7 - movu m0, [r2] - movu m1, [r2 + r3] - movu m2, [r2 + r3 * 2] - movu m3, [r2 + r6] - - pmaddwd m0, m14 - pmaddwd m1, m14 - pmaddwd m2, m14 - pmaddwd m3, m14 + pmaddwd m0, m12, [rsp + 0 * mmsize] + pmaddwd m1, m12, [rsp + 1 * mmsize] + pmaddwd m2, m12, [rsp + 2 * mmsize] + pmaddwd m3, m12, [rsp + 3 * mmsize] psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 - shufps m0, m10, 10001000b + vshufps m0, m0, m10, 10001000b psrldq m9, m0, 4 psubd m10, m0, m9 paddd m0, m9 - shufps m0, m10, 10001000b + vshufps m0, m0, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 - shufps m1, m10, 10001000b + vshufps m1, m1, m10, 10001000b psrldq m9, m1, 4 psubd m10, m1, m9 paddd m1, m9 - shufps m1, m10, 10001000b + vshufps m1, m1, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 - shufps m2, m10, 10001000b + vshufps m2, m2, m10, 10001000b psrldq m9, m2, 4 psubd m10, m2, m9 paddd m2, m9 - shufps m2, m10, 10001000b + vshufps m2, m2, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 - shufps m3, m10, 10001000b + vshufps m3, m3, m10, 10001000b psrldq m9, m3, 4 psubd m10, m3, m9 paddd m3, m9 - shufps m3, m10, 10001000b + vshufps m3, m3, m10, 10001000b SUMSUB_BA d, 0, 1, 9 SUMSUB_BA d, 2, 3, 9 SUMSUB_BA d, 0, 2, 9 SUMSUB_BA d, 1, 3, 9 - movu m4, [r5] - movu m5, [r5 + r3] - movu m6, [r5 + r3 * 2] - movu m7, [r5 + r6] - - pmaddwd m4, m14 - pmaddwd m5, m14 - pmaddwd m6, m14 - pmaddwd m7, m14 + pmaddwd m4, m12, [rsp + 4 * 
mmsize] + pmaddwd m5, m12, [rsp + 5 * mmsize] + pmaddwd m6, m12, [rsp + 6 * mmsize] + pmaddwd m7, m12, [rsp + 7 * mmsize] psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 - shufps m4, m10, 10001000b + vshufps m4, m4, m10, 10001000b psrldq m9, m4, 4 psubd m10, m4, m9 paddd m4, m9 - shufps m4, m10, 10001000b + vshufps m4, m4, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 - shufps m5, m10, 10001000b + vshufps m5, m5, m10, 10001000b psrldq m9, m5, 4 psubd m10, m5, m9 paddd m5, m9 - shufps m5, m10, 10001000b + vshufps m5, m5, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 - shufps m6, m10, 10001000b + vshufps m6, m6, m10, 10001000b psrldq m9, m6, 4 psubd m10, m6, m9 paddd m6, m9 - shufps m6, m10, 10001000b + vshufps m6, m6, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 - shufps m7, m10, 10001000b + vshufps m7, m7, m10, 10001000b psrldq m9, m7, 4 psubd m10, m7, m9 paddd m7, m9 - shufps m7, m10, 10001000b + vshufps m7, m7, m10, 10001000b SUMSUB_BA d, 4, 5, 9 SUMSUB_BA d, 6, 7, 9 @@ -8863,35 +10262,59 @@ paddd m0, m7 paddd m0, m11 - movhlps m1, m0 + psrldq m1, m0, 8 paddd m0, m1 psrldq m1, m0, 4 paddd m0, m1 paddd m0, [pd_2] psrld m0, 2 psubd m0, m8 - - psubd m12, m0 - pabsd m0, m12 - paddd m15, m0 + vextracti128 xm1, m0, 1 + psubd m0, m1 + pabsd m0, m0 %endmacro %if ARCH_X86_64 -INIT_XMM sse4 -cglobal psyCost_ss_16x16, 4, 9, 16 +INIT_YMM avx2 +cglobal psyCost_ss_8x8, 4, 7, 14 + ; NOTE: align stack to 64 bytes, so all of local data in same cache line + mov r5, rsp + sub rsp, 8*mmsize + and rsp, ~63 + mova m12, [pw_1] mova m13, [hmul_w] - mova m14, [pw_1] add r1, r1 add r3, r3 - lea r4, [3 * r1] - lea r6, [3 * r3] - pxor m15, m15 + + PSY_SS_8x8 + + movd eax, xm0 + mov rsp, r5 + RET +%endif + +%if ARCH_X86_64 +INIT_YMM avx2 +cglobal psyCost_ss_16x16, 4, 9, 15 + ; NOTE: align stack to 64 bytes, so all of local data in same cache line + mov r5, rsp + sub rsp, 8*mmsize + and rsp, ~63 + + mova m12, [pw_1] + mova m13, [hmul_w] + 
add r1, r1 + add r3, r3 + pxor m14, m14 + mov r7d, 2 .loopH: mov r8d, 2 .loopW: - psy_cost_ss + PSY_SS_8x8 + + paddd m14, m0 add r0, 16 add r2, 16 dec r8d @@ -8900,26 +10323,32 @@ lea r2, [r2 + r3 * 8 - 32] dec r7d jnz .loopH - movd eax, m15 + movd eax, xm14 + mov rsp, r5 RET %endif %if ARCH_X86_64 -INIT_XMM sse4 -cglobal psyCost_ss_32x32, 4, 9, 16 +INIT_YMM avx2 +cglobal psyCost_ss_32x32, 4, 9, 15 + ; NOTE: align stack to 64 bytes, so all of local data in same cache line + mov r5, rsp + sub rsp, 8*mmsize + and rsp, ~63 + mova m12, [pw_1] mova m13, [hmul_w] - mova m14, [pw_1] add r1, r1 add r3, r3 - lea r4, [3 * r1] - lea r6, [3 * r3] - pxor m15, m15 + pxor m14, m14 + mov r7d, 4 .loopH: mov r8d, 4 .loopW: - psy_cost_ss + PSY_SS_8x8 + + paddd m14, m0 add r0, 16 add r2, 16 dec r8d @@ -8928,26 +10357,32 @@ lea r2, [r2 + r3 * 8 - 64] dec r7d jnz .loopH - movd eax, m15 + movd eax, xm14 + mov rsp, r5 RET %endif %if ARCH_X86_64 -INIT_XMM sse4 -cglobal psyCost_ss_64x64, 4, 9, 16 +INIT_YMM avx2 +cglobal psyCost_ss_64x64, 4, 9, 15 + ; NOTE: align stack to 64 bytes, so all of local data in same cache line + mov r5, rsp + sub rsp, 8*mmsize + and rsp, ~63 + mova m12, [pw_1] mova m13, [hmul_w] - mova m14, [pw_1] add r1, r1 add r3, r3 - lea r4, [3 * r1] - lea r6, [3 * r3] - pxor m15, m15 + pxor m14, m14 + mov r7d, 8 .loopH: mov r8d, 8 .loopW: - psy_cost_ss + PSY_SS_8x8 + + paddd m14, m0 add r0, 16 add r2, 16 dec r8d @@ -8956,6 +10391,7 @@ lea r2, [r2 + r3 * 8 - 128] dec r7d jnz .loopH - movd eax, m15 + movd eax, xm14 + mov rsp, r5 RET %endif diff -Nru x265-1.5/source/common/x86/pixeladd8.asm x265-1.6/source/common/x86/pixeladd8.asm --- x265-1.5/source/common/x86/pixeladd8.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/pixeladd8.asm 2015-04-02 16:46:36.000000000 +0000 @@ -398,6 +398,52 @@ jnz .loop RET + +INIT_YMM avx2 +cglobal pixel_add_ps_16x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/4 + add r5, r5 +.loop: + + pmovzxbw 
m0, [r2] ; row 0 of src0 + pmovzxbw m1, [r2 + r4] ; row 1 of src0 + movu m2, [r3] ; row 0 of src1 + movu m3, [r3 + r5] ; row 1 of src1 + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + pmovzxbw m2, [r2] ; row 2 of src0 + pmovzxbw m3, [r2 + r4] ; row 3 of src0 + movu m4, [r3] ; row 2 of src1 + movu m5, [r3 + r5] ; row 3 of src1 + paddw m2, m4 + paddw m3, m5 + packuswb m2, m3 + + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + vpermq m0, m0, 11011000b + movu [r0], xm0 ; row 0 of dst + vextracti128 xm3, m0, 1 + movu [r0 + r1], xm3 ; row 1 of dst + + lea r0, [r0 + r1 * 2] + vpermq m2, m2, 11011000b + movu [r0], xm2 ; row 2 of dst + vextracti128 xm3, m2, 1 + movu [r0 + r1], xm3 ; row 3 of dst + + lea r0, [r0 + r1 * 2] + + dec r6d + jnz .loop + + RET %endif %endmacro @@ -523,6 +569,67 @@ jnz .loop RET + +INIT_YMM avx2 +cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/4 + add r5, r5 +.loop: + pmovzxbw m0, [r2] ; first half of row 0 of src0 + pmovzxbw m1, [r2 + 16] ; second half of row 0 of src0 + movu m2, [r3] ; first half of row 0 of src1 + movu m3, [r3 + 32] ; second half of row 0 of src1 + + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r0], m0 ; row 0 of dst + + pmovzxbw m0, [r2 + r4] ; first half of row 1 of src0 + pmovzxbw m1, [r2 + r4 + 16] ; second half of row 1 of src0 + movu m2, [r3 + r5] ; first half of row 1 of src1 + movu m3, [r3 + r5 + 32] ; second half of row 1 of src1 + + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r0 + r1], m0 ; row 1 of dst + + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + lea r0, [r0 + r1 * 2] + + pmovzxbw m0, [r2] ; first half of row 2 of src0 + pmovzxbw m1, [r2 + 16] ; second half of row 2 of src0 + movu m2, [r3] ; first half of row 2 of src1 + movu m3, [r3 + 32] ; second half of row 2 of src1 + + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, 
m0, 11011000b + movu [r0], m0 ; row 2 of dst + + pmovzxbw m0, [r2 + r4] ; first half of row 3 of src0 + pmovzxbw m1, [r2 + r4 + 16] ; second half of row 3 of src0 + movu m2, [r3 + r5] ; first half of row 3 of src1 + movu m3, [r3 + r5 + 32] ; second half of row 3 of src1 + + paddw m0, m2 + paddw m1, m3 + packuswb m0, m1 + vpermq m0, m0, 11011000b + movu [r0 + r1], m0 ; row 3 of dst + + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + lea r0, [r0 + r1 * 2] + + dec r6d + jnz .loop + RET %endif %endmacro @@ -734,6 +841,60 @@ jnz .loop RET + +INIT_YMM avx2 +cglobal pixel_add_ps_64x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1 + mov r6d, %2/2 + add r5, r5 +.loop: + pmovzxbw m0, [r2] ; first 16 of row 0 of src0 + pmovzxbw m1, [r2 + 16] ; second 16 of row 0 of src0 + pmovzxbw m2, [r2 + 32] ; third 16 of row 0 of src0 + pmovzxbw m3, [r2 + 48] ; forth 16 of row 0 of src0 + movu m4, [r3] ; first 16 of row 0 of src1 + movu m5, [r3 + 32] ; second 16 of row 0 of src1 + movu m6, [r3 + 64] ; third 16 of row 0 of src1 + movu m7, [r3 + 96] ; forth 16 of row 0 of src1 + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b + movu [r0], m0 ; first 32 of row 0 of dst + vpermq m2, m2, 11011000b + movu [r0 + 32], m2 ; second 32 of row 0 of dst + + pmovzxbw m0, [r2 + r4] ; first 16 of row 1 of src0 + pmovzxbw m1, [r2 + r4 + 16] ; second 16 of row 1 of src0 + pmovzxbw m2, [r2 + r4 + 32] ; third 16 of row 1 of src0 + pmovzxbw m3, [r2 + r4 + 48] ; forth 16 of row 1 of src0 + movu m4, [r3 + r5] ; first 16 of row 1 of src1 + movu m5, [r3 + r5 + 32] ; second 16 of row 1 of src1 + movu m6, [r3 + r5 + 64] ; third 16 of row 1 of src1 + movu m7, [r3 + r5 + 96] ; forth 16 of row 1 of src1 + + paddw m0, m4 + paddw m1, m5 + paddw m2, m6 + paddw m3, m7 + packuswb m0, m1 + packuswb m2, m3 + vpermq m0, m0, 11011000b + movu [r0 + r1], m0 ; first 32 of row 1 of dst + vpermq m2, m2, 11011000b + movu [r0 + r1 + 32], m2 ; 
second 32 of row 1 of dst + + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + lea r0, [r0 + r1 * 2] + + dec r6d + jnz .loop + RET + %endif %endmacro diff -Nru x265-1.5/source/common/x86/pixel.h x265-1.6/source/common/x86/pixel.h --- x265-1.5/source/common/x86/pixel.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/pixel.h 2015-04-02 16:46:36.000000000 +0000 @@ -103,6 +103,13 @@ DECL_X1(satd, avx) DECL_X1(satd, xop) DECL_X1(satd, avx2) +int x265_pixel_satd_16x24_avx(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_32x48_avx(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_24x64_avx(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_8x64_avx(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_8x12_avx(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_12x32_avx(const pixel*, intptr_t, const pixel*, intptr_t); +int x265_pixel_satd_4x32_avx(const pixel*, intptr_t, const pixel*, intptr_t); int x265_pixel_satd_8x32_sse2(const pixel*, intptr_t, const pixel*, intptr_t); int x265_pixel_satd_16x4_sse2(const pixel*, intptr_t, const pixel*, intptr_t); int x265_pixel_satd_16x12_sse2(const pixel*, intptr_t, const pixel*, intptr_t); @@ -170,10 +177,12 @@ int x265_pixel_ssd_s_8_sse2(const int16_t*, intptr_t); int x265_pixel_ssd_s_16_sse2(const int16_t*, intptr_t); int x265_pixel_ssd_s_32_sse2(const int16_t*, intptr_t); +int x265_pixel_ssd_s_16_avx2(const int16_t*, intptr_t); int x265_pixel_ssd_s_32_avx2(const int16_t*, intptr_t); #define ADDAVG(func) \ - void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); + void x265_ ## func ## _sse4(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \ + void x265_ ## func ## _avx2(const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); ADDAVG(addAvg_2x4) ADDAVG(addAvg_2x8) ADDAVG(addAvg_4x2); @@ -228,6 +237,41 @@ int x265_psyCost_ss_16x16_sse4(const 
int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); int x265_psyCost_ss_32x32_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); int x265_psyCost_ss_64x64_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); +void x265_pixel_avg_16x4_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_16x8_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_16x12_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_16x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_16x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_16x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_32x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_32x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_32x24_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_32x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_32x8_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_64x64_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* 
src1, intptr_t sstride1, int); +void x265_pixel_avg_64x48_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_64x32_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); +void x265_pixel_avg_64x16_avx2(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int); + +void x265_pixel_add_ps_16x16_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); +void x265_pixel_add_ps_32x32_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); +void x265_pixel_add_ps_64x64_avx2(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1); + +void x265_pixel_sub_ps_16x16_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); +void x265_pixel_sub_ps_32x32_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); +void x265_pixel_sub_ps_64x64_avx2(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); + +int x265_psyCost_pp_4x4_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); +int x265_psyCost_pp_8x8_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); +int x265_psyCost_pp_16x16_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); +int x265_psyCost_pp_32x32_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); +int x265_psyCost_pp_64x64_avx2(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride); + +int x265_psyCost_ss_4x4_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); +int x265_psyCost_ss_8x8_avx2(const 
int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); +int x265_psyCost_ss_16x16_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); +int x265_psyCost_ss_32x32_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); +int x265_psyCost_ss_64x64_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride); #undef DECL_PIXELS #undef DECL_HEVC_SSD diff -Nru x265-1.5/source/common/x86/pixel-util8.asm x265-1.6/source/common/x86/pixel-util8.asm --- x265-1.5/source/common/x86/pixel-util8.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/pixel-util8.asm 2015-04-02 16:46:36.000000000 +0000 @@ -3,6 +3,7 @@ ;* ;* Authors: Min Chen ;* Nabajit Deka +;* Rajesh Paulraj ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -63,6 +64,12 @@ cextern pd_1 cextern pd_32767 cextern pd_n32768 +cextern pb_2 +cextern pb_4 +cextern pb_8 +cextern pb_16 +cextern pb_32 +cextern pb_64 ;----------------------------------------------------------------------------- ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) @@ -95,9 +102,9 @@ punpcklqdq m0, m1 punpcklqdq m2, m3 psubw m0, m2 - movh [r2], m0 movhps [r2 + r3], m0 + RET %else cglobal getResidual4, 4,4,5 pxor m0, m0 @@ -130,8 +137,8 @@ psubw m1, m3 movh [r2], m1 movhps [r2 + r3 * 2], m1 -%endif RET +%endif INIT_XMM sse2 @@ -157,6 +164,7 @@ lea r2, [r2 + r3 * 2] %endif %endrep + RET %else cglobal getResidual8, 4,4,5 pxor m0, m0 @@ -183,8 +191,9 @@ lea r2, [r2 + r3 * 4] %endif %endrep -%endif RET +%endif + %if HIGH_BIT_DEPTH INIT_XMM sse2 @@ -238,10 +247,9 @@ lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] - jnz .loop + RET %else - INIT_XMM sse4 cglobal getResidual16, 4,5,8 mov r4d, 16/4 @@ -302,11 +310,67 @@ lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] - 
jnz .loop + RET %endif +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +cglobal getResidual16, 4,4,5 + add r3, r3 + pxor m0, m0 + +%assign x 0 +%rep 16/2 + movu m1, [r0] + movu m2, [r0 + r3] + movu m3, [r1] + movu m4, [r1 + r3] + + psubw m1, m3 + psubw m2, m4 + movu [r2], m1 + movu [r2 + r3], m2 +%assign x x+1 +%if (x != 8) + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 2] +%endif +%endrep RET +%else +INIT_YMM avx2 +cglobal getResidual16, 4,5,8 + lea r4, [r3 * 2] + add r4d, r3d +%assign x 0 +%rep 4 + pmovzxbw m0, [r0] + pmovzxbw m1, [r0 + r3] + pmovzxbw m2, [r0 + r3 * 2] + pmovzxbw m3, [r0 + r4] + pmovzxbw m4, [r1] + pmovzxbw m5, [r1 + r3] + pmovzxbw m6, [r1 + r3 * 2] + pmovzxbw m7, [r1 + r4] + psubw m0, m4 + psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + movu [r2], m0 + movu [r2 + r3 * 2], m1 + movu [r2 + r3 * 2 * 2], m2 + movu [r2 + r4 * 2], m3 +%assign x x+1 +%if (x != 4) + lea r0, [r0 + r3 * 2 * 2] + lea r1, [r1 + r3 * 2 * 2] + lea r2, [r2 + r3 * 4 * 2] +%endif +%endrep + RET +%endif %if HIGH_BIT_DEPTH INIT_XMM sse2 @@ -357,9 +421,8 @@ lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 2] - jnz .loop - + RET %else INIT_XMM sse4 cglobal getResidual32, 4,5,7 @@ -415,12 +478,70 @@ lea r0, [r0 + r3 * 2] lea r1, [r1 + r3 * 2] lea r2, [r2 + r3 * 4] - jnz .loop + RET +%endif + + +%if HIGH_BIT_DEPTH +INIT_YMM avx2 +cglobal getResidual32, 4,4,5 + add r3, r3 + pxor m0, m0 + +%assign x 0 +%rep 32 + movu m1, [r0] + movu m2, [r0 + 32] + movu m3, [r1] + movu m4, [r1 + 32] + + psubw m1, m3 + psubw m2, m4 + movu [r2], m1 + movu [r2 + 32], m2 +%assign x x+1 +%if (x != 32) + lea r0, [r0 + r3] + lea r1, [r1 + r3] + lea r2, [r2 + r3] %endif +%endrep RET +%else +INIT_YMM avx2 +cglobal getResidual32, 4,5,8 + lea r4, [r3 * 2] +%assign x 0 +%rep 16 + pmovzxbw m0, [r0] + pmovzxbw m1, [r0 + 16] + pmovzxbw m2, [r0 + r3] + pmovzxbw m3, [r0 + r3 + 16] + + pmovzxbw m4, [r1] + pmovzxbw m5, [r1 + 16] + pmovzxbw m6, [r1 + r3] + pmovzxbw m7, [r1 + r3 + 16] + psubw m0, m4 + 
psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + movu [r2 + 0 ], m0 + movu [r2 + 32], m1 + movu [r2 + r4 + 0], m2 + movu [r2 + r4 + 32], m3 +%assign x x+1 +%if (x != 16) + lea r0, [r0 + r3 * 2] + lea r1, [r1 + r3 * 2] + lea r2, [r2 + r3 * 4] +%endif +%endrep + RET +%endif ;----------------------------------------------------------------------------- ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); ;----------------------------------------------------------------------------- @@ -485,15 +606,14 @@ dec r4d jnz .loop - pxor m0, m0 - psadbw m7, m0 - movhlps m0, m7 - paddd m7, m0 - movd eax, m7 + pshufd m0, m7, 00001110b + paddd m0, m7 + pshufd m1, m0, 00000001b + paddd m0, m1 + movd eax, m0 RET -IACA_START %if ARCH_X86_64 == 1 INIT_YMM avx2 cglobal quant, 5,5,10 @@ -640,7 +760,6 @@ movd eax, xm7 RET %endif ; ARCH_X86_64 == 1 -IACA_END ;----------------------------------------------------------------------------- @@ -829,28 +948,188 @@ ;----------------------------------------------------------------------------- -; int count_nonzero(const int16_t *quantCoeff, int numCoeff); +; int x265_count_nonzero_4x4_ssse3(const int16_t *quantCoeff); ;----------------------------------------------------------------------------- INIT_XMM ssse3 -cglobal count_nonzero, 2,2,3 - pxor m0, m0 - shr r1d, 4 - movd m1, r1d - pshufb m1, m0 +cglobal count_nonzero_4x4, 1,1,2 + pxor m0, m0 -.loop: - mova m2, [r0 + 0] - packsswb m2, [r0 + 16] - add r0, 32 - pcmpeqb m2, m0 - paddb m1, m2 - dec r1d - jnz .loop + mova m1, [r0 + 0] + packsswb m1, [r0 + 16] + pcmpeqb m1, m0 + paddb m1, [pb_1] + + psadbw m1, m0 + pshufd m0, m1, 2 + paddd m0, m1 + movd eax, m0 + RET - psadbw m1, m0 - pshufd m0, m1, 2 - paddd m0, m1 - movd eax, m0 + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_4x4_avx2(const int16_t *quantCoeff); 
+;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal count_nonzero_4x4, 1,1,2 + pxor m0, m0 + + mova m1, [r0 + 0] + packsswb m1, [r0 + 16] + pcmpeqb m1, m0 + paddb m1, [pb_1] + + psadbw m1, m0 + pshufd m0, m1, 2 + paddd m1, m0 + movd eax, xm1 + RET + + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_8x8_ssse3(const int16_t *quantCoeff); +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal count_nonzero_8x8, 1,1,3 + pxor m0, m0 + movu m1, [pb_4] + +%rep 4 + mova m2, [r0 + 0] + packsswb m2, [r0 + 16] + add r0, 32 + pcmpeqb m2, m0 + paddb m1, m2 +%endrep + + psadbw m1, m0 + pshufd m0, m1, 2 + paddd m0, m1 + movd eax, m0 + RET + + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_8x8_avx2(const int16_t *quantCoeff); +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal count_nonzero_8x8, 1,1,3 + pxor m0, m0 + movu m1, [pb_2] + + mova m2, [r0] + packsswb m2, [r0 + 32] + pcmpeqb m2, m0 + paddb m1, m2 + + mova m2, [r0 + 64] + packsswb m2, [r0 + 96] + pcmpeqb m2, m0 + paddb m1, m2 + + psadbw m1, m0 + vextracti128 xm0, m1, 1 + paddd m0, m1 + pshufd m1, m0, 2 + paddd m0, m1 + movd eax, xm0 + RET + + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_16x16_ssse3(const int16_t *quantCoeff); +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal count_nonzero_16x16, 1,1,3 + pxor m0, m0 + movu m1, [pb_16] + +%rep 16 + mova m2, [r0 + 0] + packsswb m2, [r0 + 16] + add r0, 32 + pcmpeqb m2, m0 + paddb m1, m2 +%endrep + + psadbw m1, m0 + pshufd m0, m1, 2 + paddd m0, m1 + movd eax, m0 + RET + + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_16x16_avx2(const 
int16_t *quantCoeff); +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal count_nonzero_16x16, 1,1,3 + pxor m0, m0 + movu m1, [pb_8] + +%assign x 0 +%rep 8 + mova m2, [r0 + x] + packsswb m2, [r0 + x + 32] +%assign x x+64 + pcmpeqb m2, m0 + paddb m1, m2 +%endrep + + psadbw m1, m0 + vextracti128 xm0, m1, 1 + paddd m0, m1 + pshufd m1, m0, 2 + paddd m0, m1 + movd eax, xm0 + RET + + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_32x32_ssse3(const int16_t *quantCoeff); +;----------------------------------------------------------------------------- +INIT_XMM ssse3 +cglobal count_nonzero_32x32, 1,1,3 + pxor m0, m0 + movu m1, [pb_64] + +%rep 64 + mova m2, [r0 + 0] + packsswb m2, [r0 + 16] + add r0, 32 + pcmpeqb m2, m0 + paddb m1, m2 +%endrep + + psadbw m1, m0 + pshufd m0, m1, 2 + paddd m0, m1 + movd eax, m0 + RET + + +;----------------------------------------------------------------------------- +; int x265_count_nonzero_32x32_avx2(const int16_t *quantCoeff); +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal count_nonzero_32x32, 1,1,3 + pxor m0, m0 + movu m1, [pb_32] + +%assign x 0 +%rep 32 + mova m2, [r0 + x] + packsswb m2, [r0 + x + 32] +%assign x x+64 + pcmpeqb m2, m0 + paddb m1, m2 +%endrep + + psadbw m1, m0 + vextracti128 xm0, m1, 1 + paddd m0, m1 + pshufd m1, m0, 2 + paddd m0, m1 + movd eax, xm0 RET @@ -4056,6 +4335,44 @@ ;----------------------------------------------------------------------------- +; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal pixel_sub_ps_16x16, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1 + add r1, r1 + lea r6, [r1 * 3] + +%rep 4 + pmovzxbw m0, [r2] + pmovzxbw m1, [r3] + pmovzxbw m2, 
[r2 + r4] + pmovzxbw m3, [r3 + r5] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + psubw m0, m1 + psubw m2, m3 + + movu [r0], m0 + movu [r0 + r1], m2 + + pmovzxbw m0, [r2] + pmovzxbw m1, [r3] + pmovzxbw m2, [r2 + r4] + pmovzxbw m3, [r3 + r5] + + psubw m0, m1 + psubw m2, m3 + + movu [r0 + r1 * 2], m0 + movu [r0 + r6], m2 + + lea r0, [r0 + r1 * 4] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] +%endrep + RET +;----------------------------------------------------------------------------- ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W32_H2 2 @@ -4190,6 +4507,136 @@ ;----------------------------------------------------------------------------- +; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal pixel_sub_ps_32x32, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, 4 + add r1, r1 + +.loop: + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 16] + pmovzxbw m2, [r3] + pmovzxbw m3, [r3 + 16] + + psubw m0, m2 + psubw m1, m3 + + movu [r0], m0 + movu [r0 + 32], m1 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + psubw m0, m2 + psubw m1, m3 + + movu [r0 + r1], m0 + movu [r0 + r1 + 32], m1 + + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + psubw m0, m2 + psubw m1, m3 + lea r0, [r0 + r1 * 2] + + movu [r0 ], m0 + movu [r0 + 32], m1 + + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + + psubw m0, m2 + psubw m1, m3 + add r0, r1 + + movu [r0 ], m0 + movu [r0 + 32], 
m1 + + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + psubw m0, m2 + psubw m1, m3 + add r0, r1 + + movu [r0 ], m0 + movu [r0 + 32], m1 + + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + psubw m0, m2 + psubw m1, m3 + add r0, r1 + + movu [r0 ], m0 + movu [r0 + 32], m1 + + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + psubw m0, m2 + psubw m1, m3 + add r0, r1 + + movu [r0 ], m0 + movu [r0 + 32], m1 + + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2 + r4] + pmovzxbw m1, [r2 + r4 + 16] + pmovzxbw m2, [r3 + r5] + pmovzxbw m3, [r3 + r5 + 16] + + psubw m0, m2 + psubw m1, m3 + add r0, r1 + + movu [r0 ], m0 + movu [r0 + 32], m1 + + lea r0, [r0 + r1] + lea r2, [r2 + r4 * 2] + lea r3, [r3 + r5 * 2] + + dec r6d + jnz .loop + RET + +;----------------------------------------------------------------------------- ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); ;----------------------------------------------------------------------------- %macro PIXELSUB_PS_W64_H2 2 @@ -4408,6 +4855,115 @@ %endif +;----------------------------------------------------------------------------- +; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;----------------------------------------------------------------------------- +INIT_YMM avx2 +cglobal pixel_sub_ps_64x64, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1 + mov r6d, 16 + add r1, r1 + +.loop: + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 16] + pmovzxbw m2, [r2 + 32] + pmovzxbw m3, [r2 + 48] + + pmovzxbw m4, [r3] + pmovzxbw m5, [r3 + 16] + pmovzxbw m6, [r3 + 32] + pmovzxbw m7, [r3 + 48] + + psubw m0, m4 + 
psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + 64], m2 + movu [r0 + 96], m3 + + add r0, r1 + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 16] + pmovzxbw m2, [r2 + 32] + pmovzxbw m3, [r2 + 48] + + pmovzxbw m4, [r3] + pmovzxbw m5, [r3 + 16] + pmovzxbw m6, [r3 + 32] + pmovzxbw m7, [r3 + 48] + + psubw m0, m4 + psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + 64], m2 + movu [r0 + 96], m3 + + add r0, r1 + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 16] + pmovzxbw m2, [r2 + 32] + pmovzxbw m3, [r2 + 48] + + pmovzxbw m4, [r3] + pmovzxbw m5, [r3 + 16] + pmovzxbw m6, [r3 + 32] + pmovzxbw m7, [r3 + 48] + + psubw m0, m4 + psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + 64], m2 + movu [r0 + 96], m3 + + add r0, r1 + add r2, r4 + add r3, r5 + + pmovzxbw m0, [r2] + pmovzxbw m1, [r2 + 16] + pmovzxbw m2, [r2 + 32] + pmovzxbw m3, [r2 + 48] + + pmovzxbw m4, [r3] + pmovzxbw m5, [r3 + 16] + pmovzxbw m6, [r3 + 32] + pmovzxbw m7, [r3 + 48] + + psubw m0, m4 + psubw m1, m5 + psubw m2, m6 + psubw m3, m7 + + movu [r0], m0 + movu [r0 + 32], m1 + movu [r0 + 64], m2 + movu [r0 + 96], m3 + + add r0, r1 + add r2, r4 + add r3, r5 + + dec r6d + jnz .loop + RET + ;============================================================================= ; variance ;============================================================================= @@ -4831,3 +5387,71 @@ RET %endmacro +;int x265_test_func(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig) +;{ +; int scanPosLast = 0; +; do +; { +; const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE; +; +; const uint32_t posLast = scan[scanPosLast++]; +; +; const int curCoeff = coeff[posLast]; +; const uint32_t isNZCoeff = (curCoeff != 0); +; numSig -= isNZCoeff; +; +; coeffSign[cgIdx] += 
(uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]); +; coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff; +; coeffNum[cgIdx] += (uint8_t)isNZCoeff; +; } +; while (numSig > 0); +; return scanPosLast - 1; +;} + +%if ARCH_X86_64 == 1 +INIT_CPUFLAGS +cglobal findPosLast_x64, 5,12 + mov r5d, r5m + xor r11d, r11d ; cgIdx + xor r7d, r7d ; tmp for non-zero flag + +.loop: + xor r8d, r8d ; coeffSign[] + xor r9d, r9d ; coeffFlag[] + xor r10d, r10d ; coeffNum[] + +%assign x 0 +%rep 16 + movzx r6d, word [r0 + x * 2] + movsx r6d, word [r1 + r6 * 2] + test r6d, r6d + setnz r7b + shr r6d, 31 + shlx r6d, r6d, r10d + or r8d, r6d + lea r9, [r9 * 2 + r7] + add r10d, r7d +%assign x x+1 +%endrep + + ; store latest group data + mov [r2 + r11 * 2], r8w + mov [r3 + r11 * 2], r9w + mov [r4 + r11], r10b + inc r11d + + add r0, 16 * 2 + sub r5d, r10d + jnz .loop + + ; store group data + tzcnt r6d, r9d + shrx r9d, r9d, r6d + mov [r3 + (r11 - 1) * 2], r9w + + ; get posLast + shl r11d, 4 + sub r11d, r6d + lea eax, [r11d - 1] + RET +%endif diff -Nru x265-1.5/source/common/x86/pixel-util.h x265-1.6/source/common/x86/pixel-util.h --- x265-1.5/source/common/x86/pixel-util.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/pixel-util.h 2015-04-02 16:46:36.000000000 +0000 @@ -30,6 +30,8 @@ void x265_getResidual16_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); void x265_getResidual32_sse2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); void x265_getResidual32_sse4(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void x265_getResidual16_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); +void x265_getResidual32_avx2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride); void x265_transpose4_sse2(pixel* dest, const pixel* src, intptr_t stride); void x265_transpose8_sse2(pixel* dest, const pixel* src, intptr_t stride); @@ -48,7 
+50,15 @@ uint32_t x265_nquant_avx2(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff); void x265_dequant_normal_sse4(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); void x265_dequant_normal_avx2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift); -int x265_count_nonzero_ssse3(const int16_t* quantCoeff, int numCoeff); + +int x265_count_nonzero_4x4_ssse3(const int16_t* quantCoeff); +int x265_count_nonzero_8x8_ssse3(const int16_t* quantCoeff); +int x265_count_nonzero_16x16_ssse3(const int16_t* quantCoeff); +int x265_count_nonzero_32x32_ssse3(const int16_t* quantCoeff); +int x265_count_nonzero_4x4_avx2(const int16_t* quantCoeff); +int x265_count_nonzero_8x8_avx2(const int16_t* quantCoeff); +int x265_count_nonzero_16x16_avx2(const int16_t* quantCoeff); +int x265_count_nonzero_32x32_avx2(const int16_t* quantCoeff); void x265_weight_pp_sse4(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); void x265_weight_pp_avx2(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset); @@ -67,6 +77,8 @@ void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t); void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t); +int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig); + #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \ void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \ void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1); diff -Nru x265-1.5/source/common/x86/sad-a.asm x265-1.6/source/common/x86/sad-a.asm --- x265-1.5/source/common/x86/sad-a.asm 2015-02-10 
21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/sad-a.asm 2015-04-02 16:46:36.000000000 +0000 @@ -3710,3 +3710,749 @@ SADX34_CACHELINE_FUNC 16, 16, 64, sse2, ssse3, ssse3 SADX34_CACHELINE_FUNC 16, 8, 64, sse2, ssse3, ssse3 +%if HIGH_BIT_DEPTH==0 +INIT_YMM avx2 +cglobal pixel_sad_x3_8x4, 6,6,5 + xorps m0, m0 + xorps m1, m1 + + sub r2, r1 ; rebase on pointer r1 + sub r3, r1 + + ; row 0 + vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + add r1, r4 + + ; row 1 + vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + add r1, r4 + + ; row 2 + vpbroadcastq xm2, [r0 + 2 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + add r1, r4 + + ; row 3 + vpbroadcastq xm2, [r0 + 3 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + + pshufd xm0, xm0, q0020 + movq [r5 + 0], xm0 + movd [r5 + 8], xm1 + RET + +INIT_YMM avx2 +cglobal pixel_sad_x3_8x8, 6,6,5 + xorps m0, m0 + xorps m1, m1 + + sub r2, r1 ; rebase on pointer r1 + sub r3, r1 +%assign x 0 +%rep 4 + ; row 0 + vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + add r1, r4 + + ; row 1 + vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + +%assign x x+1 + %if x < 4 + add r1, r4 + add r0, 2 * FENC_STRIDE + %endif +%endrep + + pshufd xm0, xm0, q0020 + movq [r5 + 0], xm0 + movd [r5 + 8], xm1 + RET + +INIT_YMM avx2 +cglobal pixel_sad_x3_8x16, 6,6,5 
+ xorps m0, m0 + xorps m1, m1 + + sub r2, r1 ; rebase on pointer r1 + sub r3, r1 +%assign x 0 +%rep 8 + ; row 0 + vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + add r1, r4 + + ; row 1 + vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + +%assign x x+1 + %if x < 8 + add r1, r4 + add r0, 2 * FENC_STRIDE + %endif +%endrep + + pshufd xm0, xm0, q0020 + movq [r5 + 0], xm0 + movd [r5 + 8], xm1 + RET + +INIT_YMM avx2 +cglobal pixel_sad_x4_8x8, 7,7,5 + xorps m0, m0 + xorps m1, m1 + + sub r2, r1 ; rebase on pointer r1 + sub r3, r1 + sub r4, r1 +%assign x 0 +%rep 4 + ; row 0 + vpbroadcastq xm2, [r0 + 0 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + movhps xm4, [r1 + r4] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + add r1, r5 + + ; row 1 + vpbroadcastq xm2, [r0 + 1 * FENC_STRIDE] + movq xm3, [r1] + movhps xm3, [r1 + r2] + movq xm4, [r1 + r3] + movhps xm4, [r1 + r4] + psadbw xm3, xm2 + psadbw xm4, xm2 + paddd xm0, xm3 + paddd xm1, xm4 + +%assign x x+1 + %if x < 4 + add r1, r5 + add r0, 2 * FENC_STRIDE + %endif +%endrep + + pshufd xm0, xm0, q0020 + pshufd xm1, xm1, q0020 + movq [r6 + 0], xm0 + movq [r6 + 8], xm1 + RET + +INIT_YMM avx2 +cglobal pixel_sad_32x8, 4,4,6 + xorps m0, m0 + xorps m5, m5 + + movu m1, [r0] ; row 0 of pix0 + movu m2, [r2] ; row 0 of pix1 + movu m3, [r0 + r1] ; row 1 of pix0 + movu m4, [r2 + r3] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; row 2 of pix0 + movu m2, [r2] ; row 2 of pix1 + movu m3, [r0 + r1] ; row 3 of pix0 + movu m4, [r2 + r3] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + 
lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; row 4 of pix0 + movu m2, [r2] ; row 4 of pix1 + movu m3, [r0 + r1] ; row 5 of pix0 + movu m4, [r2 + r3] ; row 5 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; row 6 of pix0 + movu m2, [r2] ; row 6 of pix1 + movu m3, [r0 + r1] ; row 7 of pix0 + movu m4, [r2 + r3] ; row 7 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + paddd m0, m5 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_32x16, 4,5,6 + xorps m0, m0 + xorps m5, m5 + mov r4d, 4 + +.loop + movu m1, [r0] ; row 0 of pix0 + movu m2, [r2] ; row 0 of pix1 + movu m3, [r0 + r1] ; row 1 of pix0 + movu m4, [r2 + r3] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; row 2 of pix0 + movu m2, [r2] ; row 2 of pix1 + movu m3, [r0 + r1] ; row 3 of pix0 + movu m4, [r2 + r3] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + dec r4d + jnz .loop + + paddd m0, m5 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_32x24, 4,5,6 + xorps m0, m0 + xorps m5, m5 + mov r4d, 6 +.loop + movu m1, [r0] ; row 0 of pix0 + movu m2, [r2] ; row 0 of pix1 + movu m3, [r0 + r1] ; row 1 of pix0 + movu m4, [r2 + r3] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; row 2 of pix0 + movu m2, [r2] ; row 2 of pix1 + movu m3, [r0 + r1] ; row 3 of pix0 + movu m4, [r2 + r3] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + dec r4d + jnz .loop + + paddd m0, m5 + 
vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_32x32, 4,7,5 + xorps m0, m0 + mov r4d, 32/4 + lea r5, [r1 * 3] + lea r6, [r3 * 3] + +.loop + movu m1, [r0] ; row 0 of pix0 + movu m2, [r2] ; row 0 of pix1 + movu m3, [r0 + r1] ; row 1 of pix0 + movu m4, [r2 + r3] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [r0 + 2 * r1] ; row 2 of pix0 + movu m2, [r2 + 2 * r3] ; row 2 of pix1 + movu m3, [r0 + r5] ; row 3 of pix0 + movu m4, [r2 + r6] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + + INIT_YMM avx2 +cglobal pixel_sad_32x64, 4,7,5 + xorps m0, m0 + mov r4d, 64/8 + lea r5, [r1 * 3] + lea r6, [r3 * 3] + +.loop + movu m1, [r0] ; row 0 of pix0 + movu m2, [r2] ; row 0 of pix1 + movu m3, [r0 + r1] ; row 1 of pix0 + movu m4, [r2 + r3] ; row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [r0 + 2 * r1] ; row 2 of pix0 + movu m2, [r2 + 2 * r3] ; row 2 of pix1 + movu m3, [r0 + r5] ; row 3 of pix0 + movu m4, [r2 + r6] ; row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + + movu m1, [r0] ; row 4 of pix0 + movu m2, [r2] ; row 4 of pix1 + movu m3, [r0 + r1] ; row 5 of pix0 + movu m4, [r2 + r3] ; row 5 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + movu m1, [r0 + 2 * r1] ; row 6 of pix0 + movu m2, [r2 + 2 * r3] ; row 6 of pix1 + movu m3, [r0 + r5] ; row 7 of pix0 + movu m4, [r2 + r6] ; row 7 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m0, m3 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + 
paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_48x64, 4,7,7 + xorps m0, m0 + mov r4d, 64/4 + lea r5, [r1 * 3] + lea r6, [r3 * 3] +.loop + movu m1, [r0] ; row 0 of pix0 + movu m2, [r2] ; row 0 of pix1 + movu m3, [r0 + r1] ; row 1 of pix0 + movu m4, [r2 + r3] ; row 1 of pix1 + movu xm5, [r0 +32] ; last 16 of row 0 of pix0 + vinserti128 m5, m5, [r0 + r1 + 32], 1 + movu xm6, [r2 +32] ; last 16 of row 0 of pix1 + vinserti128 m6, m6, [r2 + r3 + 32], 1 + + psadbw m1, m2 + psadbw m3, m4 + psadbw m5, m6 + paddd m0, m1 + paddd m0, m3 + paddd m0, m5 + + movu m1, [r0 + 2 * r1] ; row 2 of pix0 + movu m2, [r2 + 2 * r3] ; row 2 of pix1 + movu m3, [r0 + r5] ; row 3 of pix0 + movu m4, [r2 + r6] ; row 3 of pix1 + movu xm5, [r0 +32 + 2 * r1] + vinserti128 m5, m5, [r0 + r5 + 32], 1 + movu xm6, [r2 +32 + 2 * r3] + vinserti128 m6, m6, [r2 + r6 + 32], 1 + + psadbw m1, m2 + psadbw m3, m4 + psadbw m5, m6 + paddd m0, m1 + paddd m0, m3 + paddd m0, m5 + + lea r2, [r2 + 4 * r3] + lea r0, [r0 + 4 * r1] + + dec r4d + jnz .loop + + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_64x16, 4,5,6 + xorps m0, m0 + xorps m5, m5 + mov r4d, 4 +.loop + movu m1, [r0] ; first 32 of row 0 of pix0 + movu m2, [r2] ; first 32 of row 0 of pix1 + movu m3, [r0 + 32] ; second 32 of row 0 of pix0 + movu m4, [r2 + 32] ; second 32 of row 0 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 1 of pix0 + movu m2, [r2 + r3] ; first 32 of row 1 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; first 32 of row 2 of pix0 + movu m2, [r2] ; first 32 of row 2 of pix1 + movu m3, [r0 + 32] ; second 32 of row 2 of pix0 + movu m4, [r2 + 32] ; second 32 of row 2 
of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 3 of pix0 + movu m2, [r2 + r3] ; first 32 of row 3 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 3 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + dec r4d + jnz .loop + + paddd m0, m5 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_64x32, 4,5,6 + xorps m0, m0 + xorps m5, m5 + mov r4d, 16 +.loop + movu m1, [r0] ; first 32 of row 0 of pix0 + movu m2, [r2] ; first 32 of row 0 of pix1 + movu m3, [r0 + 32] ; second 32 of row 0 of pix0 + movu m4, [r2 + 32] ; second 32 of row 0 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 1 of pix0 + movu m2, [r2 + r3] ; first 32 of row 1 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + dec r4d + jnz .loop + + paddd m0, m5 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_64x48, 4,5,6 + xorps m0, m0 + xorps m5, m5 + mov r4d, 24 +.loop + movu m1, [r0] ; first 32 of row 0 of pix0 + movu m2, [r2] ; first 32 of row 0 of pix1 + movu m3, [r0 + 32] ; second 32 of row 0 of pix0 + movu m4, [r2 + 32] ; second 32 of row 0 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 1 of pix0 + movu m2, [r2 + r3] ; first 32 of row 1 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 
+ 2 * r3] + lea r0, [r0 + 2 * r1] + + dec r4d + jnz .loop + + paddd m0, m5 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +INIT_YMM avx2 +cglobal pixel_sad_64x64, 4,5,6 + xorps m0, m0 + xorps m5, m5 + mov r4d, 8 +.loop + movu m1, [r0] ; first 32 of row 0 of pix0 + movu m2, [r2] ; first 32 of row 0 of pix1 + movu m3, [r0 + 32] ; second 32 of row 0 of pix0 + movu m4, [r2 + 32] ; second 32 of row 0 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 1 of pix0 + movu m2, [r2 + r3] ; first 32 of row 1 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 1 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; first 32 of row 2 of pix0 + movu m2, [r2] ; first 32 of row 2 of pix1 + movu m3, [r0 + 32] ; second 32 of row 2 of pix0 + movu m4, [r2 + 32] ; second 32 of row 2 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 3 of pix0 + movu m2, [r2 + r3] ; first 32 of row 3 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 3 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 3 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + movu m1, [r0] ; first 32 of row 4 of pix0 + movu m2, [r2] ; first 32 of row 4 of pix1 + movu m3, [r0 + 32] ; second 32 of row 4 of pix0 + movu m4, [r2 + 32] ; second 32 of row 4 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 5 of pix0 + movu m2, [r2 + r3] ; first 32 of row 5 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 5 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 5 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 
2 * r1] + + movu m1, [r0] ; first 32 of row 6 of pix0 + movu m2, [r2] ; first 32 of row 6 of pix1 + movu m3, [r0 + 32] ; second 32 of row 6 of pix0 + movu m4, [r2 + 32] ; second 32 of row 6 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + movu m1, [r0 + r1] ; first 32 of row 7 of pix0 + movu m2, [r2 + r3] ; first 32 of row 7 of pix1 + movu m3, [r0 + 32 + r1] ; second 32 of row 7 of pix0 + movu m4, [r2 + 32 + r3] ; second 32 of row 7 of pix1 + + psadbw m1, m2 + psadbw m3, m4 + paddd m0, m1 + paddd m5, m3 + + lea r2, [r2 + 2 * r3] + lea r0, [r0 + 2 * r1] + + dec r4d + jnz .loop + + paddd m0, m5 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + pshufd xm1, xm0, 2 + paddd xm0,xm1 + movd eax, xm0 + RET + +%endif diff -Nru x265-1.5/source/common/x86/ssd-a.asm x265-1.6/source/common/x86/ssd-a.asm --- x265-1.5/source/common/x86/ssd-a.asm 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/common/x86/ssd-a.asm 2015-04-02 16:46:36.000000000 +0000 @@ -822,10 +822,10 @@ %if HIGH_BIT_DEPTH == 0 %macro SSD_LOAD_FULL 5 - mova m1, [t0+%1] - mova m2, [t2+%2] - mova m3, [t0+%3] - mova m4, [t2+%4] + movu m1, [t0+%1] + movu m2, [t2+%2] + movu m3, [t0+%3] + movu m4, [t2+%4] %if %5==1 add t0, t1 add t2, t3 @@ -1094,6 +1094,8 @@ INIT_YMM avx2 SSD 16, 16 SSD 16, 8 +SSD 32, 32 +SSD 64, 64 %assign function_align 16 %endif ; !HIGH_BIT_DEPTH @@ -2548,6 +2550,35 @@ movd eax, m0 RET +INIT_YMM avx2 +cglobal pixel_ssd_s_16, 2,4,5 + add r1, r1 + lea r3, [r1 * 3] + mov r2d, 16/4 + pxor m0, m0 +.loop: + movu m1, [r0] + movu m2, [r0 + r1] + movu m3, [r0 + 2 * r1] + movu m4, [r0 + r3] + + lea r0, [r0 + r1 * 4] + pmaddwd m1, m1 + pmaddwd m2, m2 + pmaddwd m3, m3 + pmaddwd m4, m4 + paddd m1, m2 + paddd m3, m4 + paddd m1, m3 + paddd m0, m1 + + dec r2d + jnz .loop + + ; calculate sum and return + HADDD m0, m1 + movd eax, xm0 + RET INIT_YMM avx2 cglobal pixel_ssd_s_32, 2,4,5 diff -Nru x265-1.5/source/encoder/analysis.cpp x265-1.6/source/encoder/analysis.cpp --- 
x265-1.5/source/encoder/analysis.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/analysis.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -71,9 +71,10 @@ Analysis::Analysis() { - m_totalNumJobs = m_numAcquiredJobs = m_numCompletedJobs = 0; m_reuseIntraDataCTU = NULL; m_reuseInterDataCTU = NULL; + m_reuseRef = NULL; + m_reuseBestMergeCand = NULL; } bool Analysis::create(ThreadLocalData *tld) @@ -125,6 +126,11 @@ m_slice = ctu.m_slice; m_frame = &frame; +#if _DEBUG || CHECKED_BUILD + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + for (uint32_t j = 0; j < MAX_PRED_TYPES; j++) + m_modeDepth[i].pred[j].invalidate(); +#endif invalidateContexts(0); m_quant.setQPforQuant(ctu); m_rqt[0].cur.load(initialContext); @@ -139,10 +145,13 @@ { int numPredDir = m_slice->isInterP() ? 1 : 2; m_reuseInterDataCTU = (analysis_inter_data *)m_frame->m_analysisData.interData; - reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir]; + m_reuseRef = &m_reuseInterDataCTU->ref[ctu.m_cuAddr * X265_MAX_PRED_MODE_PER_CTU * numPredDir]; + m_reuseBestMergeCand = &m_reuseInterDataCTU->bestMergeCand[ctu.m_cuAddr * CUGeom::MAX_GEOMS]; } } + ProfileCUScope(ctu, totalCTUTime, totalCTUs); + uint32_t zOrder = 0; if (m_slice->m_sliceType == I_SLICE) { @@ -153,6 +162,7 @@ memcpy(&m_reuseIntraDataCTU->depth[ctu.m_cuAddr * numPartition], bestCU->m_cuDepth, sizeof(uint8_t) * numPartition); memcpy(&m_reuseIntraDataCTU->modes[ctu.m_cuAddr * numPartition], bestCU->m_lumaIntraDir, sizeof(uint8_t) * numPartition); memcpy(&m_reuseIntraDataCTU->partSizes[ctu.m_cuAddr * numPartition], bestCU->m_partSize, sizeof(uint8_t) * numPartition); + memcpy(&m_reuseIntraDataCTU->chromaModes[ctu.m_cuAddr * numPartition], bestCU->m_chromaIntraDir, sizeof(uint8_t) * numPartition); } } else @@ -196,14 +206,16 @@ return; else if (md.bestMode->cu.isIntra(0)) { + md.pred[PRED_LOSSLESS].initCosts(); md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); PartSize size = 
(PartSize)md.pred[PRED_LOSSLESS].cu.m_partSize[0]; uint8_t* modes = md.pred[PRED_LOSSLESS].cu.m_lumaIntraDir; - checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes); + checkIntra(md.pred[PRED_LOSSLESS], cuGeom, size, modes, NULL); checkBestMode(md.pred[PRED_LOSSLESS], cuGeom.depth); } else { + md.pred[PRED_LOSSLESS].initCosts(); md.pred[PRED_LOSSLESS].cu.initLosslessCU(md.bestMode->cu, cuGeom); md.pred[PRED_LOSSLESS].predYuv.copyFromYuv(md.bestMode->predYuv); encodeResAndCalcRdInterCU(md.pred[PRED_LOSSLESS], cuGeom); @@ -225,15 +237,16 @@ uint8_t* reuseDepth = &m_reuseIntraDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; uint8_t* reuseModes = &m_reuseIntraDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; char* reusePartSizes = &m_reuseIntraDataCTU->partSizes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; + uint8_t* reuseChromaModes = &m_reuseIntraDataCTU->chromaModes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; - if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx) + if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx) { m_quant.setQPforQuant(parentCTU); PartSize size = (PartSize)reusePartSizes[zOrder]; Mode& mode = size == SIZE_2Nx2N ? 
md.pred[PRED_INTRA] : md.pred[PRED_INTRA_NxN]; mode.cu.initSubCU(parentCTU, cuGeom); - checkIntra(mode, cuGeom, size, &reuseModes[zOrder]); + checkIntra(mode, cuGeom, size, &reuseModes[zOrder], &reuseChromaModes[zOrder]); checkBestMode(mode, depth); if (m_bTryLossless) @@ -252,13 +265,13 @@ m_quant.setQPforQuant(parentCTU); md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); + checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL); checkBestMode(md.pred[PRED_INTRA], depth); - if (depth == g_maxCUDepth) + if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3) { md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); - checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL); + checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL); checkBestMode(md.pred[PRED_INTRA_NxN], depth); } @@ -286,7 +299,7 @@ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx); m_rqt[nextDepth].cur.load(*nextContext); compressIntraCU(parentCTU, childGeom, zOrder); @@ -308,203 +321,173 @@ addSplitFlagCost(*splitPred, cuGeom.depth); else updateModeCost(*splitPred); + + checkDQPForSplitPred(splitPred->cu, cuGeom); checkBestMode(*splitPred, depth); } - checkDQP(md.bestMode->cu, cuGeom); - /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT]) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx); } -bool Analysis::findJob(int threadId) +void Analysis::PMODE::processTasks(int workerThreadId) { - /* try to acquire a CU mode to analyze */ - m_pmodeLock.acquire(); - if (m_totalNumJobs > 
m_numAcquiredJobs) - { - int id = m_numAcquiredJobs++; - m_pmodeLock.release(); - - ProfileScopeEvent(pmode); - parallelModeAnalysis(threadId, id); - - m_pmodeLock.acquire(); - if (++m_numCompletedJobs == m_totalNumJobs) - m_modeCompletionEvent.trigger(); - m_pmodeLock.release(); - return true; - } - else - m_pmodeLock.release(); - - m_meLock.acquire(); - if (m_totalNumME > m_numAcquiredME) - { - int id = m_numAcquiredME++; - m_meLock.release(); - - ProfileScopeEvent(pme); - parallelME(threadId, id); - - m_meLock.acquire(); - if (++m_numCompletedME == m_totalNumME) - m_meCompletionEvent.trigger(); - m_meLock.release(); - return true; - } - else - m_meLock.release(); - - return false; +#if DETAILED_CU_STATS + int fe = master.m_modeDepth[cuGeom.depth].pred[PRED_2Nx2N].cu.m_encData->m_frameEncoderID; + master.m_stats[fe].countPModeTasks++; + ScopedElapsedTime pmodeTime(master.m_stats[fe].pmodeTime); +#endif + ProfileScopeEvent(pmode); + master.processPmode(*this, master.m_tld[workerThreadId].analysis); } -void Analysis::parallelME(int threadId, int meId) +/* process pmode jobs until none remain; may be called by the master thread or by + * a bonded peer (slave) thread via PMODE::processTasks() */ +void Analysis::processPmode(PMODE& pmode, Analysis& slave) { - Analysis* slave; - - if (threadId == -1) - slave = this; - else + /* acquire a mode task, else exit early */ + int task; + pmode.m_lock.acquire(); + if (pmode.m_jobTotal > pmode.m_jobAcquired) { - slave = &m_tld[threadId].analysis; - slave->setQP(*m_slice, m_rdCost.m_qp); - slave->m_slice = m_slice; - slave->m_frame = m_frame; - - slave->m_me.setSourcePU(*m_curInterMode->fencYuv, m_curInterMode->cu.m_cuAddr, m_curGeom->encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); - slave->prepMotionCompensation(m_curInterMode->cu, *m_curGeom, m_curPart); + task = pmode.m_jobAcquired++; + pmode.m_lock.release(); } - - if (meId < m_slice->m_numRefIdx[0]) - slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 0,
meId); - else - slave->singleMotionEstimation(*this, *m_curInterMode, *m_curGeom, m_curPart, 1, meId - m_slice->m_numRefIdx[0]); -} - -void Analysis::parallelModeAnalysis(int threadId, int jobId) -{ - Analysis* slave; - - if (threadId == -1) - slave = this; else { - slave = &m_tld[threadId].analysis; - slave->m_slice = m_slice; - slave->m_frame = m_frame; - slave->setQP(*m_slice, m_rdCost.m_qp); - slave->invalidateContexts(0); + pmode.m_lock.release(); + return; } - ModeDepth& md = m_modeDepth[m_curGeom->depth]; + ModeDepth& md = m_modeDepth[pmode.cuGeom.depth]; + bool bMergeOnly = pmode.cuGeom.log2CUSize == 6; - if (m_param->rdLevel <= 4) + /* setup slave Analysis */ + if (&slave != this) { - switch (jobId) - { - case 0: - if (slave != this) - slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); - slave->checkIntraInInter(md.pred[PRED_INTRA], *m_curGeom); - if (m_param->rdLevel > 2) - slave->encodeIntraInInter(md.pred[PRED_INTRA], *m_curGeom); - break; + slave.m_slice = m_slice; + slave.m_frame = m_frame; + slave.setQP(*m_slice, m_rdCost.m_qp); + slave.invalidateContexts(0); - case 1: - slave->checkInter_rd0_4(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N); - if (m_slice->m_sliceType == B_SLICE) - slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom); - break; - - case 2: - slave->checkInter_rd0_4(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N); - break; - - case 3: - slave->checkInter_rd0_4(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN); - break; - - case 4: - slave->checkInter_rd0_4(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU); - break; - - case 5: - slave->checkInter_rd0_4(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD); - break; - - case 6: - slave->checkInter_rd0_4(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N); - break; - - case 7: - slave->checkInter_rd0_4(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N); - break; - - default: - X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); - break; + if (m_param->rdLevel >= 5) + { + 
slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur); + slave.m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu); } } - else + + + /* perform Mode task, repeat until no more work is available */ + do { - bool bMergeOnly = m_curGeom->log2CUSize == 6; - if (slave != this) + if (m_param->rdLevel <= 4) { - slave->m_rqt[m_curGeom->depth].cur.load(m_rqt[m_curGeom->depth].cur); - slave->m_quant.setQPforQuant(md.pred[PRED_2Nx2N].cu); + switch (pmode.modes[task]) + { + case PRED_INTRA: + if (&slave != this) + slave.m_rqt[pmode.cuGeom.depth].cur.load(m_rqt[pmode.cuGeom.depth].cur); + slave.checkIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom); + if (m_param->rdLevel > 2) + slave.encodeIntraInInter(md.pred[PRED_INTRA], pmode.cuGeom); + break; + + case PRED_2Nx2N: + slave.checkInter_rd0_4(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N); + if (m_slice->m_sliceType == B_SLICE) + slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom); + break; + + case PRED_Nx2N: + slave.checkInter_rd0_4(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N); + break; + + case PRED_2NxN: + slave.checkInter_rd0_4(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN); + break; + + case PRED_2NxnU: + slave.checkInter_rd0_4(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU); + break; + + case PRED_2NxnD: + slave.checkInter_rd0_4(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD); + break; + + case PRED_nLx2N: + slave.checkInter_rd0_4(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N); + break; + + case PRED_nRx2N: + slave.checkInter_rd0_4(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N); + break; + + default: + X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); + break; + } } - - switch (jobId) + else { - case 0: - slave->checkIntra(md.pred[PRED_INTRA], *m_curGeom, SIZE_2Nx2N, NULL); - if (m_curGeom->depth == g_maxCUDepth && m_curGeom->log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) - slave->checkIntra(md.pred[PRED_INTRA_NxN], *m_curGeom, SIZE_NxN, NULL); - break; - - case 1: - 
slave->checkInter_rd5_6(md.pred[PRED_2Nx2N], *m_curGeom, SIZE_2Nx2N, false); - md.pred[PRED_BIDIR].rdCost = MAX_INT64; - if (m_slice->m_sliceType == B_SLICE) + switch (pmode.modes[task]) { - slave->checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], *m_curGeom); - if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64) - slave->encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], *m_curGeom); - } - break; + case PRED_INTRA: + slave.checkIntra(md.pred[PRED_INTRA], pmode.cuGeom, SIZE_2Nx2N, NULL, NULL); + if (pmode.cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3) + slave.checkIntra(md.pred[PRED_INTRA_NxN], pmode.cuGeom, SIZE_NxN, NULL, NULL); + break; + + case PRED_2Nx2N: + slave.checkInter_rd5_6(md.pred[PRED_2Nx2N], pmode.cuGeom, SIZE_2Nx2N, false); + md.pred[PRED_BIDIR].rdCost = MAX_INT64; + if (m_slice->m_sliceType == B_SLICE) + { + slave.checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], pmode.cuGeom); + if (md.pred[PRED_BIDIR].sa8dCost < MAX_INT64) + slave.encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], pmode.cuGeom); + } + break; - case 2: - slave->checkInter_rd5_6(md.pred[PRED_Nx2N], *m_curGeom, SIZE_Nx2N, false); - break; + case PRED_Nx2N: + slave.checkInter_rd5_6(md.pred[PRED_Nx2N], pmode.cuGeom, SIZE_Nx2N, false); + break; - case 3: - slave->checkInter_rd5_6(md.pred[PRED_2NxN], *m_curGeom, SIZE_2NxN, false); - break; + case PRED_2NxN: + slave.checkInter_rd5_6(md.pred[PRED_2NxN], pmode.cuGeom, SIZE_2NxN, false); + break; - case 4: - slave->checkInter_rd5_6(md.pred[PRED_2NxnU], *m_curGeom, SIZE_2NxnU, bMergeOnly); - break; + case PRED_2NxnU: + slave.checkInter_rd5_6(md.pred[PRED_2NxnU], pmode.cuGeom, SIZE_2NxnU, bMergeOnly); + break; - case 5: - slave->checkInter_rd5_6(md.pred[PRED_2NxnD], *m_curGeom, SIZE_2NxnD, bMergeOnly); - break; + case PRED_2NxnD: + slave.checkInter_rd5_6(md.pred[PRED_2NxnD], pmode.cuGeom, SIZE_2NxnD, bMergeOnly); + break; - case 6: - slave->checkInter_rd5_6(md.pred[PRED_nLx2N], *m_curGeom, SIZE_nLx2N, bMergeOnly); - 
break; + case PRED_nLx2N: + slave.checkInter_rd5_6(md.pred[PRED_nLx2N], pmode.cuGeom, SIZE_nLx2N, bMergeOnly); + break; - case 7: - slave->checkInter_rd5_6(md.pred[PRED_nRx2N], *m_curGeom, SIZE_nRx2N, bMergeOnly); - break; + case PRED_nRx2N: + slave.checkInter_rd5_6(md.pred[PRED_nRx2N], pmode.cuGeom, SIZE_nRx2N, bMergeOnly); + break; - default: - X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); - break; + default: + X265_CHECK(0, "invalid job ID for parallel mode analysis\n"); + break; + } } + + task = -1; + pmode.m_lock.acquire(); + if (pmode.m_jobTotal > pmode.m_jobAcquired) + task = pmode.m_jobAcquired++; + pmode.m_lock.release(); } + while (task >= 0); } void Analysis::compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom) @@ -525,48 +508,37 @@ int bTryAmp = m_slice->m_sps->maxAMPDepth > depth && (cuGeom.log2CUSize < 6 || m_param->rdLevel > 4); int bTryIntra = m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames; + PMODE pmode(*this, cuGeom); + /* Initialize all prediction CUs based on parentCTU */ - md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); - if (m_param->bEnableRectInter) + if (bTryIntra) { - md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); + if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3 && m_param->rdLevel >= 5) + md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); + pmode.modes[pmode.m_jobTotal++] = PRED_INTRA; } - if (bTryAmp) + md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2Nx2N; + md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom); + if (m_param->bEnableRectInter) { - md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, 
cuGeom); - md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); - md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2NxN; + md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_Nx2N; } - if (bTryIntra) + if (bTryAmp) { - md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) - md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); + md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnU; + md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_2NxnD; + md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_nLx2N; + md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); pmode.modes[pmode.m_jobTotal++] = PRED_nRx2N; } - m_pmodeLock.acquire(); - m_totalNumJobs = 2 + m_param->bEnableRectInter * 2 + bTryAmp * 4; - m_numAcquiredJobs = !bTryIntra; - m_numCompletedJobs = m_numAcquiredJobs; - m_curGeom = &cuGeom; - m_bJobsQueued = true; - JobProvider::enqueue(); - m_pmodeLock.release(); - - for (int i = 0; i < m_totalNumJobs - m_numCompletedJobs; i++) - m_pool->pokeIdleThread(); + pmode.tryBondPeers(*m_frame->m_encData->m_jobProvider, pmode.m_jobTotal); /* participate in processing jobs, until all are distributed */ - while (findJob(-1)) - ; - - JobProvider::dequeue(); - m_bJobsQueued = false; + processPmode(pmode, *this); /* the master worker thread (this one) does merge analysis. 
By doing * merge after all the other jobs are at least started, we usually avoid @@ -576,7 +548,10 @@ { checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); - m_modeCompletionEvent.wait(); + { + ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters); + pmode.waitForExit(); + } /* select best inter mode based on sa8d cost */ Mode *bestInter = &md.pred[PRED_2Nx2N]; @@ -608,8 +583,8 @@ { for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) { - prepMotionCompensation(bestInter->cu, cuGeom, puIdx); - motionCompensation(bestInter->predYuv, false, true); + PredictionUnit pu(bestInter->cu, cuGeom, puIdx); + motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true); } } encodeResAndCalcRdInterCU(*bestInter, cuGeom); @@ -644,8 +619,8 @@ /* finally code the best mode selected from SA8D costs */ for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++) { - prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx); - motionCompensation(md.bestMode->predYuv, false, true); + PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx); + motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true); } encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); } @@ -653,11 +628,15 @@ } else { - checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); - m_modeCompletionEvent.wait(); + checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false); + { + ProfileCUScope(parentCTU, pmodeBlockTime, countPModeMasters); + pmode.waitForExit(); + } checkBestMode(md.pred[PRED_2Nx2N], depth); - checkBestMode(md.pred[PRED_BIDIR], depth); + if (m_slice->m_sliceType == B_SLICE) + checkBestMode(md.pred[PRED_BIDIR], depth); if (m_param->bEnableRectInter) { @@ -676,7 +655,7 @@ if (bTryIntra) { checkBestMode(md.pred[PRED_INTRA], depth); - if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) + if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3) 
checkBestMode(md.pred[PRED_INTRA_NxN], depth); } } @@ -721,7 +700,7 @@ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx); m_rqt[nextDepth].cur.load(*nextContext); compressInterCU_dist(parentCTU, childGeom); @@ -742,6 +721,7 @@ else updateModeCost(*splitPred); + checkDQPForSplitPred(splitPred->cu, cuGeom); checkBestMode(*splitPred, depth); } @@ -755,12 +735,10 @@ cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth]; } - checkDQP(md.bestMode->cu, cuGeom); - /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT]) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx); } void Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom) @@ -859,8 +837,8 @@ { for (uint32_t puIdx = 0; puIdx < bestInter->cu.getNumPartInter(); puIdx++) { - prepMotionCompensation(bestInter->cu, cuGeom, puIdx); - motionCompensation(bestInter->predYuv, false, true); + PredictionUnit pu(bestInter->cu, cuGeom, puIdx); + motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true); } } encodeResAndCalcRdInterCU(*bestInter, cuGeom); @@ -914,8 +892,8 @@ { for (uint32_t puIdx = 0; puIdx < md.bestMode->cu.getNumPartInter(); puIdx++) { - prepMotionCompensation(md.bestMode->cu, cuGeom, puIdx); - motionCompensation(md.bestMode->predYuv, false, true); + PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx); + motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true); } if (m_param->rdLevel == 2) encodeResAndCalcRdInterCU(*md.bestMode, cuGeom); @@ -956,7 +934,7 @@ residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange); 
getBestIntraModeChroma(*md.bestMode, cuGeom); residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0); - md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.encodeIdx); // TODO: + md.bestMode->reconYuv.copyFromPicYuv(*m_frame->m_reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO: } } } @@ -994,7 +972,7 @@ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx); m_rqt[nextDepth].cur.load(*nextContext); compressInterCU_rd0_4(parentCTU, childGeom); @@ -1027,8 +1005,9 @@ checkBestMode(*splitPred, cuGeom.depth); else if (splitPred->sa8dCost < md.bestMode->sa8dCost) md.bestMode = splitPred; - } + checkDQPForSplitPred(md.bestMode->cu, cuGeom); + } if (mightNotSplit) { /* early-out statistics */ @@ -1039,12 +1018,11 @@ cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth]; } - checkDQP(md.bestMode->cu, cuGeom); - /* Copy best data to encData CTU and recon */ + X265_CHECK(md.bestMode->ok(), "best mode is not ok"); md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT] && m_param->rdLevel) - md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, cuAddr, cuGeom.absPartIdx); } void Analysis::compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder) @@ -1060,26 +1038,11 @@ { uint8_t* reuseDepth = &m_reuseInterDataCTU->depth[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; uint8_t* reuseModes = &m_reuseInterDataCTU->modes[parentCTU.m_cuAddr * parentCTU.m_numPartitions]; - if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.encodeIdx && reuseModes[zOrder] == MODE_SKIP) + if (mightNotSplit && depth == reuseDepth[zOrder] && zOrder == cuGeom.absPartIdx && reuseModes[zOrder] == MODE_SKIP) { 
md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); - checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); - - if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && - (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))) - { - md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); - checkBestMode(md.pred[PRED_INTRA], depth); - - if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) - { - md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); - checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, &reuseModes[zOrder]); - checkBestMode(md.pred[PRED_INTRA_NxN], depth); - } - } + checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, true); if (m_bTryLossless) tryLossless(cuGeom); @@ -1099,7 +1062,7 @@ { md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom); - checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); + checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom, false); bool earlySkip = m_param->bEnableEarlySkip && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); if (!earlySkip) @@ -1121,18 +1084,13 @@ if (m_param->bEnableRectInter) { - if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) - { - md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); - checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false); - checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth); - } - if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) - { - md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); - checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false); - checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); - } + md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom); + checkInter_rd5_6(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, false); + 
checkBestMode(md.pred[PRED_Nx2N], cuGeom.depth); + + md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom); + checkInter_rd5_6(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, false); + checkBestMode(md.pred[PRED_2NxN], cuGeom.depth); } // Try AMP (SIZE_2NxnU, SIZE_2NxnD, SIZE_nLx2N, SIZE_nRx2N) @@ -1145,7 +1103,7 @@ bHor = true; else if (md.bestMode->cu.m_partSize[0] == SIZE_Nx2N) bVer = true; - else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0] && !md.bestMode->cu.isSkipped(0)) + else if (md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N && !md.bestMode->cu.m_mergeFlag[0]) { bHor = true; bVer = true; @@ -1153,47 +1111,36 @@ if (bHor) { - if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) - { - md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); - checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly); - checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth); - } - if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) - { - md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); - checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly); - checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); - } + md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom); + checkInter_rd5_6(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, bMergeOnly); + checkBestMode(md.pred[PRED_2NxnU], cuGeom.depth); + + md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom); + checkInter_rd5_6(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, bMergeOnly); + checkBestMode(md.pred[PRED_2NxnD], cuGeom.depth); } if (bVer) { - if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) - { - md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); - checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly); - checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth); - } - if (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0)) - { - md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); - checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, 
SIZE_nRx2N, bMergeOnly); - checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); - } + md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom); + checkInter_rd5_6(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, bMergeOnly); + checkBestMode(md.pred[PRED_nLx2N], cuGeom.depth); + + md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom); + checkInter_rd5_6(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, bMergeOnly); + checkBestMode(md.pred[PRED_nRx2N], cuGeom.depth); } } - if ((m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && - (!m_param->bEnableCbfFastMode || md.bestMode->cu.getQtRootCbf(0))) + if (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) { md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom); - checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL); + checkIntra(md.pred[PRED_INTRA], cuGeom, SIZE_2Nx2N, NULL, NULL); checkBestMode(md.pred[PRED_INTRA], depth); - if (depth == g_maxCUDepth && cuGeom.log2CUSize > m_slice->m_sps->quadtreeTULog2MinSize) + if (cuGeom.log2CUSize == 3 && m_slice->m_sps->quadtreeTULog2MinSize < 3) { md.pred[PRED_INTRA_NxN].cu.initSubCU(parentCTU, cuGeom); - checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL); + checkIntra(md.pred[PRED_INTRA_NxN], cuGeom, SIZE_NxN, NULL, NULL); checkBestMode(md.pred[PRED_INTRA_NxN], depth); } } @@ -1224,7 +1171,7 @@ const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx); if (childGeom.flags & CUGeom::PRESENT) { - m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.encodeIdx); + m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx); m_rqt[nextDepth].cur.load(*nextContext); compressInterCU_rd5_6(parentCTU, childGeom, zOrder); @@ -1246,15 +1193,14 @@ else updateModeCost(*splitPred); + checkDQPForSplitPred(splitPred->cu, cuGeom); checkBestMode(*splitPred, depth); } - checkDQP(md.bestMode->cu, cuGeom); - /* Copy best data to encData CTU and recon */ md.bestMode->cu.copyToPic(depth); if (md.bestMode != &md.pred[PRED_SPLIT]) - 
md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.encodeIdx); + md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx); } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ @@ -1271,40 +1217,43 @@ X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n"); + tempPred->initCosts(); tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N); tempPred->cu.setPredModeSubParts(MODE_INTER); tempPred->cu.m_mergeFlag[0] = true; + bestPred->initCosts(); bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N); bestPred->cu.setPredModeSubParts(MODE_INTER); bestPred->cu.m_mergeFlag[0] = true; - MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists - uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; - uint32_t maxNumMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours); + MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists + uint8_t candDir[MRG_MAX_NUM_CANDS]; + uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir); + PredictionUnit pu(merge.cu, cuGeom, 0); bestPred->sa8dCost = MAX_INT64; int bestSadCand = -1; int sizeIdx = cuGeom.log2CUSize - 2; - for (uint32_t i = 0; i < maxNumMergeCand; ++i) + for (uint32_t i = 0; i < numMergeCand; ++i) { if (m_bFrameParallel && - (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 || - mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4)) + (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 || + candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)) continue; tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idx - tempPred->cu.m_interDir[0] = interDirNeighbours[i]; - tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; - tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx; - tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; - 
tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx; + X265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n"); + tempPred->cu.m_interDir[0] = candDir[i]; + tempPred->cu.m_mv[0][0] = candMvField[i][0].mv; + tempPred->cu.m_mv[1][0] = candMvField[i][1].mv; + tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; + tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; - prepMotionCompensation(tempPred->cu, cuGeom, 0); - motionCompensation(tempPred->predYuv, true, m_bChromaSa8d); + motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d); - tempPred->sa8dBits = getTUBits(i, maxNumMergeCand); + tempPred->sa8dBits = getTUBits(i, numMergeCand); tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size); if (m_bChromaSa8d) { @@ -1326,10 +1275,7 @@ /* calculate the motion compensation for chroma for the best mode selected */ if (!m_bChromaSa8d) /* Chroma MC was done above */ - { - prepMotionCompensation(bestPred->cu, cuGeom, 0); - motionCompensation(bestPred->predYuv, false, true); - } + motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true); if (m_param->rdLevel) { @@ -1340,12 +1286,13 @@ /* Encode with residual */ tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand; - tempPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); - tempPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); - tempPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); - tempPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); - tempPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); + tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0); + tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0); + tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0); + tempPred->cu.setPURefIdx(0, 
(int8_t)candMvField[bestSadCand][0].refIdx, 0, 0); + tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0); tempPred->sa8dCost = bestPred->sa8dCost; + tempPred->sa8dBits = bestPred->sa8dBits; tempPred->predYuv.copyFromYuv(bestPred->predYuv); encodeResAndCalcRdInterCU(*tempPred, cuGeom); @@ -1356,15 +1303,17 @@ md.bestMode = bestPred; /* broadcast sets of MV field data */ - bestPred->cu.setPUInterDir(interDirNeighbours[bestSadCand], 0, 0); - bestPred->cu.setPUMv(0, mvFieldNeighbours[bestSadCand][0].mv, 0, 0); - bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestSadCand][0].refIdx, 0, 0); - bestPred->cu.setPUMv(1, mvFieldNeighbours[bestSadCand][1].mv, 0, 0); - bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestSadCand][1].refIdx, 0, 0); + md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0); + md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0); + md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0); + md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0); + md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0); + checkDQP(md.bestMode->cu, cuGeom); + X265_CHECK(md.bestMode->ok(), "Merge mode not ok\n"); } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ -void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom) +void Analysis::checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isSkipMode) { uint32_t depth = cuGeom.depth; @@ -1373,91 +1322,110 @@ Mode* tempPred = &merge; Mode* bestPred = &skip; + merge.initCosts(); merge.cu.setPredModeSubParts(MODE_INTER); merge.cu.setPartSizeSubParts(SIZE_2Nx2N); merge.cu.m_mergeFlag[0] = true; + skip.initCosts(); skip.cu.setPredModeSubParts(MODE_INTER); skip.cu.setPartSizeSubParts(SIZE_2Nx2N); skip.cu.m_mergeFlag[0] = true; - MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists - uint8_t 
interDirNeighbours[MRG_MAX_NUM_CANDS]; - uint32_t maxNumMergeCand = merge.cu.getInterMergeCandidates(0, 0, mvFieldNeighbours, interDirNeighbours); + MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both lists + uint8_t candDir[MRG_MAX_NUM_CANDS]; + uint32_t numMergeCand = merge.cu.getInterMergeCandidates(0, 0, candMvField, candDir); + PredictionUnit pu(merge.cu, cuGeom, 0); bool foundCbf0Merge = false; bool triedPZero = false, triedBZero = false; bestPred->rdCost = MAX_INT64; - for (uint32_t i = 0; i < maxNumMergeCand; i++) + + if (isSkipMode) { - if (m_bFrameParallel && - (mvFieldNeighbours[i][0].mv.y >= (m_param->searchRange + 1) * 4 || - mvFieldNeighbours[i][1].mv.y >= (m_param->searchRange + 1) * 4)) - continue; + uint32_t i = *m_reuseBestMergeCand; + bestPred->cu.m_mvpIdx[0][0] = (uint8_t)i; + bestPred->cu.m_interDir[0] = candDir[i]; + bestPred->cu.m_mv[0][0] = candMvField[i][0].mv; + bestPred->cu.m_mv[1][0] = candMvField[i][1].mv; + bestPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; + bestPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; - /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */ - if (interDirNeighbours[i] == 1 && !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx) - { - if (triedPZero) - continue; - triedPZero = true; - } - else if (interDirNeighbours[i] == 3 && - !mvFieldNeighbours[i][0].mv.word && !mvFieldNeighbours[i][0].refIdx && - !mvFieldNeighbours[i][1].mv.word && !mvFieldNeighbours[i][1].refIdx) + motionCompensation(bestPred->cu, pu, bestPred->predYuv, true, true); + encodeResAndCalcRdSkipCU(*bestPred); + } + else + { + for (uint32_t i = 0; i < numMergeCand; i++) { - if (triedBZero) + if (m_bFrameParallel && + (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 || + candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)) continue; - triedBZero = true; - } - - tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP 
idx */ - tempPred->cu.m_interDir[0] = interDirNeighbours[i]; - tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; - tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx; - tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; - tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx; - tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */ - prepMotionCompensation(tempPred->cu, cuGeom, 0); - motionCompensation(tempPred->predYuv, true, true); + /* the merge candidate list is packed with MV(0,0) ref 0 when it is not full */ + if (candDir[i] == 1 && !candMvField[i][0].mv.word && !candMvField[i][0].refIdx) + { + if (triedPZero) + continue; + triedPZero = true; + } + else if (candDir[i] == 3 && + !candMvField[i][0].mv.word && !candMvField[i][0].refIdx && + !candMvField[i][1].mv.word && !candMvField[i][1].refIdx) + { + if (triedBZero) + continue; + triedBZero = true; + } - uint8_t hasCbf = true; - bool swapped = false; - if (!foundCbf0Merge) - { - /* if the best prediction has CBF (not a skip) then try merge with residual */ + tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; /* merge candidate ID is stored in L0 MVP idx */ + tempPred->cu.m_interDir[0] = candDir[i]; + tempPred->cu.m_mv[0][0] = candMvField[i][0].mv; + tempPred->cu.m_mv[1][0] = candMvField[i][1].mv; + tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; + tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; + tempPred->cu.setPredModeSubParts(MODE_INTER); /* must be cleared between encode iterations */ - encodeResAndCalcRdInterCU(*tempPred, cuGeom); - hasCbf = tempPred->cu.getQtRootCbf(0); - foundCbf0Merge = !hasCbf; + motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, true); - if (tempPred->rdCost < bestPred->rdCost) + uint8_t hasCbf = true; + bool swapped = false; + if (!foundCbf0Merge) { - std::swap(tempPred, bestPred); - swapped = true; - } - } - if (!m_param->bLossless && hasCbf) - { - /* try merge without 
residual (skip), if not lossless coding */ + /* if the best prediction has CBF (not a skip) then try merge with residual */ - if (swapped) - { - tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; - tempPred->cu.m_interDir[0] = interDirNeighbours[i]; - tempPred->cu.m_mv[0][0] = mvFieldNeighbours[i][0].mv; - tempPred->cu.m_refIdx[0][0] = (int8_t)mvFieldNeighbours[i][0].refIdx; - tempPred->cu.m_mv[1][0] = mvFieldNeighbours[i][1].mv; - tempPred->cu.m_refIdx[1][0] = (int8_t)mvFieldNeighbours[i][1].refIdx; - tempPred->cu.setPredModeSubParts(MODE_INTER); - tempPred->predYuv.copyFromYuv(bestPred->predYuv); + encodeResAndCalcRdInterCU(*tempPred, cuGeom); + hasCbf = tempPred->cu.getQtRootCbf(0); + foundCbf0Merge = !hasCbf; + + if (tempPred->rdCost < bestPred->rdCost) + { + std::swap(tempPred, bestPred); + swapped = true; + } } - - encodeResAndCalcRdSkipCU(*tempPred); + if (!m_param->bLossless && hasCbf) + { + /* try merge without residual (skip), if not lossless coding */ - if (tempPred->rdCost < bestPred->rdCost) - std::swap(tempPred, bestPred); + if (swapped) + { + tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; + tempPred->cu.m_interDir[0] = candDir[i]; + tempPred->cu.m_mv[0][0] = candMvField[i][0].mv; + tempPred->cu.m_mv[1][0] = candMvField[i][1].mv; + tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx; + tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx; + tempPred->cu.setPredModeSubParts(MODE_INTER); + tempPred->predYuv.copyFromYuv(bestPred->predYuv); + } + + encodeResAndCalcRdSkipCU(*tempPred); + + if (tempPred->rdCost < bestPred->rdCost) + std::swap(tempPred, bestPred); + } } } @@ -1467,11 +1435,20 @@ /* broadcast sets of MV field data */ uint32_t bestCand = bestPred->cu.m_mvpIdx[0][0]; - bestPred->cu.setPUInterDir(interDirNeighbours[bestCand], 0, 0); - bestPred->cu.setPUMv(0, mvFieldNeighbours[bestCand][0].mv, 0, 0); - bestPred->cu.setPURefIdx(0, (int8_t)mvFieldNeighbours[bestCand][0].refIdx, 0, 0); - bestPred->cu.setPUMv(1, 
mvFieldNeighbours[bestCand][1].mv, 0, 0); - bestPred->cu.setPURefIdx(1, (int8_t)mvFieldNeighbours[bestCand][1].refIdx, 0, 0); + bestPred->cu.setPUInterDir(candDir[bestCand], 0, 0); + bestPred->cu.setPUMv(0, candMvField[bestCand][0].mv, 0, 0); + bestPred->cu.setPUMv(1, candMvField[bestCand][1].mv, 0, 0); + bestPred->cu.setPURefIdx(0, (int8_t)candMvField[bestCand][0].refIdx, 0, 0); + bestPred->cu.setPURefIdx(1, (int8_t)candMvField[bestCand][1].refIdx, 0, 0); + checkDQP(bestPred->cu, cuGeom); + X265_CHECK(bestPred->ok(), "merge mode is not ok"); + } + + if (m_param->analysisMode) + { + m_reuseBestMergeCand++; + if (m_param->analysisMode == X265_ANALYSIS_SAVE) + *m_reuseBestMergeCand = bestPred->cu.m_mvpIdx[0][0]; } } @@ -1489,43 +1466,38 @@ MotionData* bestME = interMode.bestME[part]; for (int32_t i = 0; i < numPredDir; i++) { - bestME[i].ref = *reuseRef; - reuseRef++; + bestME[i].ref = *m_reuseRef; + m_reuseRef++; } } } - if (predInterSearch(interMode, cuGeom, false, m_bChromaSa8d)) + + predInterSearch(interMode, cuGeom, false, m_bChromaSa8d); + + /* predInterSearch sets interMode.sa8dBits */ + const Yuv& fencYuv = *interMode.fencYuv; + Yuv& predYuv = interMode.predYuv; + int part = partitionFromLog2Size(cuGeom.log2CUSize); + interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size); + if (m_bChromaSa8d) { - /* predInterSearch sets interMode.sa8dBits */ - const Yuv& fencYuv = *interMode.fencYuv; - Yuv& predYuv = interMode.predYuv; - int part = partitionFromLog2Size(cuGeom.log2CUSize); - interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size); - if (m_bChromaSa8d) - { - interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize); - interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize); - } - 
interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits); + interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize); + interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize); + } + interMode.sa8dCost = m_rdCost.calcRdSADCost(interMode.distortion, interMode.sa8dBits); - if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU) + if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU) + { + for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++) { - for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++) + MotionData* bestME = interMode.bestME[puIdx]; + for (int32_t i = 0; i < numPredDir; i++) { - MotionData* bestME = interMode.bestME[puIdx]; - for (int32_t i = 0; i < numPredDir; i++) - { - *reuseRef = bestME[i].ref; - reuseRef++; - } + *m_reuseRef = bestME[i].ref; + m_reuseRef++; } } } - else - { - interMode.distortion = MAX_UINT; - interMode.sa8dCost = MAX_INT64; - } } void Analysis::checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, bool bMergeOnly) @@ -1542,34 +1514,29 @@ MotionData* bestME = interMode.bestME[puIdx]; for (int32_t i = 0; i < numPredDir; i++) { - bestME[i].ref = *reuseRef; - reuseRef++; + bestME[i].ref = *m_reuseRef; + m_reuseRef++; } } } - if (predInterSearch(interMode, cuGeom, bMergeOnly, true)) - { - /* predInterSearch sets interMode.sa8dBits, but this is ignored */ - encodeResAndCalcRdInterCU(interMode, cuGeom); - if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU) + predInterSearch(interMode, cuGeom, bMergeOnly, true); + + /* predInterSearch sets interMode.sa8dBits, but this is ignored */ + encodeResAndCalcRdInterCU(interMode, cuGeom); + + if (m_param->analysisMode == X265_ANALYSIS_SAVE && m_reuseInterDataCTU) + { + for (uint32_t puIdx = 0; puIdx < 
interMode.cu.getNumPartInter(); puIdx++) { - for (uint32_t puIdx = 0; puIdx < interMode.cu.getNumPartInter(); puIdx++) + MotionData* bestME = interMode.bestME[puIdx]; + for (int32_t i = 0; i < numPredDir; i++) { - MotionData* bestME = interMode.bestME[puIdx]; - for (int32_t i = 0; i < numPredDir; i++) - { - *reuseRef = bestME[i].ref; - reuseRef++; - } + *m_reuseRef = bestME[i].ref; + m_reuseRef++; } } } - else - { - interMode.distortion = MAX_UINT; - interMode.rdCost = MAX_INT64; - } } void Analysis::checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom) @@ -1614,8 +1581,8 @@ cu.setPUMv(1, bestME[1].mv, 0, 0); cu.m_mvd[1][0] = bestME[1].mv - mvp1; - prepMotionCompensation(cu, cuGeom, 0); - motionCompensation(bidir2Nx2N.predYuv, true, m_bChromaSa8d); + PredictionUnit pu(cu, cuGeom, 0); + motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, m_bChromaSa8d); int sa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, bidir2Nx2N.predYuv.m_buf[0], bidir2Nx2N.predYuv.m_size); if (m_bChromaSa8d) @@ -1654,8 +1621,7 @@ cu.m_mv[0][0] = mvzero; cu.m_mv[1][0] = mvzero; - prepMotionCompensation(cu, cuGeom, 0); - motionCompensation(tmpPredYuv, true, true); + motionCompensation(cu, pu, tmpPredYuv, true, true); zsa8d = primitives.cu[partEnum].sa8d(fencYuv.m_buf[0], fencYuv.m_size, tmpPredYuv.m_buf[0], tmpPredYuv.m_size); zsa8d += primitives.chroma[m_csp].cu[partEnum].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, tmpPredYuv.m_buf[1], tmpPredYuv.m_csize); @@ -1663,8 +1629,8 @@ } else { - pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx); - pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx); + pixel *fref0 = m_slice->m_mref[0][ref0].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx); + pixel *fref1 = m_slice->m_mref[1][ref1].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx); intptr_t refStride = m_slice->m_mref[0][0].lumaStride; primitives.pu[partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, fref0, 
refStride, fref1, refStride, 32); @@ -1699,10 +1665,7 @@ /* real MC was already performed */ bidir2Nx2N.predYuv.copyFromYuv(tmpPredYuv); else - { - prepMotionCompensation(cu, cuGeom, 0); - motionCompensation(bidir2Nx2N.predYuv, true, true); - } + motionCompensation(cu, pu, bidir2Nx2N.predYuv, true, true); } else if (m_bChromaSa8d) { @@ -1715,7 +1678,7 @@ void Analysis::encodeResidue(const CUData& ctu, const CUGeom& cuGeom) { - if (cuGeom.depth < ctu.m_cuDepth[cuGeom.encodeIdx] && cuGeom.depth < g_maxCUDepth) + if (cuGeom.depth < ctu.m_cuDepth[cuGeom.absPartIdx] && cuGeom.depth < g_maxCUDepth) { for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++) { @@ -1726,7 +1689,7 @@ return; } - uint32_t absPartIdx = cuGeom.encodeIdx; + uint32_t absPartIdx = cuGeom.absPartIdx; int sizeIdx = cuGeom.log2CUSize - 2; /* reuse the bestMode data structures at the current depth */ @@ -1743,6 +1706,8 @@ if (cu.isIntra(0)) { + ProfileCUScope(ctu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); // not really RDO, but close enough + uint32_t tuDepthRange[2]; cu.getIntraTUQtDepthRange(tuDepthRange, 0); @@ -1752,6 +1717,8 @@ } else // if (cu.isInter(0)) { + ProfileCUScope(ctu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); // not really RDO, but close enough + X265_CHECK(!ctu.isSkipped(absPartIdx), "skip not expected prior to transform\n"); /* Calculate residual for current CU part into depth sized resiYuv */ @@ -1810,7 +1777,6 @@ predV, predYuv.m_csize); } - checkDQP(cu, cuGeom); cu.updatePic(cuGeom.depth); } @@ -1839,34 +1805,6 @@ } } -void Analysis::checkDQP(CUData& cu, const CUGeom& cuGeom) -{ - if (m_slice->m_pps->bUseDQP && cuGeom.depth <= m_slice->m_pps->maxCuDQPDepth) - { - if (cu.m_cuDepth[0] > cuGeom.depth) // detect splits - { - bool hasResidual = false; - for (uint32_t absPartIdx = 0; absPartIdx < cu.m_numPartitions; absPartIdx++) - { - if (cu.getQtRootCbf(absPartIdx)) - { - hasResidual = true; - break; - } - } - if (hasResidual) - 
cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); - else - cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); - } - else - { - if (!cu.getCbf(0, TEXT_LUMA, 0) && !cu.getCbf(0, TEXT_CHROMA_U, 0) && !cu.getCbf(0, TEXT_CHROMA_V, 0)) - cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); - } - } -} - uint32_t Analysis::topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom) { /* Do not attempt to code a block larger than the largest block in the @@ -1881,11 +1819,11 @@ numRefs++; const CUData& cu = *m_slice->m_refPicList[0][0]->m_encData->getPicCTU(parentCTU.m_cuAddr); previousQP = cu.m_qp[0]; - if (!cu.m_cuDepth[cuGeom.encodeIdx]) + if (!cu.m_cuDepth[cuGeom.absPartIdx]) return 0; - for (uint32_t i = 0; i < cuGeom.numPartitions && minDepth0; i += 4) + for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4) { - uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i]; + uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i]; minDepth0 = X265_MIN(d, minDepth0); sum += d; } @@ -1894,11 +1832,11 @@ { numRefs++; const CUData& cu = *m_slice->m_refPicList[1][0]->m_encData->getPicCTU(parentCTU.m_cuAddr); - if (!cu.m_cuDepth[cuGeom.encodeIdx]) + if (!cu.m_cuDepth[cuGeom.absPartIdx]) return 0; for (uint32_t i = 0; i < cuGeom.numPartitions; i += 4) { - uint32_t d = cu.m_cuDepth[cuGeom.encodeIdx + i]; + uint32_t d = cu.m_cuDepth[cuGeom.absPartIdx + i]; minDepth1 = X265_MIN(d, minDepth1); sum += d; } @@ -1974,3 +1912,38 @@ return false; } + +int Analysis::calculateQpforCuSize(CUData& ctu, const CUGeom& cuGeom) +{ + uint32_t ctuAddr = ctu.m_cuAddr; + FrameData& curEncData = *m_frame->m_encData; + double qp = curEncData.m_cuStat[ctuAddr].baseQp; + + uint32_t width = m_frame->m_fencPic->m_picWidth; + uint32_t height = m_frame->m_fencPic->m_picHeight; + uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]; + uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]; + uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (16 - 1)) / 16; + uint32_t blockSize = g_maxCUSize >> 
cuGeom.depth; + double qp_offset = 0; + uint32_t cnt = 0; + uint32_t idx; + + /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */ + bool isReferenced = IS_REFERENCED(m_frame); + double *qpoffs = (isReferenced && m_param->rc.cuTree) ? m_frame->m_lowres.qpCuTreeOffset : m_frame->m_lowres.qpAqOffset; + + for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += 16) + { + for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += 16) + { + idx = ((block_yy / 16) * (maxCols)) + (block_xx / 16); + qp_offset += qpoffs[idx]; + cnt++; + } + } + + qp_offset /= cnt; + qp += qp_offset; + return x265_clip3(QP_MIN, QP_MAX_MAX, (int)(qp + 0.5)); +} diff -Nru x265-1.5/source/encoder/analysis.h x265-1.6/source/encoder/analysis.h --- x265-1.5/source/encoder/analysis.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/analysis.h 2015-04-02 16:46:36.000000000 +0000 @@ -70,30 +70,43 @@ CUDataMemPool cuMemPool; }; + class PMODE : public BondedTaskGroup + { + public: + + Analysis& master; + const CUGeom& cuGeom; + int modes[MAX_PRED_TYPES]; + + PMODE(Analysis& m, const CUGeom& g) : master(m), cuGeom(g) {} + + void processTasks(int workerThreadId); + + protected: + + PMODE operator=(const PMODE&); + }; + + void processPmode(PMODE& pmode, Analysis& slave); + ModeDepth m_modeDepth[NUM_CU_DEPTH]; bool m_bTryLossless; bool m_bChromaSa8d; - /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */ - analysis_intra_data* m_reuseIntraDataCTU; - analysis_inter_data* m_reuseInterDataCTU; - int32_t* reuseRef; Analysis(); + bool create(ThreadLocalData* tld); void destroy(); + Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext); protected: - /* mode analysis distribution */ - int m_totalNumJobs; - volatile int m_numAcquiredJobs; - volatile int m_numCompletedJobs; - 
Lock m_pmodeLock; - Event m_modeCompletionEvent; - bool findJob(int threadId); - void parallelModeAnalysis(int threadId, int jobId); - void parallelME(int threadId, int meId); + /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */ + analysis_intra_data* m_reuseIntraDataCTU; + analysis_inter_data* m_reuseInterDataCTU; + int32_t* m_reuseRef; + uint32_t* m_reuseBestMergeCand; /* full analysis for an I-slice CU */ void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder); @@ -105,7 +118,7 @@ /* measure merge and skip */ void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom); - void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom); + void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isSkipMode); /* measure inter options */ void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize); @@ -119,9 +132,6 @@ /* add the RD cost of coding a split flag (0 or 1) to the given mode */ void addSplitFlagCost(Mode& mode, uint32_t depth); - /* update CBF flags and QP values to be internally consistent */ - void checkDQP(CUData& cu, const CUGeom& cuGeom); - /* work-avoidance heuristics for RD levels < 5 */ uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom); bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode); @@ -129,9 +139,13 @@ /* generate residual and recon pixels for an entire CTU recursively (RD0) */ void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom); + int calculateQpforCuSize(CUData& ctu, const CUGeom& cuGeom); + /* check whether current mode is the new best */ inline void checkBestMode(Mode& mode, uint32_t depth) { + X265_CHECK(mode.ok(), "mode costs are uninitialized\n"); + ModeDepth& md = m_modeDepth[depth]; if (md.bestMode) { diff -Nru x265-1.5/source/encoder/api.cpp x265-1.6/source/encoder/api.cpp --- 
x265-1.5/source/encoder/api.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/api.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -173,6 +173,7 @@ { Encoder *encoder = static_cast(enc); + encoder->stop(); encoder->printSummary(); encoder->destroy(); delete encoder; @@ -183,6 +184,8 @@ void x265_cleanup(void) { BitCost::destroy(); + CUData::s_partSet[0] = NULL; /* allow CUData to adjust to new CTU size */ + g_ctuSizeConfigured = 0; } extern "C" @@ -206,7 +209,7 @@ uint32_t numCUsInFrame = widthInCU * heightInCU; pic->analysisData.numCUsInFrame = numCUsInFrame; - pic->analysisData.numPartitions = NUM_CU_PARTITIONS; + pic->analysisData.numPartitions = NUM_4x4_PARTITIONS; } } @@ -215,3 +218,36 @@ { return x265_free(p); } + +static const x265_api libapi = +{ + &x265_param_alloc, + &x265_param_free, + &x265_param_default, + &x265_param_parse, + &x265_param_apply_profile, + &x265_param_default_preset, + &x265_picture_alloc, + &x265_picture_free, + &x265_picture_init, + &x265_encoder_open, + &x265_encoder_parameters, + &x265_encoder_headers, + &x265_encoder_encode, + &x265_encoder_get_stats, + &x265_encoder_log, + &x265_encoder_close, + &x265_cleanup, + x265_version_str, + x265_build_info_str, + x265_max_bit_depth, +}; + +extern "C" +const x265_api* x265_api_get(int bitDepth) +{ + if (bitDepth && bitDepth != X265_DEPTH) + return NULL; + + return &libapi; +} diff -Nru x265-1.5/source/encoder/dpb.cpp x265-1.6/source/encoder/dpb.cpp --- x265-1.5/source/encoder/dpb.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/dpb.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -104,11 +104,14 @@ if (type == X265_TYPE_B) { - // change from _R "referenced" to _N "non-referenced" NAL unit type + newFrame->m_encData->m_bHasReferences = false; + + // Adjust NAL type for unreferenced B frames (change from _R "referenced" + // to _N "non-referenced" NAL unit type) switch (slice->m_nalUnitType) { case NAL_UNIT_CODED_SLICE_TRAIL_R: - slice->m_nalUnitType = 
NAL_UNIT_CODED_SLICE_TRAIL_N; + slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N; break; case NAL_UNIT_CODED_SLICE_RADL_R: slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N; @@ -120,10 +123,12 @@ break; } } - - /* m_bHasReferences starts out as true for non-B pictures, and is set to false - * once no more pictures reference it */ - newFrame->m_encData->m_bHasReferences = IS_REFERENCED(newFrame); + else + { + /* m_bHasReferences starts out as true for non-B pictures, and is set to false + * once no more pictures reference it */ + newFrame->m_encData->m_bHasReferences = true; + } m_picList.pushFront(*newFrame); diff -Nru x265-1.5/source/encoder/dpb.h x265-1.6/source/encoder/dpb.h --- x265-1.5/source/encoder/dpb.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/dpb.h 2015-04-02 16:46:36.000000000 +0000 @@ -39,10 +39,11 @@ int m_lastIDR; int m_pocCRA; - bool m_bRefreshPending; int m_maxRefL0; int m_maxRefL1; int m_bOpenGOP; + bool m_bRefreshPending; + bool m_bTemporalSublayer; PicList m_picList; PicList m_freeList; FrameData* m_picSymFreeList; @@ -56,6 +57,7 @@ m_maxRefL0 = param->maxNumReferences; m_maxRefL1 = param->bBPyramid ? 
2 : 1; m_bOpenGOP = param->bOpenGOP; + m_bTemporalSublayer = !!param->bEnableTemporalSubLayers; } ~DPB(); diff -Nru x265-1.5/source/encoder/encoder.cpp x265-1.6/source/encoder/encoder.cpp --- x265-1.5/source/encoder/encoder.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/encoder.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -43,7 +43,7 @@ const char g_sliceTypeToChar[] = {'B', 'P', 'I'}; } -static const char *summaryCSVHeader = +static const char* summaryCSVHeader = "Command, Date/Time, Elapsed Time, FPS, Bitrate, " "Y PSNR, U PSNR, V PSNR, Global PSNR, SSIM, SSIM (dB), " "I count, I ave-QP, I kpbs, I-PSNR Y, I-PSNR U, I-PSNR V, I-SSIM (dB), " @@ -51,7 +51,7 @@ "B count, B ave-QP, B kpbs, B-PSNR Y, B-PSNR U, B-PSNR V, B-SSIM (dB), " "Version\n"; -const char* defaultAnalysisFileName = "x265_analysis.dat"; +static const char* defaultAnalysisFileName = "x265_analysis.dat"; using namespace x265; @@ -66,7 +66,6 @@ m_numLumaWPBiFrames = 0; m_numChromaWPBiFrames = 0; m_lookahead = NULL; - m_frameEncoder = NULL; m_rateControl = NULL; m_dpb = NULL; m_exportedPic = NULL; @@ -78,9 +77,12 @@ m_cuOffsetC = NULL; m_buOffsetY = NULL; m_buOffsetC = NULL; - m_threadPool = 0; - m_numThreadLocalData = 0; + m_threadPool = NULL; m_analysisFile = NULL; + for (int i = 0; i < X265_MAX_FRAME_THREADS; i++) + m_frameEncoder[i] = NULL; + + MotionEstimate::initScales(); } void Encoder::create() @@ -101,21 +103,35 @@ if (rows == 1 || cols < 3) p->bEnableWavefront = 0; - int poolThreadCount = p->poolNumThreads ? 
p->poolNumThreads : getCpuCount(); + bool allowPools = !p->numaPools || strcmp(p->numaPools, "none"); // Trim the thread pool if --wpp, --pme, and --pmode are disabled if (!p->bEnableWavefront && !p->bDistributeModeAnalysis && !p->bDistributeMotionEstimation) - poolThreadCount = 0; + allowPools = false; - if (poolThreadCount > 1) + if (!p->frameNumThreads) { - m_threadPool = ThreadPool::allocThreadPool(poolThreadCount); - poolThreadCount = m_threadPool->getThreadCount(); + // auto-detect frame threads + int cpuCount = ThreadPool::getCpuCount(); + if (!p->bEnableWavefront) + p->frameNumThreads = X265_MIN3(cpuCount, (rows + 1) / 2, X265_MAX_FRAME_THREADS); + else if (cpuCount >= 32) + p->frameNumThreads = (p->sourceHeight > 2000) ? 8 : 6; // dual-socket 10-core IvyBridge or higher + else if (cpuCount >= 16) + p->frameNumThreads = 5; // 8 HT cores, or dual socket + else if (cpuCount >= 8) + p->frameNumThreads = 3; // 4 HT cores + else if (cpuCount >= 4) + p->frameNumThreads = 2; // Dual or Quad core + else + p->frameNumThreads = 1; } - else - poolThreadCount = 0; - if (!poolThreadCount) + m_numPools = 0; + if (allowPools) + m_threadPool = ThreadPool::allocThreadPools(p, m_numPools); + + if (!m_numPools) { // issue warnings if any of these features were requested if (p->bEnableWavefront) @@ -129,31 +145,40 @@ p->bEnableWavefront = p->bDistributeModeAnalysis = p->bDistributeMotionEstimation = 0; } - if (!p->frameNumThreads) - { - // auto-detect frame threads - int cpuCount = getCpuCount(); - if (!p->bEnableWavefront) - p->frameNumThreads = X265_MIN(cpuCount, (rows + 1) / 2); - else if (cpuCount >= 32) - p->frameNumThreads = (p->sourceHeight > 2000) ? 
8 : 6; // dual-socket 10-core IvyBridge or higher - else if (cpuCount >= 16) - p->frameNumThreads = 5; // 8 HT cores, or dual socket - else if (cpuCount >= 8) - p->frameNumThreads = 3; // 4 HT cores - else if (cpuCount >= 4) - p->frameNumThreads = 2; // Dual or Quad core - else - p->frameNumThreads = 1; - } + char buf[128]; + int len = 0; + if (p->bEnableWavefront) + len += sprintf(buf + len, "wpp(%d rows)", rows); + if (p->bDistributeModeAnalysis) + len += sprintf(buf + len, "%spmode", len ? "+" : ""); + if (p->bDistributeMotionEstimation) + len += sprintf(buf + len, "%spme ", len ? "+" : ""); + if (!len) + strcpy(buf, "none"); - x265_log(p, X265_LOG_INFO, "WPP streams / frame threads / pool : %d / %d / %d%s%s\n", - p->bEnableWavefront ? rows : 0, p->frameNumThreads, poolThreadCount, - p->bDistributeMotionEstimation ? " / pme" : "", p->bDistributeModeAnalysis ? " / pmode" : ""); + x265_log(p, X265_LOG_INFO, "frame threads / pool features : %d / %s\n", p->frameNumThreads, buf); - m_frameEncoder = new FrameEncoder[m_param->frameNumThreads]; for (int i = 0; i < m_param->frameNumThreads; i++) - m_frameEncoder[i].setThreadPool(m_threadPool); + m_frameEncoder[i] = new FrameEncoder; + + if (m_numPools) + { + for (int i = 0; i < m_param->frameNumThreads; i++) + { + int pool = i % m_numPools; + m_frameEncoder[i]->m_pool = &m_threadPool[pool]; + m_frameEncoder[i]->m_jpId = m_threadPool[pool].m_numProviders++; + m_threadPool[pool].m_jpTable[m_frameEncoder[i]->m_jpId] = m_frameEncoder[i]; + } + for (int i = 0; i < m_numPools; i++) + m_threadPool[i].start(); + } + else + { + /* CU stats and noise-reduction buffers are indexed by jpId, so it cannot be left as -1 */ + for (int i = 0; i < m_param->frameNumThreads; i++) + m_frameEncoder[i]->m_jpId = 0; + } if (!m_scalingList.init()) { @@ -168,27 +193,17 @@ m_aborted = true; m_scalingList.setupQuantMatrices(); - /* Allocate thread local data, one for each thread pool worker and - * if --no-wpp, one for each frame encoder */ - 
m_numThreadLocalData = poolThreadCount; - if (!m_param->bEnableWavefront) - m_numThreadLocalData += m_param->frameNumThreads; - m_threadLocalData = new ThreadLocalData[m_numThreadLocalData]; - for (int i = 0; i < m_numThreadLocalData; i++) - { - m_threadLocalData[i].analysis.setThreadPool(m_threadPool); - m_threadLocalData[i].analysis.initSearch(*m_param, m_scalingList); - m_threadLocalData[i].analysis.create(m_threadLocalData); + m_lookahead = new Lookahead(m_param, m_threadPool); + if (m_numPools) + { + m_lookahead->m_jpId = m_threadPool[0].m_numProviders++; + m_threadPool[0].m_jpTable[m_lookahead->m_jpId] = m_lookahead; } - if (!m_param->bEnableWavefront) - for (int i = 0; i < m_param->frameNumThreads; i++) - m_frameEncoder[i].m_tld = &m_threadLocalData[poolThreadCount + i]; - - m_lookahead = new Lookahead(m_param, m_threadPool); m_dpb = new DPB(m_param); - m_rateControl = new RateControl(m_param); + m_rateControl = new RateControl(*m_param); + initVPS(&m_vps); initSPS(&m_sps); initPPS(&m_pps); @@ -229,26 +244,29 @@ } } - if (m_frameEncoder) + int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; + int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; + for (int i = 0; i < m_param->frameNumThreads; i++) { - int numRows = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; - int numCols = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; - for (int i = 0; i < m_param->frameNumThreads; i++) + if (!m_frameEncoder[i]->init(this, numRows, numCols)) { - if (!m_frameEncoder[i].init(this, numRows, numCols, i)) - { - x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n"); - m_aborted = true; - } + x265_log(m_param, X265_LOG_ERROR, "Unable to initialize frame encoder, aborting\n"); + m_aborted = true; } } + for (int i = 0; i < m_param->frameNumThreads; i++) + { + m_frameEncoder[i]->start(); + m_frameEncoder[i]->m_done.wait(); /* wait for thread to initialize */ + } + if (m_param->bEmitHRDSEI) - 
m_rateControl->initHRD(&m_sps); - if (!m_rateControl->init(&m_sps)) + m_rateControl->initHRD(m_sps); + if (!m_rateControl->init(m_sps)) + m_aborted = true; + if (!m_lookahead->create()) m_aborted = true; - - m_lookahead->init(); if (m_param->analysisMode) { @@ -271,6 +289,29 @@ m_encodeStartTime = x265_mdate(); } +void Encoder::stop() +{ + if (m_rateControl) + m_rateControl->terminate(); // unblock all blocked RC calls + + if (m_lookahead) + m_lookahead->stop(); + + for (int i = 0; i < m_param->frameNumThreads; i++) + { + if (m_frameEncoder[i]) + { + m_frameEncoder[i]->getEncodedPicture(m_nalList); + m_frameEncoder[i]->m_threadActive = false; + m_frameEncoder[i]->m_enable.trigger(); + m_frameEncoder[i]->stop(); + } + } + + if (m_threadPool) + m_threadPool->stop(); +} + void Encoder::destroy() { if (m_exportedPic) @@ -279,28 +320,24 @@ m_exportedPic = NULL; } - if (m_rateControl) - m_rateControl->terminate(); // unblock all blocked RC calls - - if (m_frameEncoder) + for (int i = 0; i < m_param->frameNumThreads; i++) { - for (int i = 0; i < m_param->frameNumThreads; i++) + if (m_frameEncoder[i]) { - // Ensure frame encoder is idle before destroying it - m_frameEncoder[i].getEncodedPicture(m_nalList); - m_frameEncoder[i].destroy(); + m_frameEncoder[i]->destroy(); + delete m_frameEncoder[i]; } - - delete [] m_frameEncoder; } - for (int i = 0; i < m_numThreadLocalData; i++) - m_threadLocalData[i].destroy(); - - delete [] m_threadLocalData; + // thread pools can be cleaned up now that all the JobProviders are + // known to be shutdown + delete [] m_threadPool; if (m_lookahead) - m_lookahead->stop(); + { + m_lookahead->destroy(); + delete m_lookahead; + } delete m_dpb; if (m_rateControl) @@ -309,16 +346,6 @@ delete m_rateControl; } - // thread pool release should always happen last - if (m_threadPool) - m_threadPool->release(); - - if (m_lookahead) - { - m_lookahead->destroy(); - delete m_lookahead; - } - X265_FREE(m_cuOffsetY); X265_FREE(m_cuOffsetC); 
X265_FREE(m_buOffsetY); @@ -326,20 +353,27 @@ if (m_analysisFile) fclose(m_analysisFile); - free(m_param->analysisFileName); - free(m_param->csvfn); if (m_csvfpt) fclose(m_csvfpt); - free(m_param->rc.statFileName); // alloc'd by strdup - X265_FREE(m_param); + if (m_param) + { + free((void*)m_param->rc.lambdaFileName); // allocs by strdup + free(m_param->rc.statFileName); + free(m_param->analysisFileName); + free((void*)m_param->scalingLists); + free(m_param->csvfn); + free(m_param->numaPools); + + X265_FREE(m_param); + } } void Encoder::updateVbvPlan(RateControl* rc) { for (int i = 0; i < m_param->frameNumThreads; i++) { - FrameEncoder *encoder = &m_frameEncoder[i]; + FrameEncoder *encoder = m_frameEncoder[i]; if (encoder->m_rce.isActive && encoder->m_rce.poc != rc->m_curSlice->m_poc) { int64_t bits = (int64_t) X265_MAX(encoder->m_rce.frameSizeEstimated, encoder->m_rce.frameSizePlanned); @@ -366,6 +400,13 @@ * negative on malloc error or abort */ int Encoder::encode(const x265_picture* pic_in, x265_picture* pic_out) { +#if CHECKED_BUILD || _DEBUG + if (g_checkFailures) + { + x265_log(m_param, X265_LOG_ERROR, "encoder aborting because of internal error\n"); + return -1; + } +#endif if (m_aborted) return -1; @@ -439,9 +480,9 @@ inFrame = m_dpb->m_freeList.popBack(); /* Copy input picture into a Frame and PicYuv, send to lookahead */ - inFrame->m_poc = ++m_pocLast; inFrame->m_fencPic->copyFromPicture(*pic_in, m_sps.conformanceWindow.rightOffset, m_sps.conformanceWindow.bottomOffset); + inFrame->m_poc = ++m_pocLast; inFrame->m_userData = pic_in->userData; inFrame->m_pts = pic_in->pts; inFrame->m_forceqp = pic_in->forceqp; @@ -453,21 +494,14 @@ /* Encoder holds a reference count until stats collection is finished */ ATOMIC_INC(&inFrame->m_countRefEncoders); - bool bEnableWP = m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred; - if (m_param->rc.aqMode || bEnableWP) + + if ((m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred) 
&& + (m_param->rc.cuTree && m_param->rc.bStatRead)) { - if (m_param->rc.cuTree && m_param->rc.bStatRead) - { - if (!m_rateControl->cuTreeReadFor2Pass(inFrame)) - { - m_aborted = 1; - return -1; - } - } - else + if (!m_rateControl->cuTreeReadFor2Pass(inFrame)) { - ProfileScopeEvent(prelookahead); - m_rateControl->calcAdaptiveQuantFrame(inFrame); + m_aborted = 1; + return -1; } } @@ -490,13 +524,13 @@ sliceType = inputPic->analysisData.sliceType; } - m_lookahead->addPicture(inFrame, sliceType); + m_lookahead->addPicture(*inFrame, sliceType); m_numDelayedPic++; } else m_lookahead->flush(); - FrameEncoder *curEncoder = &m_frameEncoder[m_curEncoder]; + FrameEncoder *curEncoder = m_frameEncoder[m_curEncoder]; m_curEncoder = (m_curEncoder + 1) % m_param->frameNumThreads; int ret = 0; @@ -631,7 +665,7 @@ slice->m_sps = &m_sps; slice->m_pps = &m_pps; slice->m_maxNumMergeCand = m_param->maxNumMergeCand; - slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_CU_PARTITIONS); + slice->m_endCUAddr = slice->realEndAddress(m_sps.numCUsInFrame * NUM_4x4_PARTITIONS); frameEnc->m_reconPic->m_cuOffsetC = m_cuOffsetC; frameEnc->m_reconPic->m_cuOffsetY = m_cuOffsetY; frameEnc->m_reconPic->m_buOffsetC = m_buOffsetC; @@ -661,7 +695,7 @@ uint32_t numCUsInFrame = widthInCU * heightInCU; analysis->numCUsInFrame = numCUsInFrame; - analysis->numPartitions = NUM_CU_PARTITIONS; + analysis->numPartitions = NUM_4x4_PARTITIONS; allocAnalysis(analysis); } @@ -810,6 +844,143 @@ x265_log(m_param, X265_LOG_INFO, "lossless compression ratio %.2f::1\n", uncompressed / m_analyzeAll.m_accBits); } + +#if DETAILED_CU_STATS + /* Summarize stats from all frame encoders */ + CUStats cuStats; + for (int i = 0; i < m_param->frameNumThreads; i++) + cuStats.accumulate(m_frameEncoder[i]->m_cuStats); + + if (!cuStats.totalCTUTime) + return; + + int totalWorkerCount = 0; + for (int i = 0; i < m_numPools; i++) + totalWorkerCount += m_threadPool[i].m_numWorkers; + + int64_t batchElapsedTime, 
coopSliceElapsedTime; + uint64_t batchCount, coopSliceCount; + m_lookahead->getWorkerStats(batchElapsedTime, batchCount, coopSliceElapsedTime, coopSliceCount); + int64_t lookaheadWorkerTime = m_lookahead->m_slicetypeDecideElapsedTime + m_lookahead->m_preLookaheadElapsedTime + + batchElapsedTime + coopSliceElapsedTime; + + int64_t totalWorkerTime = cuStats.totalCTUTime + cuStats.loopFilterElapsedTime + cuStats.pmodeTime + + cuStats.pmeTime + lookaheadWorkerTime + cuStats.weightAnalyzeTime; + int64_t elapsedEncodeTime = x265_mdate() - m_encodeStartTime; + + int64_t interRDOTotalTime = 0, intraRDOTotalTime = 0; + uint64_t interRDOTotalCount = 0, intraRDOTotalCount = 0; + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + { + interRDOTotalTime += cuStats.interRDOElapsedTime[i]; + intraRDOTotalTime += cuStats.intraRDOElapsedTime[i]; + interRDOTotalCount += cuStats.countInterRDO[i]; + intraRDOTotalCount += cuStats.countIntraRDO[i]; + } + + /* Time within compressCTU() and pmode tasks not captured by ME, Intra mode selection, or RDO (2Nx2N merge, 2Nx2N bidir, etc) */ + int64_t unaccounted = (cuStats.totalCTUTime + cuStats.pmodeTime) - + (cuStats.intraAnalysisElapsedTime + cuStats.motionEstimationElapsedTime + interRDOTotalTime + intraRDOTotalTime); + +#define ELAPSED_SEC(val) ((double)(val) / 1000000) +#define ELAPSED_MSEC(val) ((double)(val) / 1000) + + if (m_param->bDistributeMotionEstimation && cuStats.countPMEMasters) + { + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in motion estimation, averaging %.3lf CU inter modes per CTU\n", + 100.0 * (cuStats.motionEstimationElapsedTime + cuStats.pmeTime) / totalWorkerTime, + (double)cuStats.countMotionEstimate / cuStats.totalCTUs); + x265_log(m_param, X265_LOG_INFO, "CU: %.3lf PME masters per inter CU, each blocked an average of %.3lf ns\n", + (double)cuStats.countPMEMasters / cuStats.countMotionEstimate, + (double)cuStats.pmeBlockTime / cuStats.countPMEMasters); + x265_log(m_param, X265_LOG_INFO, "CU: %.3lf slaves 
per PME master, each took an average of %.3lf ms\n", + (double)cuStats.countPMETasks / cuStats.countPMEMasters, + ELAPSED_MSEC(cuStats.pmeTime) / cuStats.countPMETasks); + } + else + { + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in motion estimation, averaging %.3lf CU inter modes per CTU\n", + 100.0 * cuStats.motionEstimationElapsedTime / totalWorkerTime, + (double)cuStats.countMotionEstimate / cuStats.totalCTUs); + } + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in intra analysis, averaging %.3lf Intra PUs per CTU\n", + 100.0 * cuStats.intraAnalysisElapsedTime / totalWorkerTime, + (double)cuStats.countIntraAnalysis / cuStats.totalCTUs); + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in inter RDO, measuring %.3lf inter/merge predictions per CTU\n", + 100.0 * interRDOTotalTime / totalWorkerTime, + (double)interRDOTotalCount / cuStats.totalCTUs); + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in intra RDO, measuring %.3lf intra predictions per CTU\n", + 100.0 * intraRDOTotalTime / totalWorkerTime, + (double)intraRDOTotalCount / cuStats.totalCTUs); + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in loop filters, average %.3lf ms per call\n", + 100.0 * cuStats.loopFilterElapsedTime / totalWorkerTime, + ELAPSED_MSEC(cuStats.loopFilterElapsedTime) / cuStats.countLoopFilter); + if (cuStats.countWeightAnalyze && cuStats.weightAnalyzeTime) + { + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in weight analysis, average %.3lf ms per call\n", + 100.0 * cuStats.weightAnalyzeTime / totalWorkerTime, + ELAPSED_MSEC(cuStats.weightAnalyzeTime) / cuStats.countWeightAnalyze); + } + if (m_param->bDistributeModeAnalysis && cuStats.countPModeMasters) + { + x265_log(m_param, X265_LOG_INFO, "CU: %.3lf PMODE masters per CTU, each blocked an average of %.3lf ns\n", + (double)cuStats.countPModeMasters / cuStats.totalCTUs, + (double)cuStats.pmodeBlockTime / cuStats.countPModeMasters); + x265_log(m_param, 
X265_LOG_INFO, "CU: %.3lf slaves per PMODE master, each took average of %.3lf ms\n", + (double)cuStats.countPModeTasks / cuStats.countPModeMasters, + ELAPSED_MSEC(cuStats.pmodeTime) / cuStats.countPModeTasks); + } + + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in slicetypeDecide (avg %.3lfms) and prelookahead (avg %.3lfms)\n", + 100.0 * lookaheadWorkerTime / totalWorkerTime, + ELAPSED_MSEC(m_lookahead->m_slicetypeDecideElapsedTime) / m_lookahead->m_countSlicetypeDecide, + ELAPSED_MSEC(m_lookahead->m_preLookaheadElapsedTime) / m_lookahead->m_countPreLookahead); + + x265_log(m_param, X265_LOG_INFO, "CU: %%%05.2lf time spent in other tasks\n", + 100.0 * unaccounted / totalWorkerTime); + + if (intraRDOTotalTime && intraRDOTotalCount) + { + x265_log(m_param, X265_LOG_INFO, "CU: Intra RDO time per depth %%%05.2lf %%%05.2lf %%%05.2lf %%%05.2lf\n", + 100.0 * cuStats.intraRDOElapsedTime[0] / intraRDOTotalTime, // 64 + 100.0 * cuStats.intraRDOElapsedTime[1] / intraRDOTotalTime, // 32 + 100.0 * cuStats.intraRDOElapsedTime[2] / intraRDOTotalTime, // 16 + 100.0 * cuStats.intraRDOElapsedTime[3] / intraRDOTotalTime); // 8 + x265_log(m_param, X265_LOG_INFO, "CU: Intra RDO calls per depth %%%05.2lf %%%05.2lf %%%05.2lf %%%05.2lf\n", + 100.0 * cuStats.countIntraRDO[0] / intraRDOTotalCount, // 64 + 100.0 * cuStats.countIntraRDO[1] / intraRDOTotalCount, // 32 + 100.0 * cuStats.countIntraRDO[2] / intraRDOTotalCount, // 16 + 100.0 * cuStats.countIntraRDO[3] / intraRDOTotalCount); // 8 + } + + if (interRDOTotalTime && interRDOTotalCount) + { + x265_log(m_param, X265_LOG_INFO, "CU: Inter RDO time per depth %%%05.2lf %%%05.2lf %%%05.2lf %%%05.2lf\n", + 100.0 * cuStats.interRDOElapsedTime[0] / interRDOTotalTime, // 64 + 100.0 * cuStats.interRDOElapsedTime[1] / interRDOTotalTime, // 32 + 100.0 * cuStats.interRDOElapsedTime[2] / interRDOTotalTime, // 16 + 100.0 * cuStats.interRDOElapsedTime[3] / interRDOTotalTime); // 8 + x265_log(m_param, X265_LOG_INFO, "CU: Inter RDO calls 
per depth %%%05.2lf %%%05.2lf %%%05.2lf %%%05.2lf\n", + 100.0 * cuStats.countInterRDO[0] / interRDOTotalCount, // 64 + 100.0 * cuStats.countInterRDO[1] / interRDOTotalCount, // 32 + 100.0 * cuStats.countInterRDO[2] / interRDOTotalCount, // 16 + 100.0 * cuStats.countInterRDO[3] / interRDOTotalCount); // 8 + } + + x265_log(m_param, X265_LOG_INFO, "CU: " X265_LL " %dX%d CTUs compressed in %.3lf seconds, %.3lf CTUs per worker-second\n", + cuStats.totalCTUs, g_maxCUSize, g_maxCUSize, + ELAPSED_SEC(totalWorkerTime), + cuStats.totalCTUs / ELAPSED_SEC(totalWorkerTime)); + + if (m_threadPool) + x265_log(m_param, X265_LOG_INFO, "CU: %.3lf average worker utilization, %%%05.2lf of theoretical maximum utilization\n", + (double)totalWorkerTime / elapsedEncodeTime, + 100.0 * totalWorkerTime / (elapsedEncodeTime * totalWorkerCount)); + +#undef ELAPSED_SEC +#undef ELAPSED_MSEC +#endif + if (!m_param->bLogCuStats) return; @@ -823,9 +994,11 @@ StatisticLog finalLog; for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++) { + int cuSize = g_maxCUSize >> depth; + for (int i = 0; i < m_param->frameNumThreads; i++) { - StatisticLog& enclog = m_frameEncoder[i].m_sliceTypeLog[sliceType]; + StatisticLog& enclog = m_frameEncoder[i]->m_sliceTypeLog[sliceType]; if (!depth) finalLog.totalCu += enclog.totalCu; finalLog.cntIntra[depth] += enclog.cntIntra[depth]; @@ -836,7 +1009,7 @@ finalLog.cuInterDistribution[depth][m] += enclog.cuInterDistribution[depth][m]; } - if (depth == g_maxCUDepth) + if (cuSize == 8 && m_sps.quadtreeTULog2MinSize < 3) finalLog.cntIntraNxN += enclog.cntIntraNxN; if (sliceType != I_SLICE) { @@ -901,7 +1074,6 @@ } // print statistics - int cuSize = g_maxCUSize >> depth; char stats[256] = { 0 }; int len = 0; if (sliceType != I_SLICE) @@ -929,14 +1101,14 @@ cuIntraDistribution[1], cuIntraDistribution[2]); if (sliceType != I_SLICE) { - if (depth == g_maxCUDepth) + if (cuSize == 8 && m_sps.quadtreeTULog2MinSize < 3) len += sprintf(stats + len, " %dx%d "X265_LL "%%", cuSize / 
2, cuSize / 2, cntIntraNxN); } len += sprintf(stats + len, ")"); if (sliceType == I_SLICE) { - if (depth == g_maxCUDepth) + if (cuSize == 8 && m_sps.quadtreeTULog2MinSize < 3) len += sprintf(stats + len, " %dx%d: "X265_LL "%%", cuSize / 2, cuSize / 2, cntIntraNxN); } } @@ -1301,13 +1473,17 @@ } } -void Encoder::initSPS(SPS *sps) +void Encoder::initVPS(VPS *vps) { - m_vps.ptl.progressiveSourceFlag = !m_param->interlaceMode; - m_vps.ptl.interlacedSourceFlag = !!m_param->interlaceMode; - m_vps.ptl.nonPackedConstraintFlag = false; - m_vps.ptl.frameOnlyConstraintFlag = !m_param->interlaceMode; + /* Note that much of the VPS is initialized by determineLevel() */ + vps->ptl.progressiveSourceFlag = !m_param->interlaceMode; + vps->ptl.interlacedSourceFlag = !!m_param->interlaceMode; + vps->ptl.nonPackedConstraintFlag = false; + vps->ptl.frameOnlyConstraintFlag = !m_param->interlaceMode; +} +void Encoder::initSPS(SPS *sps) +{ sps->conformanceWindow = m_conformanceWindow; sps->chromaFormatIdc = m_param->internalCsp; sps->picWidthInLumaSamples = m_param->sourceWidth; @@ -1315,13 +1491,13 @@ sps->numCuInWidth = (m_param->sourceWidth + g_maxCUSize - 1) / g_maxCUSize; sps->numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; sps->numCUsInFrame = sps->numCuInWidth * sps->numCuInHeight; - sps->numPartitions = NUM_CU_PARTITIONS; - sps->numPartInCUSize = 1 << g_maxFullDepth; + sps->numPartitions = NUM_4x4_PARTITIONS; + sps->numPartInCUSize = 1 << g_unitSizeDepth; sps->log2MinCodingBlockSize = g_maxLog2CUSize - g_maxCUDepth; sps->log2DiffMaxMinCodingBlockSize = g_maxCUDepth; - - sps->quadtreeTULog2MaxSize = X265_MIN(g_maxLog2CUSize, 5); + uint32_t maxLog2TUSize = (uint32_t)g_log2Size[m_param->maxTUSize]; + sps->quadtreeTULog2MaxSize = X265_MIN(g_maxLog2CUSize, maxLog2TUSize); sps->quadtreeTULog2MinSize = 2; sps->quadtreeTUMaxDepthInter = m_param->tuQTMaxInterDepth; sps->quadtreeTUMaxDepthIntra = m_param->tuQTMaxIntraDepth; @@ -1331,9 +1507,10 @@ sps->bUseAMP = 
m_param->bEnableAMP; sps->maxAMPDepth = m_param->bEnableAMP ? g_maxCUDepth : 0; + sps->maxTempSubLayers = m_param->bEnableTemporalSubLayers ? 2 : 1; sps->maxDecPicBuffering = m_vps.maxDecPicBuffering; sps->numReorderPics = m_vps.numReorderPics; - sps->maxLatencyIncrease = m_param->bframes; + sps->maxLatencyIncrease = m_vps.maxLatencyIncrease = m_param->bframes; sps->bUseStrongIntraSmoothing = m_param->bEnableStrongIntraSmoothing; sps->bTemporalMVPEnabled = m_param->bEnableTemporalMvp; @@ -1433,18 +1610,12 @@ } p->keyframeMin = X265_MAX(1, X265_MIN(p->keyframeMin, p->keyframeMax / 2 + 1)); - if (p->bBPyramid && !p->bframes) + if (!p->bframes) p->bBPyramid = 0; + if (!p->rdoqLevel) + p->psyRdoq = 0; /* Disable features which are not supported by the current RD level */ - if (p->rdLevel < 5) - { - if (p->bEnableCbfFastMode) /* impossible */ - x265_log(p, X265_LOG_WARNING, "--fast-cbf disabled, requires --rdlevel 5 or higher\n"); - p->bEnableCbfFastMode = 0; - } - if (p->rdLevel < 4) - p->psyRdoq = 0; /* impossible */ if (p->rdLevel < 3) { if (p->bCULossless) /* impossible */ @@ -1504,13 +1675,19 @@ p->rc.cuTree = 0; } + if (p->maxTUSize > p->maxCUSize) + { + x265_log(p, X265_LOG_WARNING, "Max TU size should be less than or equal to max CU size, setting max TU size = %d\n", p->maxCUSize); + p->maxTUSize = p->maxCUSize; + } + if (p->rc.aqStrength == 0 && p->rc.cuTree == 0) p->rc.aqMode = X265_AQ_NONE; if (p->rc.aqMode == X265_AQ_NONE && p->rc.cuTree == 0) p->rc.aqStrength = 0; - if (p->totalFrames <= 2 * ((float)p->fpsNum) / p->fpsDenom && p->rc.bStrictCbr) + if (p->totalFrames && p->totalFrames <= 2 * ((float)p->fpsNum) / p->fpsDenom && p->rc.bStrictCbr) p->lookaheadDepth = p->totalFrames; if (p->scalingLists && p->internalCsp == X265_CSP_I444) @@ -1534,6 +1711,12 @@ p->bDistributeMotionEstimation = p->bDistributeModeAnalysis = 0; } + if (p->bEnableTemporalSubLayers && !p->bframes) + { + x265_log(p, X265_LOG_WARNING, "B frames not enabled, temporal sublayer 
disabled\n"); + p->bEnableTemporalSubLayers = 0; + } + m_bframeDelay = p->bframes ? (p->bBPyramid ? 2 : 1) : 0; p->bFrameBias = X265_MIN(X265_MAX(-90, p->bFrameBias), 100); @@ -1568,6 +1751,10 @@ x265_log(p, X265_LOG_WARNING, "--tune %s should be used if attempting to benchmark %s!\n", s, s); } + /* some options make no sense if others are disabled */ + p->bSaoNonDeblocked &= p->bEnableSAO; + p->bEnableTSkipFast &= p->bEnableTransformSkip; + /* initialize the conformance window */ m_conformanceWindow.bEnabled = false; m_conformanceWindow.rightOffset = 0; @@ -1576,10 +1763,10 @@ m_conformanceWindow.leftOffset = 0; /* set pad size if width is not multiple of the minimum CU size */ - if (p->sourceWidth & (MIN_CU_SIZE - 1)) + if (p->sourceWidth & (p->minCUSize - 1)) { - uint32_t rem = p->sourceWidth & (MIN_CU_SIZE - 1); - uint32_t padsize = MIN_CU_SIZE - rem; + uint32_t rem = p->sourceWidth & (p->minCUSize - 1); + uint32_t padsize = p->minCUSize - rem; p->sourceWidth += padsize; m_conformanceWindow.bEnabled = true; @@ -1587,10 +1774,10 @@ } /* set pad size if height is not multiple of the minimum CU size */ - if (p->sourceHeight & (MIN_CU_SIZE - 1)) + if (p->sourceHeight & (p->minCUSize - 1)) { - uint32_t rem = p->sourceHeight & (MIN_CU_SIZE - 1); - uint32_t padsize = MIN_CU_SIZE - rem; + uint32_t rem = p->sourceHeight & (p->minCUSize - 1); + uint32_t padsize = p->minCUSize - rem; p->sourceHeight += padsize; m_conformanceWindow.bEnabled = true; @@ -1613,6 +1800,7 @@ CHECKED_MALLOC(intraData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(intraData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(intraData->partSizes, char, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC(intraData->chromaModes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); analysis->intraData = intraData; } else @@ -1622,6 +1810,7 @@ CHECKED_MALLOC_ZERO(interData->ref, int32_t, 
analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2); CHECKED_MALLOC(interData->depth, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); CHECKED_MALLOC(interData->modes, uint8_t, analysis->numPartitions * analysis->numCUsInFrame); + CHECKED_MALLOC_ZERO(interData->bestMergeCand, uint32_t, analysis->numCUsInFrame * CUGeom::MAX_GEOMS); analysis->interData = interData; } return; @@ -1638,6 +1827,7 @@ X265_FREE(((analysis_intra_data*)analysis->intraData)->depth); X265_FREE(((analysis_intra_data*)analysis->intraData)->modes); X265_FREE(((analysis_intra_data*)analysis->intraData)->partSizes); + X265_FREE(((analysis_intra_data*)analysis->intraData)->chromaModes); X265_FREE(analysis->intraData); } else @@ -1645,6 +1835,7 @@ X265_FREE(((analysis_inter_data*)analysis->interData)->ref); X265_FREE(((analysis_inter_data*)analysis->interData)->depth); X265_FREE(((analysis_inter_data*)analysis->interData)->modes); + X265_FREE(((analysis_inter_data*)analysis->interData)->bestMergeCand); X265_FREE(analysis->interData); } } @@ -1702,6 +1893,7 @@ X265_FREAD(((analysis_intra_data *)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FREAD(((analysis_intra_data *)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FREAD(((analysis_intra_data *)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FREAD(((analysis_intra_data *)analysis->intraData)->chromaModes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); analysis->sliceType = X265_TYPE_I; consumedBytes += frameRecordSize; } @@ -1710,6 +1902,7 @@ X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile); X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), 
analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FREAD(((analysis_inter_data *)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile); consumedBytes += frameRecordSize; totalConsumedBytes = consumedBytes; } @@ -1718,6 +1911,7 @@ X265_FREAD(((analysis_inter_data *)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile); X265_FREAD(((analysis_inter_data *)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FREAD(((analysis_inter_data *)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FREAD(((analysis_inter_data *)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile); consumedBytes += frameRecordSize; } #undef X265_FREAD @@ -1739,16 +1933,18 @@ analysis->frameRecordSize = sizeof(analysis->frameRecordSize) + sizeof(analysis->poc) + sizeof(analysis->sliceType) + sizeof(analysis->numCUsInFrame) + sizeof(analysis->numPartitions); if (analysis->sliceType == X265_TYPE_IDR || analysis->sliceType == X265_TYPE_I) - analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 3; + analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 4; else if (analysis->sliceType == X265_TYPE_P) { analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU; analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2; + analysis->frameRecordSize += sizeof(uint32_t) * analysis->numCUsInFrame * CUGeom::MAX_GEOMS; } else { 
analysis->frameRecordSize += sizeof(int32_t) * analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2; analysis->frameRecordSize += sizeof(uint8_t) * analysis->numCUsInFrame * analysis->numPartitions * 2; + analysis->frameRecordSize += sizeof(uint32_t) * analysis->numCUsInFrame * CUGeom::MAX_GEOMS; } X265_FWRITE(&analysis->frameRecordSize, sizeof(uint32_t), 1, m_analysisFile); @@ -1762,18 +1958,21 @@ X265_FWRITE(((analysis_intra_data*)analysis->intraData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FWRITE(((analysis_intra_data*)analysis->intraData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FWRITE(((analysis_intra_data*)analysis->intraData)->partSizes, sizeof(char), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_intra_data*)analysis->intraData)->chromaModes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); } else if (analysis->sliceType == X265_TYPE_P) { X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU, m_analysisFile); X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile); } else { X265_FWRITE(((analysis_inter_data*)analysis->interData)->ref, sizeof(int32_t), analysis->numCUsInFrame * X265_MAX_PRED_MODE_PER_CTU * 2, m_analysisFile); X265_FWRITE(((analysis_inter_data*)analysis->interData)->depth, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); 
X265_FWRITE(((analysis_inter_data*)analysis->interData)->modes, sizeof(uint8_t), analysis->numCUsInFrame * analysis->numPartitions, m_analysisFile); + X265_FWRITE(((analysis_inter_data*)analysis->interData)->bestMergeCand, sizeof(uint32_t), analysis->numCUsInFrame * CUGeom::MAX_GEOMS, m_analysisFile); } #undef X265_FWRITE } diff -Nru x265-1.5/source/encoder/encoder.h x265-1.6/source/encoder/encoder.h --- x265-1.5/source/encoder/encoder.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/encoder.h 2015-04-02 16:46:36.000000000 +0000 @@ -70,7 +70,6 @@ class Lookahead; class RateControl; class ThreadPool; -struct ThreadLocalData; class Encoder : public x265_encoder { @@ -86,11 +85,12 @@ int64_t m_prevReorderedPts[2]; ThreadPool* m_threadPool; - FrameEncoder* m_frameEncoder; + FrameEncoder* m_frameEncoder[X265_MAX_FRAME_THREADS]; DPB* m_dpb; Frame* m_exportedPic; + int m_numPools; int m_curEncoder; /* cached PicYuv offset arrays, shared by all instances of @@ -120,14 +120,12 @@ PPS m_pps; NALList m_nalList; ScalingList m_scalingList; // quantization matrix information - int m_numThreadLocalData; int m_lastBPSEI; uint32_t m_numDelayedPic; x265_param* m_param; RateControl* m_rateControl; - ThreadLocalData* m_threadLocalData; Lookahead* m_lookahead; Window m_conformanceWindow; @@ -138,6 +136,7 @@ ~Encoder() {} void create(); + void stop(); void destroy(); int encode(const x265_picture* pic, x265_picture *pic_out); @@ -154,8 +153,6 @@ char* statsCSVString(EncStats& stat, char* buffer); - void setThreadPool(ThreadPool* p) { m_threadPool = p; } - void configure(x265_param *param); void updateVbvPlan(RateControl* rc); @@ -172,6 +169,7 @@ protected: + void initVPS(VPS *vps); void initSPS(SPS *sps); void initPPS(PPS *pps); }; diff -Nru x265-1.5/source/encoder/entropy.cpp x265-1.6/source/encoder/entropy.cpp --- x265-1.5/source/encoder/entropy.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/entropy.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -43,6 
+43,7 @@ { markValid(); m_fracBits = 0; + m_pad = 0; X265_CHECK(sizeof(m_contextState) >= sizeof(m_contextState[0]) * MAX_OFF_CTX_MOD, "context state table is too small\n"); } @@ -51,17 +52,21 @@ WRITE_CODE(0, 4, "vps_video_parameter_set_id"); WRITE_CODE(3, 2, "vps_reserved_three_2bits"); WRITE_CODE(0, 6, "vps_reserved_zero_6bits"); - WRITE_CODE(0, 3, "vps_max_sub_layers_minus1"); - WRITE_FLAG(1, "vps_temporal_id_nesting_flag"); + WRITE_CODE(vps.maxTempSubLayers - 1, 3, "vps_max_sub_layers_minus1"); + WRITE_FLAG(vps.maxTempSubLayers == 1, "vps_temporal_id_nesting_flag"); WRITE_CODE(0xffff, 16, "vps_reserved_ffff_16bits"); - codeProfileTier(vps.ptl); + codeProfileTier(vps.ptl, vps.maxTempSubLayers); WRITE_FLAG(true, "vps_sub_layer_ordering_info_present_flag"); - WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]"); - WRITE_UVLC(vps.numReorderPics, "vps_num_reorder_pics[i]"); - WRITE_UVLC(0, "vps_max_latency_increase_plus1[i]"); + for (uint32_t i = 0; i < vps.maxTempSubLayers; i++) + { + WRITE_UVLC(vps.maxDecPicBuffering - 1, "vps_max_dec_pic_buffering_minus1[i]"); + WRITE_UVLC(vps.numReorderPics, "vps_num_reorder_pics[i]"); + WRITE_UVLC(vps.maxLatencyIncrease + 1, "vps_max_latency_increase_plus1[i]"); + } + WRITE_CODE(0, 6, "vps_max_nuh_reserved_zero_layer_id"); WRITE_UVLC(0, "vps_max_op_sets_minus1"); WRITE_FLAG(0, "vps_timing_info_present_flag"); /* we signal timing info in SPS-VUI */ @@ -71,16 +76,16 @@ void Entropy::codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl) { WRITE_CODE(0, 4, "sps_video_parameter_set_id"); - WRITE_CODE(0, 3, "sps_max_sub_layers_minus1"); - WRITE_FLAG(1, "sps_temporal_id_nesting_flag"); + WRITE_CODE(sps.maxTempSubLayers - 1, 3, "sps_max_sub_layers_minus1"); + WRITE_FLAG(sps.maxTempSubLayers == 1, "sps_temporal_id_nesting_flag"); - codeProfileTier(ptl); + codeProfileTier(ptl, sps.maxTempSubLayers); WRITE_UVLC(0, "sps_seq_parameter_set_id"); WRITE_UVLC(sps.chromaFormatIdc, 
"chroma_format_idc"); if (sps.chromaFormatIdc == X265_CSP_I444) - WRITE_FLAG(0, "separate_colour_plane_flag"); + WRITE_FLAG(0, "separate_colour_plane_flag"); WRITE_UVLC(sps.picWidthInLumaSamples, "pic_width_in_luma_samples"); WRITE_UVLC(sps.picHeightInLumaSamples, "pic_height_in_luma_samples"); @@ -101,9 +106,12 @@ WRITE_UVLC(BITS_FOR_POC - 4, "log2_max_pic_order_cnt_lsb_minus4"); WRITE_FLAG(true, "sps_sub_layer_ordering_info_present_flag"); - WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]"); - WRITE_UVLC(sps.numReorderPics, "sps_num_reorder_pics[i]"); - WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]"); + for (uint32_t i = 0; i < sps.maxTempSubLayers; i++) + { + WRITE_UVLC(sps.maxDecPicBuffering - 1, "sps_max_dec_pic_buffering_minus1[i]"); + WRITE_UVLC(sps.numReorderPics, "sps_num_reorder_pics[i]"); + WRITE_UVLC(sps.maxLatencyIncrease + 1, "sps_max_latency_increase_plus1[i]"); + } WRITE_UVLC(sps.log2MinCodingBlockSize - 3, "log2_min_coding_block_size_minus3"); WRITE_UVLC(sps.log2DiffMaxMinCodingBlockSize, "log2_diff_max_min_coding_block_size"); @@ -129,7 +137,7 @@ WRITE_FLAG(sps.bUseStrongIntraSmoothing, "sps_strong_intra_smoothing_enable_flag"); WRITE_FLAG(1, "vui_parameters_present_flag"); - codeVUI(sps.vuiParameters); + codeVUI(sps.vuiParameters, sps.maxTempSubLayers); WRITE_FLAG(0, "sps_extension_flag"); } @@ -184,7 +192,7 @@ WRITE_FLAG(0, "pps_extension_flag"); } -void Entropy::codeProfileTier(const ProfileTierLevel& ptl) +void Entropy::codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers) { WRITE_CODE(0, 2, "XXX_profile_space[]"); WRITE_FLAG(ptl.tierFlag, "XXX_tier_flag[]"); @@ -222,9 +230,17 @@ } WRITE_CODE(ptl.levelIdc, 8, "general_level_idc"); + + if (maxTempSubLayers > 1) + { + WRITE_FLAG(0, "sub_layer_profile_present_flag[i]"); + WRITE_FLAG(0, "sub_layer_level_present_flag[i]"); + for (int i = maxTempSubLayers - 1; i < 8 ; i++) + WRITE_CODE(0, 2, "reserved_zero_2bits"); + } } -void 
Entropy::codeVUI(const VUI& vui) +void Entropy::codeVUI(const VUI& vui, int maxSubTLayers) { WRITE_FLAG(vui.aspectRatioInfoPresentFlag, "aspect_ratio_info_present_flag"); if (vui.aspectRatioInfoPresentFlag) @@ -282,7 +298,7 @@ WRITE_FLAG(vui.hrdParametersPresentFlag, "vui_hrd_parameters_present_flag"); if (vui.hrdParametersPresentFlag) - codeHrdParameters(vui.hrdParameters); + codeHrdParameters(vui.hrdParameters, maxSubTLayers); WRITE_FLAG(0, "bitstream_restriction_flag"); } @@ -329,7 +345,7 @@ } } -void Entropy::codeHrdParameters(const HRDInfo& hrd) +void Entropy::codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers) { WRITE_FLAG(1, "nal_hrd_parameters_present_flag"); WRITE_FLAG(0, "vcl_hrd_parameters_present_flag"); @@ -342,13 +358,16 @@ WRITE_CODE(hrd.cpbRemovalDelayLength - 1, 5, "au_cpb_removal_delay_length_minus1"); WRITE_CODE(hrd.dpbOutputDelayLength - 1, 5, "dpb_output_delay_length_minus1"); - WRITE_FLAG(1, "fixed_pic_rate_general_flag"); - WRITE_UVLC(0, "elemental_duration_in_tc_minus1"); - WRITE_UVLC(0, "cpb_cnt_minus1"); - - WRITE_UVLC(hrd.bitRateValue - 1, "bit_rate_value_minus1"); - WRITE_UVLC(hrd.cpbSizeValue - 1, "cpb_size_value_minus1"); - WRITE_FLAG(hrd.cbrFlag, "cbr_flag"); + for (int i = 0; i < maxSubTLayers; i++) + { + WRITE_FLAG(1, "fixed_pic_rate_general_flag"); + WRITE_UVLC(0, "elemental_duration_in_tc_minus1"); + WRITE_UVLC(0, "cpb_cnt_minus1"); + + WRITE_UVLC(hrd.bitRateValue - 1, "bit_rate_value_minus1"); + WRITE_UVLC(hrd.cpbSizeValue - 1, "cpb_size_value_minus1"); + WRITE_FLAG(hrd.cbrFlag, "cbr_flag"); + } } void Entropy::codeAUD(const Slice& slice) @@ -521,15 +540,14 @@ { const Slice* slice = ctu.m_slice; - if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP) - bEncodeDQP = true; - int cuSplitFlag = !(cuGeom.flags & CUGeom::LEAF); int cuUnsplitFlag = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); if (!cuUnsplitFlag) { uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (depth == slice->m_pps->maxCuDQPDepth && 
slice->m_pps->bUseDQP) + bEncodeDQP = true; for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) { const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx); @@ -539,13 +557,14 @@ return; } - // We need to split, so don't try these modes. if (cuSplitFlag) codeSplitFlag(ctu, absPartIdx, depth); if (depth < ctu.m_cuDepth[absPartIdx] && depth < g_maxCUDepth) { uint32_t qNumParts = cuGeom.numPartitions >> 2; + if (depth == slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP) + bEncodeDQP = true; for (uint32_t qIdx = 0; qIdx < 4; ++qIdx, absPartIdx += qNumParts) { const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + qIdx); @@ -554,6 +573,9 @@ return; } + if (depth <= slice->m_pps->maxCuDQPDepth && slice->m_pps->bUseDQP) + bEncodeDQP = true; + if (slice->m_pps->bTransquantBypassEnabled) codeCUTransquantBypassFlag(ctu.m_tqBypass[absPartIdx]); @@ -654,7 +676,7 @@ { // Encode slice finish bool bTerminateSlice = false; - if (cuAddr + (NUM_CU_PARTITIONS >> (depth << 1)) == realEndAddress) + if (cuAddr + (NUM_4x4_PARTITIONS >> (depth << 1)) == realEndAddress) bTerminateSlice = true; // The 1-terminating bit is added to all streams, so don't add it here when it's 1. 
@@ -666,78 +688,78 @@ } } -void Entropy::encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize, +void Entropy::encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t curDepth, uint32_t log2CurSize, bool& bCodeDQP, const uint32_t depthRange[2]) { - const bool subdiv = cu.m_tuDepth[absPartIdx] > tuDepth; + const bool subdiv = cu.m_tuDepth[absPartIdx] > curDepth; /* in each of these conditions, the subdiv flag is implied and not signaled, * so we have checks to make sure the implied value matches our intentions */ - if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth) + if (cu.isIntra(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && log2CurSize == MIN_LOG2_CU_SIZE) { X265_CHECK(subdiv, "intra NxN requires TU depth below CU depth\n"); } - else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && !tuDepth && - cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1) + else if (cu.isInter(absPartIdx) && cu.m_partSize[absPartIdx] != SIZE_2Nx2N && + !curDepth && cu.m_slice->m_sps->quadtreeTUMaxDepthInter == 1) { - X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2TrSize %d, depthRange[0] %d\n", log2TrSize, depthRange[0]); + X265_CHECK(subdiv, "inter TU must be smaller than CU when not 2Nx2N part size: log2CurSize %d, depthRange[0] %d\n", log2CurSize, depthRange[0]); } - else if (log2TrSize > depthRange[1]) + else if (log2CurSize > depthRange[1]) { X265_CHECK(subdiv, "TU is larger than the max allowed, it should have been split\n"); } - else if (log2TrSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2TrSize == depthRange[0]) + else if (log2CurSize == cu.m_slice->m_sps->quadtreeTULog2MinSize || log2CurSize == depthRange[0]) { X265_CHECK(!subdiv, "min sized TU cannot be subdivided\n"); } else { - X265_CHECK(log2TrSize > depthRange[0], "transform size failure\n"); - codeTransformSubdivFlag(subdiv, 5 - log2TrSize); + X265_CHECK(log2CurSize > 
depthRange[0], "transform size failure\n"); + codeTransformSubdivFlag(subdiv, 5 - log2CurSize); } uint32_t hChromaShift = cu.m_hChromaShift; uint32_t vChromaShift = cu.m_vChromaShift; - bool bSmallChroma = (log2TrSize - hChromaShift < 2); - if (!tuDepth || !bSmallChroma) + bool bSmallChroma = (log2CurSize - hChromaShift) < 2; + if (!curDepth || !bSmallChroma) { - if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1)) - codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, tuDepth, !subdiv); - if (!tuDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1)) - codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, tuDepth, !subdiv); + if (!curDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_U, curDepth - 1)) + codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_U, curDepth, !subdiv); + if (!curDepth || cu.getCbf(absPartIdx, TEXT_CHROMA_V, curDepth - 1)) + codeQtCbfChroma(cu, absPartIdx, TEXT_CHROMA_V, curDepth, !subdiv); } else { - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, tuDepth - 1), "chroma xform size match failure\n"); - X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, tuDepth - 1), "chroma xform size match failure\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_U, curDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_U, curDepth - 1), "chroma xform size match failure\n"); + X265_CHECK(cu.getCbf(absPartIdx, TEXT_CHROMA_V, curDepth) == cu.getCbf(absPartIdx, TEXT_CHROMA_V, curDepth - 1), "chroma xform size match failure\n"); } if (subdiv) { - --log2TrSize; - ++tuDepth; + --log2CurSize; + ++curDepth; - uint32_t qNumParts = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2; + uint32_t qNumParts = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2; - encodeTransform(cu, absPartIdx + 0 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); - encodeTransform(cu, absPartIdx + 1 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); - encodeTransform(cu, absPartIdx + 2 * qNumParts, tuDepth, log2TrSize, 
bCodeDQP, depthRange); - encodeTransform(cu, absPartIdx + 3 * qNumParts, tuDepth, log2TrSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 0 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 1 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 2 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange); + encodeTransform(cu, absPartIdx + 3 * qNumParts, curDepth, log2CurSize, bCodeDQP, depthRange); return; } uint32_t absPartIdxC = bSmallChroma ? absPartIdx & 0xFC : absPartIdx; - if (cu.isInter(absPartIdxC) && !tuDepth && !cu.getCbf(absPartIdxC, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdxC, TEXT_CHROMA_V, 0)) + if (cu.isInter(absPartIdxC) && !curDepth && !cu.getCbf(absPartIdxC, TEXT_CHROMA_U, 0) && !cu.getCbf(absPartIdxC, TEXT_CHROMA_V, 0)) { X265_CHECK(cu.getCbf(absPartIdxC, TEXT_LUMA, 0), "CBF should have been set\n"); } else - codeQtCbfLuma(cu, absPartIdx, tuDepth); + codeQtCbfLuma(cu, absPartIdx, curDepth); - uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth); - uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, tuDepth); - uint32_t cbfV = cu.getCbf(absPartIdxC, TEXT_CHROMA_V, tuDepth); + uint32_t cbfY = cu.getCbf(absPartIdx, TEXT_LUMA, curDepth); + uint32_t cbfU = cu.getCbf(absPartIdxC, TEXT_CHROMA_U, curDepth); + uint32_t cbfV = cu.getCbf(absPartIdxC, TEXT_CHROMA_V, curDepth); if (!(cbfY || cbfU || cbfV)) return; @@ -753,7 +775,7 @@ if (cbfY) { uint32_t coeffOffset = absPartIdx << (LOG2_UNIT_SIZE * 2); - codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2TrSize, TEXT_LUMA); + codeCoeffNxN(cu, cu.m_trCoeff[0] + coeffOffset, absPartIdx, log2CurSize, TEXT_LUMA); if (!(cbfU || cbfV)) return; } @@ -763,7 +785,7 @@ if ((absPartIdx & 3) != 3) return; - const uint32_t log2TrSizeC = 2; + const uint32_t log2CurSizeC = 2; const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422); const uint32_t curPartNum = 4; uint32_t coeffOffsetC = 
absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift)); @@ -773,10 +795,10 @@ const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; do { - if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs)) + if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, curDepth + splitIntoSubTUs)) { - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); + uint32_t subTUOffset = tuIterator.section << (log2CurSizeC * 2); + codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2CurSizeC, (TextType)chromaId); } } while (tuIterator.isNextSection()); @@ -784,9 +806,9 @@ } else { - uint32_t log2TrSizeC = log2TrSize - hChromaShift; + uint32_t log2CurSizeC = log2CurSize - hChromaShift; const bool splitIntoSubTUs = (cu.m_chromaFormat == X265_CSP_I422); - uint32_t curPartNum = 1 << (log2TrSize - LOG2_UNIT_SIZE) * 2; + uint32_t curPartNum = 1 << (log2CurSize - LOG2_UNIT_SIZE) * 2; uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (hChromaShift + vChromaShift)); for (uint32_t chromaId = TEXT_CHROMA_U; chromaId <= TEXT_CHROMA_V; chromaId++) { @@ -794,10 +816,10 @@ const coeff_t* coeffChroma = cu.m_trCoeff[chromaId]; do { - if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, tuDepth + splitIntoSubTUs)) + if (cu.getCbf(tuIterator.absPartIdxTURelCU, (TextType)chromaId, curDepth + splitIntoSubTUs)) { - uint32_t subTUOffset = tuIterator.section << (log2TrSizeC * 2); - codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2TrSizeC, (TextType)chromaId); + uint32_t subTUOffset = tuIterator.section << (log2CurSizeC * 2); + codeCoeffNxN(cu, coeffChroma + coeffOffsetC + subTUOffset, tuIterator.absPartIdxTURelCU, log2CurSizeC, (TextType)chromaId); } } while (tuIterator.isNextSection()); @@ -836,10 +858,11 @@ /** encode 
motion information for every PU block */ void Entropy::codePUWise(const CUData& cu, uint32_t absPartIdx) { + X265_CHECK(!cu.isIntra(absPartIdx), "intra block not expected\n"); PartSize partSize = (PartSize)cu.m_partSize[absPartIdx]; uint32_t numPU = (partSize == SIZE_2Nx2N ? 1 : (partSize == SIZE_NxN ? 4 : 2)); uint32_t depth = cu.m_cuDepth[absPartIdx]; - uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_maxFullDepth - depth) * 2) >> 4; + uint32_t puOffset = (g_puOffset[uint32_t(partSize)] << (g_unitSizeDepth - depth) * 2) >> 4; for (uint32_t puIdx = 0, subPartIdx = absPartIdx; puIdx < numPU; puIdx++, subPartIdx += puOffset) { @@ -1411,12 +1434,9 @@ { uint32_t trSize = 1 << log2TrSize; uint32_t tqBypass = cu.m_tqBypass[absPartIdx]; - // compute number of significant coefficients - uint32_t numSig = primitives.count_nonzero(coeff, (1 << (log2TrSize << 1))); - + uint32_t numSig = primitives.cu[log2TrSize - 2].count_nonzero(coeff); X265_CHECK(numSig > 0, "cbf check fail\n"); - bool bHideFirstSign = cu.m_slice->m_pps->bSignHideEnabled && !tqBypass; if (log2TrSize <= MAX_LOG2_TS_SIZE && !tqBypass && cu.m_slice->m_pps->bTransformSkipEnabled) @@ -1431,9 +1451,6 @@ uint8_t coeffNum[MLS_GRP_NUM]; // value range[0, 16] uint16_t coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign uint16_t coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff - memset(coeffNum, 0, sizeof(coeffNum)); - memset(coeffFlag, 0, sizeof(coeffFlag)); - memset(coeffSign, 0, sizeof(coeffSign)); //----- encode significance map ----- @@ -1444,39 +1461,9 @@ //const uint32_t maskPosXY = ((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1; X265_CHECK((uint32_t)((1 << (log2TrSize - MLS_CG_LOG2_SIZE)) - 1) == (((uint32_t)~0 >> (31 - log2TrSize + MLS_CG_LOG2_SIZE)) >> 1), "maskPosXY fault\n"); - uint32_t cgBlkNum = 0; - do - { - const uint32_t cgBlkIdx = scanPosLast & (MLS_CG_BLK_SIZE - 1); - const uint32_t cgIdx = scanPosLast >> MLS_CG_SIZE; - - posLast = 
codingParameters.scan[scanPosLast++]; + scanPosLast = primitives.findPosLast(codingParameters.scan, coeff, coeffSign, coeffFlag, coeffNum, numSig); + posLast = codingParameters.scan[scanPosLast]; - const int curCoeff = coeff[posLast]; - const uint32_t isNZCoeff = (curCoeff != 0); - // get L1 sig map - // NOTE: the new algorithm is complicated, so I keep reference code here - //uint32_t posy = posLast >> log2TrSize; - //uint32_t posx = posLast - (posy << log2TrSize); - //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE); - //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY); - //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx); - numSig -= isNZCoeff; - - // TODO: optimize by instruction BTS - coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << cgBlkNum); - coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff; - cgBlkNum += isNZCoeff; - // TODO: reduce memory store operator, but avoid conditional branch - coeffNum[cgIdx] = (uint8_t)cgBlkNum; - - if (cgBlkIdx == (MLS_CG_BLK_SIZE - 1)) - { - cgBlkNum = 0; - } - } - while (numSig > 0); - scanPosLast--; const int lastScanSet = scanPosLast >> MLS_CG_SIZE; // Calculate CG block non-zero mask, the latest CG always flag as non-zero in CG scan loop @@ -1568,22 +1555,173 @@ // encode significant_coeff_flag if (sigCoeffGroupFlag64 & cgBlkPosMask) { + X265_CHECK((log2TrSize != 2) || (log2TrSize == 2 && subSet == 0), "log2TrSize and subSet mistake!\n"); const int patternSigCtx = Quant::calcPatternSigCtx(sigCoeffGroupFlag64, cgPosX, cgPosY, codingParameters.log2TrSizeCG); - uint32_t blkPos, sig, ctxSig; - for (; scanPosSigOff >= 0; scanPosSigOff--) + + static const uint8_t ctxIndMap4x4[16] = + { + 0, 1, 4, 5, + 2, 3, 4, 5, + 6, 6, 8, 8, + 7, 7, 8, 8 + }; + // NOTE: [patternSigCtx][posXinSubset][posYinSubset] + static const uint8_t table_cnt[4][4][4] = { - blkPos = 
codingParameters.scan[subPosBase + scanPosSigOff]; - sig = scanFlagMask & 1; - scanFlagMask >>= 1; - X265_CHECK((uint32_t)(coeff[blkPos] != 0) == sig, "sign bit mistake\n"); - if (scanPosSigOff != 0 || subSet == 0 || numNonZero) + // patternSigCtx = 0 { - ctxSig = Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext); - encodeBin(sig, baseCtx[ctxSig]); + { 2, 1, 1, 0 }, + { 1, 1, 0, 0 }, + { 1, 0, 0, 0 }, + { 0, 0, 0, 0 }, + }, + // patternSigCtx = 1 + { + { 2, 1, 0, 0 }, + { 2, 1, 0, 0 }, + { 2, 1, 0, 0 }, + { 2, 1, 0, 0 }, + }, + // patternSigCtx = 2 + { + { 2, 2, 2, 2 }, + { 1, 1, 1, 1 }, + { 0, 0, 0, 0 }, + { 0, 0, 0, 0 }, + }, + // patternSigCtx = 3 + { + { 2, 2, 2, 2 }, + { 2, 2, 2, 2 }, + { 2, 2, 2, 2 }, + { 2, 2, 2, 2 }, + } + }; + if (m_bitIf) + { + if (log2TrSize == 2) + { + uint32_t blkPos, sig, ctxSig; + for (; scanPosSigOff >= 0; scanPosSigOff--) + { + blkPos = codingParameters.scan[subPosBase + scanPosSigOff]; + sig = scanFlagMask & 1; + scanFlagMask >>= 1; + X265_CHECK((uint32_t)(coeff[blkPos] != 0) == sig, "sign bit mistake\n"); + { + ctxSig = ctxIndMap4x4[blkPos]; + X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");; + encodeBin(sig, baseCtx[ctxSig]); + } + absCoeff[numNonZero] = int(abs(coeff[blkPos])); + numNonZero += sig; + } + } + else + { + X265_CHECK((log2TrSize > 2), "log2TrSize must be more than 2 in this path!\n"); + + const uint8_t (*tabSigCtx)[4] = table_cnt[(uint32_t)patternSigCtx]; + const int offset = codingParameters.firstSignificanceMapContext; + const uint32_t lumaMask = bIsLuma ? 
~0 : 0; + static const uint32_t posXY4Mask[] = {0x024, 0x0CC, 0x39C}; + const uint32_t posGT4Mask = posXY4Mask[log2TrSize - 3] & lumaMask; + + uint32_t blkPos, sig, ctxSig; + for (; scanPosSigOff >= 0; scanPosSigOff--) + { + blkPos = codingParameters.scan[subPosBase + scanPosSigOff]; + X265_CHECK(blkPos || (subPosBase + scanPosSigOff == 0), "blkPos==0 must be at scan[0]\n"); + const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0; + sig = scanFlagMask & 1; + scanFlagMask >>= 1; + X265_CHECK((uint32_t)(coeff[blkPos] != 0) == sig, "sign bit mistake\n"); + if (scanPosSigOff != 0 || subSet == 0 || numNonZero) + { + const uint32_t posY = blkPos >> log2TrSize; + const uint32_t posOffset = (blkPos & posGT4Mask) ? 3 : 0; + + const uint32_t posXinSubset = blkPos & 3; + const uint32_t posYinSubset = posY & 3; + const uint32_t cnt = tabSigCtx[posXinSubset][posYinSubset] + offset; + ctxSig = (cnt + posOffset) & posZeroMask; + + X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");; + encodeBin(sig, baseCtx[ctxSig]); + } + absCoeff[numNonZero] = int(abs(coeff[blkPos])); + numNonZero += sig; + } } - absCoeff[numNonZero] = int(abs(coeff[blkPos])); - numNonZero += sig; } + else // fast RD path + { + // maximum g_entropyBits are 18-bits and maximum of count are 16, so intermedia of sum are 22-bits + uint32_t sum = 0; + if (log2TrSize == 2) + { + uint32_t blkPos, sig, ctxSig; + for (; scanPosSigOff >= 0; scanPosSigOff--) + { + blkPos = codingParameters.scan[subPosBase + scanPosSigOff]; + sig = scanFlagMask & 1; + scanFlagMask >>= 1; + X265_CHECK((uint32_t)(coeff[blkPos] != 0) == sig, "sign bit mistake\n"); + { + ctxSig = ctxIndMap4x4[blkPos]; + X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");; + //encodeBin(sig, baseCtx[ctxSig]); + const uint32_t mstate 
= baseCtx[ctxSig]; + baseCtx[ctxSig] = sbacNext(mstate, sig); + sum += sbacGetEntropyBits(mstate, sig); + } + absCoeff[numNonZero] = int(abs(coeff[blkPos])); + numNonZero += sig; + } + } // end of 4x4 + else + { + X265_CHECK((log2TrSize > 2), "log2TrSize must be more than 2 in this path!\n"); + + const uint8_t (*tabSigCtx)[4] = table_cnt[(uint32_t)patternSigCtx]; + const int offset = codingParameters.firstSignificanceMapContext; + const uint32_t lumaMask = bIsLuma ? ~0 : 0; + static const uint32_t posXY4Mask[] = {0x024, 0x0CC, 0x39C}; + const uint32_t posGT4Mask = posXY4Mask[log2TrSize - 3] & lumaMask; + + uint32_t blkPos, sig, ctxSig; + for (; scanPosSigOff >= 0; scanPosSigOff--) + { + blkPos = codingParameters.scan[subPosBase + scanPosSigOff]; + X265_CHECK(blkPos || (subPosBase + scanPosSigOff == 0), "blkPos==0 must be at scan[0]\n"); + const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0; + sig = scanFlagMask & 1; + scanFlagMask >>= 1; + X265_CHECK((uint32_t)(coeff[blkPos] != 0) == sig, "sign bit mistake\n"); + if (scanPosSigOff != 0 || subSet == 0 || numNonZero) + { + const uint32_t posY = blkPos >> log2TrSize; + const uint32_t posOffset = (blkPos & posGT4Mask) ? 
3 : 0; + + const uint32_t posXinSubset = blkPos & 3; + const uint32_t posYinSubset = posY & 3; + const uint32_t cnt = tabSigCtx[posXinSubset][posYinSubset] + offset; + ctxSig = (cnt + posOffset) & posZeroMask; + + X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, blkPos, bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");; + //encodeBin(sig, baseCtx[ctxSig]); + const uint32_t mstate = baseCtx[ctxSig]; + baseCtx[ctxSig] = sbacNext(mstate, sig); + sum += sbacGetEntropyBits(mstate, sig); + } + absCoeff[numNonZero] = int(abs(coeff[blkPos])); + numNonZero += sig; + } + } // end of non 4x4 path + + // update RD cost + m_fracBits += sum; + } // end of fast RD path -- !m_bitIf } X265_CHECK(coeffNum[subSet] == numNonZero, "coefNum mistake\n"); diff -Nru x265-1.5/source/encoder/entropy.h x265-1.6/source/encoder/entropy.h --- x265-1.5/source/encoder/entropy.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/entropy.h 2015-04-02 16:46:36.000000000 +0000 @@ -142,9 +142,9 @@ void codeVPS(const VPS& vps); void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl); void codePPS(const PPS& pps); - void codeVUI(const VUI& vui); + void codeVUI(const VUI& vui, int maxSubTLayers); void codeAUD(const Slice& slice); - void codeHrdParameters(const HRDInfo& hrd); + void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers); void codeSliceHeader(const Slice& slice, FrameData& encData); void codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset); @@ -230,7 +230,7 @@ void writeEpExGolomb(uint32_t symbol, uint32_t count); void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice); - void codeProfileTier(const ProfileTierLevel& ptl); + void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers); void codeScalingList(const ScalingList&); void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId); 
diff -Nru x265-1.5/source/encoder/frameencoder.cpp x265-1.6/source/encoder/frameencoder.cpp --- x265-1.5/source/encoder/frameencoder.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/frameencoder.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -39,14 +39,13 @@ void weightAnalyse(Slice& slice, Frame& frame, x265_param& param); FrameEncoder::FrameEncoder() - : WaveFront(NULL) - , m_threadActive(true) { m_prevOutputTime = x265_mdate(); - m_totalWorkerElapsedTime = 0; + m_isFrameEncoder = true; + m_threadActive = true; m_slicetypeWaitTime = 0; - m_frameEncoderID = 0; m_activeWorkerCount = 0; + m_completionCount = 0; m_bAllRowsStop = false; m_vbvResetTriggerRow = -1; m_outStreams = NULL; @@ -59,6 +58,7 @@ m_frame = NULL; m_cuGeoms = NULL; m_ctuGeomMap = NULL; + m_localTldIdx = 0; memset(&m_frameStats, 0, sizeof(m_frameStats)); memset(&m_rce, 0, sizeof(RateControlEntry)); } @@ -66,10 +66,22 @@ void FrameEncoder::destroy() { if (m_pool) - JobProvider::flush(); // ensure no worker threads are using this frame - - m_threadActive = false; - m_enable.trigger(); + { + if (!m_jpId) + { + int numTLD = m_pool->m_numWorkers; + if (!m_param->bEnableWavefront) + numTLD += m_pool->m_numProviders; + for (int i = 0; i < numTLD; i++) + m_tld[i].destroy(); + delete [] m_tld; + } + } + else + { + m_tld->destroy(); + delete m_tld; + } delete[] m_rows; delete[] m_outStreams; @@ -85,12 +97,9 @@ delete m_rce.picTimingSEI; delete m_rce.hrdTiming; } - - // wait for worker thread to exit - stop(); } -bool FrameEncoder::init(Encoder *top, int numRows, int numCols, int id) +bool FrameEncoder::init(Encoder *top, int numRows, int numCols) { m_top = top; m_param = top->m_param; @@ -99,14 +108,14 @@ m_filterRowDelay = (m_param->bEnableSAO && m_param->bSaoNonDeblocked) ? 2 : (m_param->bEnableSAO || m_param->bEnableLoopFilter ? 
1 : 0); m_filterRowDelayCus = m_filterRowDelay * numCols; - m_frameEncoderID = id; m_rows = new CTURow[m_numRows]; bool ok = !!m_numRows; - int range = m_param->searchRange; /* fpel search */ - range += 1; /* diamond search range check lag */ - range += 2; /* subpel refine */ - range += NTAPS_LUMA / 2; /* subpel filter half-length */ + /* determine full motion search range */ + int range = m_param->searchRange; /* fpel search */ + range += !!(m_param->searchMethod < 2); /* diamond/hex range check lag */ + range += NTAPS_LUMA / 2; /* subpel filter half-length */ + range += 2 + MotionEstimate::hpelIterationCount(m_param->subpelRefine) / 2; /* subpel refine steps */ m_refLagRows = 1 + ((range + g_maxCUSize - 1) / g_maxCUSize); // NOTE: 2 times of numRows because both Encoder and Filter in same queue @@ -134,7 +143,6 @@ else m_param->noiseReductionIntra = m_param->noiseReductionInter = 0; - start(); return ok; } @@ -143,6 +151,7 @@ { /* Geoms only vary between CTUs in the presence of picture edges */ int maxCUSize = m_param->maxCUSize; + int minCUSize = m_param->minCUSize; int heightRem = m_param->sourceHeight & (maxCUSize - 1); int widthRem = m_param->sourceWidth & (maxCUSize - 1); int allocGeoms = 1; // body @@ -157,7 +166,7 @@ return false; // body - CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, m_cuGeoms); + CUData::calcCTUGeoms(maxCUSize, maxCUSize, maxCUSize, minCUSize, m_cuGeoms); memset(m_ctuGeomMap, 0, sizeof(uint32_t) * m_numRows * m_numCols); if (allocGeoms == 1) return true; @@ -166,7 +175,7 @@ if (widthRem) { // right - CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); + CUData::calcCTUGeoms(widthRem, maxCUSize, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); for (uint32_t i = 0; i < m_numRows; i++) { uint32_t ctuAddr = m_numCols * (i + 1) - 1; @@ -177,7 +186,7 @@ if (heightRem) { // bottom - CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, m_cuGeoms + countGeoms * 
CUGeom::MAX_GEOMS); + CUData::calcCTUGeoms(maxCUSize, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); for (uint32_t i = 0; i < m_numCols; i++) { uint32_t ctuAddr = m_numCols * (m_numRows - 1) + i; @@ -188,7 +197,7 @@ if (widthRem) { // corner - CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); + CUData::calcCTUGeoms(widthRem, heightRem, maxCUSize, minCUSize, m_cuGeoms + countGeoms * CUGeom::MAX_GEOMS); uint32_t ctuAddr = m_numCols * m_numRows - 1; m_ctuGeomMap[ctuAddr] = countGeoms * CUGeom::MAX_GEOMS; @@ -204,7 +213,9 @@ { m_slicetypeWaitTime = x265_mdate() - m_prevOutputTime; m_frame = curFrame; - curFrame->m_encData->m_frameEncoderID = m_frameEncoderID; // Each Frame knows the ID of the FrameEncoder encoding it + m_sliceType = curFrame->m_lowres.sliceType; + curFrame->m_encData->m_frameEncoderID = m_jpId; + curFrame->m_encData->m_jobProvider = this; curFrame->m_encData->m_slice->m_mref = m_mref; if (!m_cuGeoms) @@ -219,19 +230,66 @@ void FrameEncoder::threadMain() { - THREAD_NAME("Frame", m_frameEncoderID); + THREAD_NAME("Frame", m_jpId); - // worker thread routine for FrameEncoder - do + if (m_pool) { - m_enable.wait(); // Encoder::encode() triggers this event - if (m_threadActive) - { - compressFrame(); - m_done.trigger(); // FrameEncoder::getEncodedPicture() blocks for this event + m_pool->setCurrentThreadAffinity(); + + /* the first FE on each NUMA node is responsible for allocating thread + * local data for all worker threads in that pool. 
If WPP is disabled, then + * each FE also needs a TLD instance */ + if (!m_jpId) + { + int numTLD = m_pool->m_numWorkers; + if (!m_param->bEnableWavefront) + numTLD += m_pool->m_numProviders; + + m_tld = new ThreadLocalData[numTLD]; + for (int i = 0; i < numTLD; i++) + { + m_tld[i].analysis.initSearch(*m_param, m_top->m_scalingList); + m_tld[i].analysis.create(m_tld); + } + + for (int i = 0; i < m_pool->m_numProviders; i++) + { + if (m_pool->m_jpTable[i]->m_isFrameEncoder) /* ugh; over-allocation and other issues here */ + { + FrameEncoder *peer = dynamic_cast<FrameEncoder*>(m_pool->m_jpTable[i]); + peer->m_tld = m_tld; + } + } } + + if (m_param->bEnableWavefront) + m_localTldIdx = -1; // cause exception if used + else + m_localTldIdx = m_pool->m_numWorkers + m_jpId; + } + else + { + m_tld = new ThreadLocalData; + m_tld->analysis.initSearch(*m_param, m_top->m_scalingList); + m_tld->analysis.create(NULL); + m_localTldIdx = 0; + } + + m_done.trigger(); /* signal that thread is initialized */ + m_enable.wait(); /* Encoder::encode() triggers this event */ + + while (m_threadActive) + { + compressFrame(); + m_done.trigger(); /* FrameEncoder::getEncodedPicture() blocks for this event */ + m_enable.wait(); } - while (m_threadActive); +} + +void FrameEncoder::WeightAnalysis::processTasks(int /* workerThreadId */) +{ + Frame* frame = master.m_frame; + weightAnalyse(*frame->m_encData->m_slice, *frame, *master.m_param); } void FrameEncoder::compressFrame() @@ -247,6 +305,15 @@ m_allRowsAvailableTime = 0; m_stallStartTime = 0; + m_completionCount = 0; + m_bAllRowsStop = false; + m_vbvResetTriggerRow = -1; + + m_SSDY = m_SSDU = m_SSDV = 0; + m_ssim = 0; + m_ssimCnt = 0; + memset(&m_frameStats, 0, sizeof(m_frameStats)); + /* Emit access unit delimiter unless this is the first frame and the user is * not repeating headers (since AUD is supposed to be the first NAL in the access * unit) */ @@ -266,7 +333,18 @@ bool bUseWeightP = slice->m_sliceType == P_SLICE && slice->m_pps->bUseWeightPred; bool 
bUseWeightB = slice->m_sliceType == B_SLICE && slice->m_pps->bUseWeightedBiPred; if (bUseWeightP || bUseWeightB) - weightAnalyse(*slice, *m_frame, *m_param); + { +#if DETAILED_CU_STATS + m_cuStats.countWeightAnalyze++; + ScopedElapsedTime time(m_cuStats.weightAnalyzeTime); +#endif + WeightAnalysis wa(*this); + if (m_pool && wa.tryBondPeers(*this, 1)) + /* use an idle worker for weight analysis */ + wa.waitForExit(); + else + weightAnalyse(*slice, *m_frame, *m_param); + } else slice->disableWeights(); @@ -295,7 +373,10 @@ m_frameFilter.start(m_frame, m_initSliceContext, qp); - // reset entropy coders + /* ensure all rows are blocked prior to initializing row CTU counters */ + WaveFront::clearEnabledRowMask(); + + /* reset entropy coders */ m_entropyCoder.load(m_initSliceContext); for (uint32_t i = 0; i < m_numRows; i++) m_rows[i].init(m_initSliceContext); @@ -393,10 +474,82 @@ m_nalList.serialize(NAL_UNIT_PREFIX_SEI, m_bs); } - // Analyze CTU rows, most of the hard work is done here - // frame is compressed in a wave-front pattern if WPP is enabled. Loop filter runs as a - // wave-front behind the CU compression and reconstruction - compressCTURows(); + /* Analyze CTU rows, most of the hard work is done here. Frame is + * compressed in a wave-front pattern if WPP is enabled. 
Row based loop + * filters runs behind the CTU compression and reconstruction */ + + m_rows[0].active = true; + if (m_param->bEnableWavefront) + { + for (uint32_t row = 0; row < m_numRows; row++) + { + // block until all reference frames have reconstructed the rows we need + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) + { + Frame *refpic = slice->m_refPicList[l][ref]; + + uint32_t reconRowCount = refpic->m_reconRowCount.get(); + while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows)) + reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); + + if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) + m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows); + } + } + + enableRowEncoder(row); /* clear external dependency for this row */ + if (!row) + { + m_row0WaitTime = x265_mdate(); + enqueueRowEncoder(0); /* clear internal dependency, start wavefront */ + } + tryWakeOne(); + } + + m_allRowsAvailableTime = x265_mdate(); + tryWakeOne(); /* ensure one thread is active or help-wanted flag is set prior to blocking */ + static const int block_ms = 250; + while (m_completionEvent.timedWait(block_ms)) + tryWakeOne(); + } + else + { + for (uint32_t i = 0; i < m_numRows + m_filterRowDelay; i++) + { + // compress + if (i < m_numRows) + { + // block until all reference frames have reconstructed the rows we need + for (int l = 0; l < numPredDir; l++) + { + int list = l; + for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) + { + Frame *refpic = slice->m_refPicList[list][ref]; + + uint32_t reconRowCount = refpic->m_reconRowCount.get(); + while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows)) + reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); + + if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) + m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows); + } + } + + if (!i) + m_row0WaitTime = x265_mdate(); + else if (i == 
m_numRows - 1) + m_allRowsAvailableTime = x265_mdate(); + processRowEncoder(i, m_tld[m_localTldIdx]); + } + + // filter + if (i >= m_filterRowDelay) + m_frameFilter.processRow(i - m_filterRowDelay); + } + } if (m_param->rc.bStatWrite) { @@ -487,45 +640,55 @@ if (m_top->m_rateControl->rateControlEnd(m_frame, m_accessUnitBits, &m_rce, &m_frameStats) < 0) m_top->m_aborted = true; - /* Accumulate NR statistics from all worker threads */ + /* Decrement referenced frame reference counts, allow them to be recycled */ + for (int l = 0; l < numPredDir; l++) + { + for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) + { + Frame *refpic = slice->m_refPicList[l][ref]; + ATOMIC_DEC(&refpic->m_countRefEncoders); + } + } + + int numTLD; + if (m_pool) + numTLD = m_param->bEnableWavefront ? m_pool->m_numWorkers : m_pool->m_numWorkers + m_pool->m_numProviders; + else + numTLD = 1; + if (m_nr) { - for (int i = 0; i < m_top->m_numThreadLocalData; i++) + /* Accumulate NR statistics from all worker threads */ + for (int i = 0; i < numTLD; i++) { - NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; + NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId]; for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) { - for(int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++) + for (int coeff = 0; coeff < MAX_NUM_TR_COEFFS; coeff++) m_nr->residualSum[cat][coeff] += nr->residualSum[cat][coeff]; m_nr->count[cat] += nr->count[cat]; } } - } - noiseReductionUpdate(); + noiseReductionUpdate(); - /* Copy updated NR coefficients back to all worker threads */ - if (m_nr) - { - for (int i = 0; i < m_top->m_numThreadLocalData; i++) + /* Copy updated NR coefficients back to all worker threads */ + for (int i = 0; i < numTLD; i++) { - NoiseReduction* nr = &m_top->m_threadLocalData[i].analysis.m_quant.m_frameNr[m_frameEncoderID]; + NoiseReduction* nr = &m_tld[i].analysis.m_quant.m_frameNr[m_jpId]; memcpy(nr->offsetDenoise, m_nr->offsetDenoise, 
sizeof(uint16_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); memset(nr->count, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES); memset(nr->residualSum, 0, sizeof(uint32_t) * MAX_NUM_TR_CATEGORIES * MAX_NUM_TR_COEFFS); } } - // Decrement referenced frame reference counts, allow them to be recycled - for (int l = 0; l < numPredDir; l++) - { - for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) - { - Frame *refpic = slice->m_refPicList[l][ref]; - ATOMIC_DEC(&refpic->m_countRefEncoders); - } - } +#if DETAILED_CU_STATS + /* Accumulate CU statistics from each worker thread, we could report + * per-frame stats here, but currently we do not. */ + for (int i = 0; i < numTLD; i++) + m_cuStats.accumulate(m_tld[i].analysis.m_stats[m_jpId]); +#endif m_endFrameTime = x265_mdate(); } @@ -534,7 +697,7 @@ { Slice* slice = m_frame->m_encData->m_slice; const uint32_t widthInLCUs = slice->m_sps->numCuInWidth; - const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_CU_PARTITIONS - 1) / NUM_CU_PARTITIONS; + const uint32_t lastCUAddr = (slice->m_endCUAddr + NUM_4x4_PARTITIONS - 1) / NUM_4x4_PARTITIONS; const uint32_t numSubstreams = m_param->bEnableWavefront ? slice->m_sps->numCuInHeight : 1; SAOParam* saoParam = slice->m_sps->bUseSAO ? m_frame->m_encData->m_saoParam : NULL; @@ -599,99 +762,6 @@ m_entropyCoder.finishSlice(); } -void FrameEncoder::compressCTURows() -{ - Slice* slice = m_frame->m_encData->m_slice; - - m_bAllRowsStop = false; - m_vbvResetTriggerRow = -1; - - m_SSDY = m_SSDU = m_SSDV = 0; - m_ssim = 0; - m_ssimCnt = 0; - memset(&m_frameStats, 0, sizeof(m_frameStats)); - - bool bUseWeightP = slice->m_pps->bUseWeightPred && slice->m_sliceType == P_SLICE; - bool bUseWeightB = slice->m_pps->bUseWeightedBiPred && slice->m_sliceType == B_SLICE; - int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 
2 : 0; - - m_rows[0].active = true; - if (m_pool && m_param->bEnableWavefront) - { - WaveFront::clearEnabledRowMask(); - WaveFront::enqueue(); - - for (uint32_t row = 0; row < m_numRows; row++) - { - // block until all reference frames have reconstructed the rows we need - for (int l = 0; l < numPredDir; l++) - { - for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++) - { - Frame *refpic = slice->m_refPicList[l][ref]; - - uint32_t reconRowCount = refpic->m_reconRowCount.get(); - while ((reconRowCount != m_numRows) && (reconRowCount < row + m_refLagRows)) - reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); - - if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) - m_mref[l][ref].applyWeight(row + m_refLagRows, m_numRows); - } - } - - enableRowEncoder(row); - if (row) - m_pool->pokeIdleThread(); - else - { - m_row0WaitTime = x265_mdate(); - enqueueRowEncoder(0); - } - } - - m_allRowsAvailableTime = x265_mdate(); - m_completionEvent.wait(); - - WaveFront::dequeue(); - } - else - { - for (uint32_t i = 0; i < this->m_numRows + m_filterRowDelay; i++) - { - // Encode - if (i < m_numRows) - { - // block until all reference frames have reconstructed the rows we need - for (int l = 0; l < numPredDir; l++) - { - int list = l; - for (int ref = 0; ref < slice->m_numRefIdx[list]; ref++) - { - Frame *refpic = slice->m_refPicList[list][ref]; - - uint32_t reconRowCount = refpic->m_reconRowCount.get(); - while ((reconRowCount != m_numRows) && (reconRowCount < i + m_refLagRows)) - reconRowCount = refpic->m_reconRowCount.waitForChange(reconRowCount); - - if ((bUseWeightP || bUseWeightB) && m_mref[l][ref].isWeighted) - m_mref[list][ref].applyWeight(i + m_refLagRows, m_numRows); - } - } - - if (!i) - m_row0WaitTime = x265_mdate(); - else if (i == m_numRows - 1) - m_allRowsAvailableTime = x265_mdate(); - processRowEncoder(i, *m_tld); - } - - // Filter - if (i >= m_filterRowDelay) - m_frameFilter.processRow(i - m_filterRowDelay); - } - } -} - void 
FrameEncoder::processRow(int row, int threadId) { int64_t startTime = x265_mdate(); @@ -701,10 +771,8 @@ const uint32_t realRow = row >> 1; const uint32_t typeNum = row & 1; - ThreadLocalData& tld = threadId >= 0 ? m_top->m_threadLocalData[threadId] : *m_tld; - if (!typeNum) - processRowEncoder(realRow, tld); + processRowEncoder(realRow, m_tld[threadId]); else { m_frameFilter.processRow(realRow); @@ -712,8 +780,6 @@ // NOTE: Active next row if (realRow != m_numRows - 1) enqueueRowFilter(realRow + 1); - else - m_completionEvent.trigger(); } if (ATOMIC_DEC(&m_activeWorkerCount) == 0) @@ -927,21 +993,21 @@ } } - // NOTE: do CU level Filter + /* SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas */ if (m_param->bEnableSAO && m_param->bSaoNonDeblocked) - // SAO parameter estimation using non-deblocked pixels for CTU bottom and right boundary areas m_frameFilter.m_sao.calcSaoStatsCu_BeforeDblk(m_frame, col, row); - // NOTE: active next row - if (curRow.completed >= 2 && row < m_numRows - 1) + if (m_param->bEnableWavefront && curRow.completed >= 2 && row < m_numRows - 1 && + (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) { + /* activate next row */ ScopedLock below(m_rows[row + 1].lock); if (m_rows[row + 1].active == false && - m_rows[row + 1].completed + 2 <= curRow.completed && - (!m_bAllRowsStop || intRow + 1 < m_vbvResetTriggerRow)) + m_rows[row + 1].completed + 2 <= curRow.completed) { m_rows[row + 1].active = true; enqueueRowEncoder(row + 1); + tryWakeOne(); /* wake up a sleeping thread or set the help wanted flag */ } } @@ -956,11 +1022,7 @@ } } - /* *this row of CTUs has been encoded* */ - - /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */ - if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1)) - rowCoder.finishSlice(); + /** this row of CTUs has been compressed **/ /* If encoding with ABR, update update bits and complexity in rate control * after a number 
of rows so the next frame's rateControlStart has more @@ -969,26 +1031,30 @@ * after refLagRows (the number of rows reference frames must have completed * before referencees may begin encoding) */ uint32_t rowCount = 0; - if (m_param->rc.rateControlMode == X265_RC_ABR) + if (m_param->rc.rateControlMode == X265_RC_ABR || bIsVbv) { if ((uint32_t)m_rce.encodeOrder <= 2 * (m_param->fpsNum / m_param->fpsDenom)) rowCount = X265_MIN((m_numRows + 1) / 2, m_numRows - 1); else rowCount = X265_MIN(m_refLagRows, m_numRows - 1); - } - if (row == rowCount) - { - m_rce.rowTotalBits = 0; - if (bIsVbv) - for (uint32_t i = 0; i < rowCount; i++) - m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits; - else - for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++) - m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits; + if (row == rowCount) + { + m_rce.rowTotalBits = 0; + if (bIsVbv) + for (uint32_t i = 0; i < rowCount; i++) + m_rce.rowTotalBits += curEncData.m_rowStat[i].encodedBits; + else + for (uint32_t cuAddr = 0; cuAddr < rowCount * numCols; cuAddr++) + m_rce.rowTotalBits += curEncData.m_cuStat[cuAddr].totalBits; - m_top->m_rateControl->rateControlUpdateStats(&m_rce); + m_top->m_rateControl->rateControlUpdateStats(&m_rce); + } } + /* flush row bitstream (if WPP and no SAO) or flush frame if no WPP and no SAO */ + if (!m_param->bEnableSAO && (m_param->bEnableWavefront || row == m_numRows - 1)) + rowCoder.finishSlice(); + if (m_param->bEnableWavefront) { /* trigger row-wise loop filters */ @@ -999,15 +1065,20 @@ /* NOTE: Activate filter if first row (row 0) */ if (row == m_filterRowDelay) enqueueRowFilter(0); + tryWakeOne(); } if (row == m_numRows - 1) { for (uint32_t i = m_numRows - m_filterRowDelay; i < m_numRows; i++) enableRowFilter(i); + tryWakeOne(); } } curRow.busy = false; + + if (ATOMIC_INC(&m_completionCount) == 2 * (int)m_numRows) + m_completionEvent.trigger(); } void FrameEncoder::collectCTUStatistics(CUData& ctu) @@ -1034,7 +1105,7 @@ else if 
(ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) { /* TODO: log intra modes at absPartIdx +0 to +3 */ - X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); + X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n"); log->cntIntraNxN++; log->cntIntra[depth]--; } @@ -1082,7 +1153,7 @@ if (ctu.m_partSize[absPartIdx] != SIZE_2Nx2N) { - X265_CHECK(depth == g_maxCUDepth, "Intra NxN found at improbable depth\n"); + X265_CHECK(ctu.m_log2CUSize[absPartIdx] == 3 && ctu.m_slice->m_sps->quadtreeTULog2MinSize < 3, "Intra NxN found at improbable depth\n"); log->cntIntraNxN++; /* TODO: log intra modes at absPartIdx +0 to +3 */ } @@ -1098,9 +1169,6 @@ /* DCT-domain noise reduction / adaptive deadzone from libavcodec */ void FrameEncoder::noiseReductionUpdate() { - if (!m_nr) - return; - static const uint32_t maxBlocksPerTrSize[4] = {1 << 18, 1 << 16, 1 << 14, 1 << 12}; for (int cat = 0; cat < MAX_NUM_TR_CATEGORIES; cat++) diff -Nru x265-1.5/source/encoder/frameencoder.h x265-1.6/source/encoder/frameencoder.h --- x265-1.5/source/encoder/frameencoder.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/frameencoder.h 2015-04-02 16:46:36.000000000 +0000 @@ -122,7 +122,7 @@ virtual ~FrameEncoder() {} - bool init(Encoder *top, int numRows, int numCols, int id); + virtual bool init(Encoder *top, int numRows, int numCols); void destroy(); @@ -135,8 +135,12 @@ Event m_enable; Event m_done; Event m_completionEvent; - bool m_threadActive; - int m_frameEncoderID; + int m_localTldIdx; + + volatile bool m_threadActive; + volatile bool m_bAllRowsStop; + volatile int m_completionCount; + volatile int m_vbvResetTriggerRow; uint32_t m_numRows; uint32_t m_numCols; @@ -144,9 +148,6 @@ uint32_t m_filterRowDelayCus; uint32_t m_refLagRows; - volatile bool m_bAllRowsStop; - volatile int m_vbvResetTriggerRow; - CTURow* m_rows; RateControlEntry m_rce; SEIDecodedPictureHash m_seiReconPictureDigest; 
@@ -177,6 +178,9 @@ int64_t m_slicetypeWaitTime; // total elapsed time waiting for decided frame int64_t m_totalWorkerElapsedTime; // total elapsed time spent by worker threads processing CTUs int64_t m_totalNoWorkerTime; // total elapsed time without any active worker threads +#if DETAILED_CU_STATS + CUStats m_cuStats; +#endif Encoder* m_top; x265_param* m_param; @@ -196,6 +200,21 @@ FrameFilter m_frameFilter; NALList m_nalList; + class WeightAnalysis : public BondedTaskGroup + { + public: + + FrameEncoder& master; + + WeightAnalysis(FrameEncoder& fe) : master(fe) {} + + void processTasks(int workerThreadId); + + protected: + + WeightAnalysis operator=(const WeightAnalysis&); + }; + protected: bool initializeGeoms(); @@ -203,9 +222,6 @@ /* analyze / compress frame, can be run in parallel within reference constraints */ void compressFrame(); - /* called by compressFrame to perform wave-front compression analysis */ - void compressCTURows(); - /* called by compressFrame to generate final per-row bitstreams */ void encodeSlice(); @@ -215,8 +231,8 @@ void noiseReductionUpdate(); /* Called by WaveFront::findJob() */ - void processRow(int row, int threadId); - void processRowEncoder(int row, ThreadLocalData& tld); + virtual void processRow(int row, int threadId); + virtual void processRowEncoder(int row, ThreadLocalData& tld); void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); } void enqueueRowFilter(int row) { WaveFront::enqueueRow(row * 2 + 1); } diff -Nru x265-1.5/source/encoder/framefilter.cpp x265-1.6/source/encoder/framefilter.cpp --- x265-1.5/source/encoder/framefilter.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/framefilter.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -83,6 +83,11 @@ { ProfileScopeEvent(filterCTURow); +#if DETAILED_CU_STATS + ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime); + m_frameEncoder->m_cuStats.countLoopFilter++; +#endif + if (!m_param->bEnableLoopFilter && 
!m_param->bEnableSAO) { processRowPost(row); @@ -298,6 +303,9 @@ updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight); updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight); } + + if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows) + m_frameEncoder->m_completionEvent.trigger(); } static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height) @@ -421,7 +429,7 @@ /* Original YUV restoration for CU in lossless coding */ static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame) { - uint32_t absPartIdx = cuGeom.encodeIdx; + uint32_t absPartIdx = cuGeom.absPartIdx; if (cu->m_cuDepth[absPartIdx] > cuGeom.depth) { for (int subPartIdx = 0; subPartIdx < 4; subPartIdx++) diff -Nru x265-1.5/source/encoder/level.cpp x265-1.6/source/encoder/level.cpp --- x265-1.5/source/encoder/level.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/level.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -60,6 +60,7 @@ /* determine minimum decoder level required to decode the described video */ void determineLevel(const x265_param &param, VPS& vps) { + vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 
2 : 1; if (param.bLossless) vps.ptl.profileIdc = Profile::NONE; else if (param.internalCsp == X265_CSP_I420) @@ -154,15 +155,25 @@ return; } - vps.ptl.levelIdc = levels[i].levelEnum; - vps.ptl.minCrForLevel = levels[i].minCompressionRatio; - vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond; +#define CHECK_RANGE(value, main, high) (value > main && value <= high) - if (bitrate > levels[i].maxBitrateMain && bitrate <= levels[i].maxBitrateHigh && + if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) && + CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh) && levels[i].maxBitrateHigh != MAX_UINT) - vps.ptl.tierFlag = Level::HIGH; + { + /* If the user has not enabled high tier, continue looking to see if we can encode at a higher level, main tier */ + if (!param.bHighTier && (levels[i].levelIdc < param.levelIdc)) + continue; + else + vps.ptl.tierFlag = Level::HIGH; + } else vps.ptl.tierFlag = Level::MAIN; +#undef CHECK_RANGE + + vps.ptl.levelIdc = levels[i].levelEnum; + vps.ptl.minCrForLevel = levels[i].minCompressionRatio; + vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond; break; } @@ -250,7 +261,7 @@ } if ((uint32_t)param.rc.vbvBufferSize > (highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain)) { - param.rc.vbvMaxBitrate = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain; + param.rc.vbvBufferSize = highTier ? 
l.maxCpbSizeHigh : l.maxCpbSizeMain; x265_log(&param, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize); } diff -Nru x265-1.5/source/encoder/motion.cpp x265-1.6/source/encoder/motion.cpp --- x265-1.5/source/encoder/motion.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/motion.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -59,38 +59,6 @@ int sizeScale[NUM_PU_SIZES]; #define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum]))) -void initScales(void) -{ -#define SETUP_SCALE(W, H) \ - sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4; - SETUP_SCALE(4, 4); - SETUP_SCALE(8, 8); - SETUP_SCALE(8, 4); - SETUP_SCALE(4, 8); - SETUP_SCALE(16, 16); - SETUP_SCALE(16, 8); - SETUP_SCALE(8, 16); - SETUP_SCALE(16, 12); - SETUP_SCALE(12, 16); - SETUP_SCALE(4, 16); - SETUP_SCALE(16, 4); - SETUP_SCALE(32, 32); - SETUP_SCALE(32, 16); - SETUP_SCALE(16, 32); - SETUP_SCALE(32, 24); - SETUP_SCALE(24, 32); - SETUP_SCALE(32, 8); - SETUP_SCALE(8, 32); - SETUP_SCALE(64, 64); - SETUP_SCALE(64, 32); - SETUP_SCALE(32, 64); - SETUP_SCALE(64, 48); - SETUP_SCALE(48, 64); - SETUP_SCALE(64, 16); - SETUP_SCALE(16, 64); -#undef SETUP_SCALE -} - /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. 
*/ const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) }; const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */ @@ -136,20 +104,57 @@ absPartIdx = -1; searchMethod = X265_HEX_SEARCH; subpelRefine = 2; + blockwidth = blockheight = 0; + blockOffset = 0; bChromaSATD = false; chromaSatd = NULL; } void MotionEstimate::init(int method, int refine, int csp) { - if (!sizeScale[0]) - initScales(); - searchMethod = method; subpelRefine = refine; fencPUYuv.create(FENC_STRIDE, csp); } +void MotionEstimate::initScales(void) +{ +#define SETUP_SCALE(W, H) \ + sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4; + SETUP_SCALE(4, 4); + SETUP_SCALE(8, 8); + SETUP_SCALE(8, 4); + SETUP_SCALE(4, 8); + SETUP_SCALE(16, 16); + SETUP_SCALE(16, 8); + SETUP_SCALE(8, 16); + SETUP_SCALE(16, 12); + SETUP_SCALE(12, 16); + SETUP_SCALE(4, 16); + SETUP_SCALE(16, 4); + SETUP_SCALE(32, 32); + SETUP_SCALE(32, 16); + SETUP_SCALE(16, 32); + SETUP_SCALE(32, 24); + SETUP_SCALE(24, 32); + SETUP_SCALE(32, 8); + SETUP_SCALE(8, 32); + SETUP_SCALE(64, 64); + SETUP_SCALE(64, 32); + SETUP_SCALE(32, 64); + SETUP_SCALE(64, 48); + SETUP_SCALE(48, 64); + SETUP_SCALE(64, 16); + SETUP_SCALE(16, 64); +#undef SETUP_SCALE +} + +int MotionEstimate::hpelIterationCount(int subme) +{ + return workload[subme].hpel_iters + + workload[subme].qpel_iters / 2; +} + MotionEstimate::~MotionEstimate() { fencPUYuv.destroy(); diff -Nru x265-1.5/source/encoder/motion.h x265-1.6/source/encoder/motion.h --- x265-1.5/source/encoder/motion.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/motion.h 2015-04-02 16:46:36.000000000 +0000 @@ -67,6 +67,8 @@ MotionEstimate(); ~MotionEstimate(); + static void initScales(); + static int hpelIterationCount(int subme); void init(int method, int refine, int csp); /* Methods called at slice setup */ diff -Nru x265-1.5/source/encoder/nal.cpp x265-1.6/source/encoder/nal.cpp --- x265-1.5/source/encoder/nal.cpp 2015-02-10 
21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/nal.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -107,7 +107,7 @@ * nuh_reserved_zero_6bits 6-bits * nuh_temporal_id_plus1 3-bits */ out[bytes++] = (uint8_t)nalUnitType << 1; - out[bytes++] = 1; + out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N); /* 7.4.1 ... * Within the NAL unit, the following three-byte sequences shall not occur at diff -Nru x265-1.5/source/encoder/ratecontrol.cpp x265-1.6/source/encoder/ratecontrol.cpp --- x265-1.5/source/encoder/ratecontrol.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/ratecontrol.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -145,30 +145,6 @@ } } // end anonymous namespace -/* Compute variance to derive AC energy of each block */ -static inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int i) -{ - uint32_t sum = (uint32_t)sum_ssd; - uint32_t ssd = (uint32_t)(sum_ssd >> 32); - - curFrame->m_lowres.wp_sum[i] += sum; - curFrame->m_lowres.wp_ssd[i] += ssd; - return ssd - ((uint64_t)sum * sum >> shift); -} - -/* Find the energy of each block in Y/Cb/Cr plane */ -static inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int bChroma, int colorFormat) -{ - if ((colorFormat != X265_CSP_I444) && bChroma) - { - ALIGN_VAR_8(pixel, pix[8 * 8]); - primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride); - return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, bChroma); - } - else - return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, bChroma); -} - /* Returns the zone for the current frame */ x265_zone* RateControl::getZone() { @@ -181,138 +157,9 @@ return NULL; } -/* Find the total AC energy of each block in all planes */ -uint32_t RateControl::acEnergyCu(Frame* curFrame, uint32_t block_x, uint32_t block_y) -{ - intptr_t stride = curFrame->m_fencPic->m_stride; - intptr_t cStride = curFrame->m_fencPic->m_strideC; - intptr_t blockOffsetLuma = block_x + (block_y * 
stride); - int colorFormat = m_param->internalCsp; - int hShift = CHROMA_H_SHIFT(colorFormat); - int vShift = CHROMA_V_SHIFT(colorFormat); - intptr_t blockOffsetChroma = (block_x >> hShift) + ((block_y >> vShift) * cStride); - - uint32_t var; - - var = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, colorFormat); - var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, colorFormat); - var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, colorFormat); - x265_emms(); - return var; -} - -void RateControl::calcAdaptiveQuantFrame(Frame *curFrame) +RateControl::RateControl(x265_param& p) { - /* Actual adaptive quantization */ - int maxCol = curFrame->m_fencPic->m_picWidth; - int maxRow = curFrame->m_fencPic->m_picHeight; - - for (int y = 0; y < 3; y++) - { - curFrame->m_lowres.wp_ssd[y] = 0; - curFrame->m_lowres.wp_sum[y] = 0; - } - - /* Calculate Qp offset for each 16x16 block in the frame */ - int block_xy = 0; - int block_x = 0, block_y = 0; - double strength = 0.f; - if (m_param->rc.aqMode == X265_AQ_NONE || m_param->rc.aqStrength == 0) - { - /* Need to init it anyways for CU tree */ - int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; - int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; - int cuCount = cuWidth * cuHeight; - - if (m_param->rc.aqMode && m_param->rc.aqStrength == 0) - { - memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double)); - memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double)); - for (int cuxy = 0; cuxy < cuCount; cuxy++) - curFrame->m_lowres.invQscaleFactor[cuxy] = 256; - } - - /* Need variance data for weighted prediction */ - if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred) - { - for (block_y = 0; block_y < maxRow; block_y += 16) - for (block_x = 0; block_x < maxCol; block_x += 16) - acEnergyCu(curFrame, block_x, 
block_y); - } - } - else - { - block_xy = 0; - double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0; - if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE) - { - double bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5); - for (block_y = 0; block_y < maxRow; block_y += 16) - { - for (block_x = 0; block_x < maxCol; block_x += 16) - { - uint32_t energy = acEnergyCu(curFrame, block_x, block_y); - qp_adj = pow(energy + 1, 0.1); - curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj; - avg_adj += qp_adj; - avg_adj_pow2 += qp_adj * qp_adj; - block_xy++; - } - } - - avg_adj /= m_ncu; - avg_adj_pow2 /= m_ncu; - strength = m_param->rc.aqStrength * avg_adj / bit_depth_correction; - avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj; - } - else - strength = m_param->rc.aqStrength * 1.0397f; - - block_xy = 0; - for (block_y = 0; block_y < maxRow; block_y += 16) - { - for (block_x = 0; block_x < maxCol; block_x += 16) - { - if (m_param->rc.aqMode == X265_AQ_AUTO_VARIANCE) - { - qp_adj = curFrame->m_lowres.qpCuTreeOffset[block_xy]; - qp_adj = strength * (qp_adj - avg_adj); - } - else - { - uint32_t energy = acEnergyCu(curFrame, block_x, block_y); - qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8))); - } - curFrame->m_lowres.qpAqOffset[block_xy] = qp_adj; - curFrame->m_lowres.qpCuTreeOffset[block_xy] = qp_adj; - curFrame->m_lowres.invQscaleFactor[block_xy] = x265_exp2fix8(qp_adj); - block_xy++; - } - } - } - - if (m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred) - { - int hShift = CHROMA_H_SHIFT(m_param->internalCsp); - int vShift = CHROMA_V_SHIFT(m_param->internalCsp); - maxCol = ((maxCol + 8) >> 4) << 4; - maxRow = ((maxRow + 8) >> 4) << 4; - int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift }; - int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift }; - - for (int i = 0; i < 3; i++) - { - uint64_t sum, ssd; - sum = curFrame->m_lowres.wp_sum[i]; - ssd = 
curFrame->m_lowres.wp_ssd[i]; - curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]); - } - } -} - -RateControl::RateControl(x265_param *p) -{ - m_param = p; + m_param = &p; int lowresCuWidth = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; int lowresCuHeight = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; m_ncu = lowresCuWidth * lowresCuHeight; @@ -329,13 +176,11 @@ m_partialResidualCost = 0; m_rateFactorMaxIncrement = 0; m_rateFactorMaxDecrement = 0; - m_fps = m_param->fpsNum / m_param->fpsDenom; + m_fps = (double)m_param->fpsNum / m_param->fpsDenom; m_startEndOrder.set(0); m_bTerminated = false; m_finalFrameCount = 0; m_numEntries = 0; - m_amortizeFraction = 0.85; - m_amortizeFrames = 75; if (m_param->rc.rateControlMode == X265_RC_CRF) { m_param->rc.qp = (int)m_param->rc.rfConstant; @@ -371,6 +216,7 @@ m_statFileOut = NULL; m_cutreeStatFileOut = m_cutreeStatFileIn = NULL; m_rce2Pass = NULL; + m_lastBsliceSatdCost = 0; // vbv initialization m_param->rc.vbvBufferSize = x265_clip3(0, 2000000, m_param->rc.vbvBufferSize); @@ -424,11 +270,6 @@ x265_log(m_param, X265_LOG_WARNING, "strict CBR set without CBR mode, ignored\n"); m_param->rc.bStrictCbr = 0; } - if (m_param->totalFrames <= 2 * m_fps && m_param->rc.bStrictCbr) /* Strict CBR segment encode */ - { - m_amortizeFraction = 0.85; - m_amortizeFrames = m_param->totalFrames / 2; - } if(m_param->rc.bStrictCbr) m_rateTolerance = 0.7; @@ -466,7 +307,7 @@ m_cuTreeStats.qpBuffer[i] = NULL; } -bool RateControl::init(const SPS *sps) +bool RateControl::init(const SPS& sps) { if (m_isVbv) { @@ -483,7 +324,7 @@ if (m_param->bEmitHRDSEI) { - const HRDInfo* hrd = &sps->vuiParameters.hrdParameters; + const HRDInfo* hrd = &sps.vuiParameters.hrdParameters; vbvBufferSize = hrd->cpbSizeValue << (hrd->cpbSizeScale + CPB_SHIFT); vbvMaxBitrate = hrd->bitRateValue << (hrd->bitRateScale + BR_SHIFT); } @@ -503,12 +344,21 @@ 
m_framesDone = 0; m_residualCost = 0; m_partialResidualCost = 0; + m_amortizeFraction = 0.85; + m_amortizeFrames = 75; + if (m_param->totalFrames && m_param->totalFrames <= 2 * m_fps && m_param->rc.bStrictCbr) /* Strict CBR segment encode */ + { + m_amortizeFraction = 0.85; + m_amortizeFrames = m_param->totalFrames / 2; + } for (int i = 0; i < s_slidingWindowFrames; i++) { m_satdCostWindow[i] = 0; m_encodedBitsWindow[i] = 0; } m_sliderPos = 0; + m_isPatternPresent = false; + m_numBframesInPattern = 0; /* 720p videos seem to be a good cutoff for cplxrSum */ double tuneCplxFactor = (m_param->rc.cuTree && m_ncu > 3600) ? 2.5 : 1; @@ -750,13 +600,13 @@ return true; } -void RateControl::initHRD(SPS *sps) +void RateControl::initHRD(SPS& sps) { int vbvBufferSize = m_param->rc.vbvBufferSize * 1000; int vbvMaxBitrate = m_param->rc.vbvMaxBitrate * 1000; // Init HRD - HRDInfo* hrd = &sps->vuiParameters.hrdParameters; + HRDInfo* hrd = &sps.vuiParameters.hrdParameters; hrd->cbrFlag = m_isCbr; // normalize HRD size and rate to the value / scale notation @@ -771,9 +621,9 @@ // arbitrary #define MAX_DURATION 0.5 - TimingInfo *time = &sps->vuiParameters.timingInfo; + TimingInfo *time = &sps.vuiParameters.timingInfo; int maxCpbOutputDelay = (int)(X265_MIN(m_param->keyframeMax * MAX_DURATION * time->timeScale / time->numUnitsInTick, INT_MAX)); - int maxDpbOutputDelay = (int)(sps->maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick); + int maxDpbOutputDelay = (int)(sps.maxDecPicBuffering * MAX_DURATION * time->timeScale / time->numUnitsInTick); int maxDelay = (int)(90000.0 * cpbSizeUnscale / bitRateUnscale + 0.5); hrd->initialCpbRemovalDelayLength = 2 + x265_clip3(4, 22, 32 - calcLength(maxDelay)); @@ -1082,7 +932,7 @@ int startOrdinal = rce->encodeOrder * 2; while (orderValue < startOrdinal && !m_bTerminated) - orderValue = m_startEndOrder.waitForChange(orderValue); + orderValue = m_startEndOrder.waitForChange(orderValue); if (!curFrame) { @@ -1160,6 +1010,27 
@@ m_currentSatd = curFrame->m_lowres.satdCost >> (X265_DEPTH - 8); /* Update rce for use in rate control VBV later */ rce->lastSatd = m_currentSatd; + X265_CHECK(rce->lastSatd, "satdcost cannot be zero\n"); + /* Detect a pattern for B frames with same SATDcost to identify a series of static frames + * and the P frame at the end of the series marks a possible case for ABR reset logic */ + if (m_param->bframes) + { + if (m_sliceType != B_SLICE && m_numBframesInPattern > m_param->bframes) + { + m_isPatternPresent = true; + } + else if (m_sliceType == B_SLICE && !IS_REFERENCED(curFrame)) + { + if (m_currentSatd != m_lastBsliceSatdCost && !rce->bLastMiniGopBFrame) + { + m_isPatternPresent = false; + m_lastBsliceSatdCost = m_currentSatd; + m_numBframesInPattern = 0; + } + else if (m_currentSatd == m_lastBsliceSatdCost) + m_numBframesInPattern++; + } + } } double q = x265_qScale2qp(rateEstimateQscale(curFrame, rce)); q = x265_clip3((double)QP_MIN, (double)QP_MAX_MAX, q); @@ -1197,10 +1068,29 @@ m_qp = (int32_t)(curFrame->m_forceqp + 0.5) - 1; m_qp = x265_clip3(QP_MIN, QP_MAX_MAX, m_qp); rce->qpaRc = curEncData.m_avgQpRc = curEncData.m_avgQpAq = m_qp; + if (m_isAbr || m_2pass) + { + rce->qpNoVbv = rce->qpaRc; + m_lastQScaleFor[m_sliceType] = x265_qp2qScale(rce->qpaRc); + if (rce->poc == 0) + m_lastQScaleFor[P_SLICE] = m_lastQScaleFor[m_sliceType] * fabs(m_param->rc.ipFactor); + rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], m_qp, (double)m_currentSatd); + } } - // Do not increment m_startEndOrder here. Make rateControlEnd of previous thread - // to wait until rateControlUpdateStats of this frame is called m_framesDone++; + + /* CQP and CRF (without capped VBV) doesn't use mid-frame statistics to + * tune RateControl parameters for other frames. + * Hence, for these modes, update m_startEndOrder and unlock RC for previous threads waiting in + * RateControlEnd here.those modes here. 
For the rest - ABR + * and VBV, unlock only after rateControlUpdateStats of this frame is called */ + if (m_param->rc.rateControlMode != X265_RC_ABR && !m_isVbv) + { + m_startEndOrder.incr(); + + if (rce->encodeOrder < m_param->frameNumThreads - 1) + m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames + } return m_qp; } @@ -1351,6 +1241,8 @@ if (m_rce2Pass[frame->m_poc].keptAsRef) { + /* TODO: We don't need pre-lookahead to measure AQ offsets, but there is currently + * no way to signal this */ uint8_t type; if (m_cuTreeStats.qpBufPos < 0) { @@ -1379,8 +1271,6 @@ } m_cuTreeStats.qpBufPos--; } - else - calcAdaptiveQuantFrame(frame); return true; fail: @@ -1439,7 +1329,7 @@ slidingWindowCplxSum *= 0.5; if (!m_satdCostWindow[pos]) break; - slidingWindowCplxSum += m_satdCostWindow[pos] / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION); + slidingWindowCplxSum += m_satdCostWindow[pos]; } rce->movingAvgSum = slidingWindowCplxSum; m_satdCostWindow[m_sliderPos % s_slidingWindowFrames] = rce->lastSatd; @@ -1489,32 +1379,36 @@ q += m_pbOffset; double qScale = x265_qp2qScale(q); - if (m_isCbr) + double lmin = 0, lmax = 0; + if (m_isVbv) { - qScale = tuneAbrQScaleFromFeedback(qScale); - if (!m_isAbrReset) - { - double lmin = m_lastQScaleFor[P_SLICE] / m_lstep; - double lmax = m_lastQScaleFor[P_SLICE] * m_lstep; - qScale = x265_clip3(lmin, lmax, qScale); + lmin = m_lastQScaleFor[P_SLICE] / m_lstep; + lmax = m_lastQScaleFor[P_SLICE] * m_lstep; + if (m_isCbr) + { + qScale = tuneAbrQScaleFromFeedback(qScale); + if (!m_isAbrReset) + qScale = x265_clip3(lmin, lmax, qScale); + q = x265_qScale2qp(qScale); + } + rce->qpNoVbv = q; + if (!m_2pass) + { + qScale = clipQscale(curFrame, rce, qScale); + /* clip qp to permissible range after vbv-lookahead estimation to avoid possible + * mispredictions by initial frame size predictors */ + if (m_pred[m_sliceType].count == 1) + qScale = x265_clip3(lmin, lmax, qScale); + m_lastQScaleFor[m_sliceType] = qScale; + 
rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], qScale, (double)m_currentSatd); } - q = x265_qScale2qp(qScale); - } - rce->qpNoVbv = q; - if (!m_2pass && m_isVbv) - { - qScale = clipQscale(curFrame, rce, qScale); - m_lastQScaleFor[m_sliceType] = qScale; - rce->frameSizePlanned = predictSize(&m_pred[m_sliceType], qScale, (double)m_currentSatd); - } - else if (m_2pass && m_isVbv) - { - rce->frameSizePlanned = qScale2bits(rce, qScale); - } - /* Limit planned size by MinCR */ - if (m_isVbv) + else + rce->frameSizePlanned = qScale2bits(rce, qScale); + + /* Limit planned size by MinCR */ rce->frameSizePlanned = X265_MIN(rce->frameSizePlanned, rce->frameSizeMaximum); - rce->frameSizeEstimated = rce->frameSizePlanned; + rce->frameSizeEstimated = rce->frameSizePlanned; + } rce->newQScale = qScale; return qScale; } @@ -1589,7 +1483,7 @@ * tolerances, the bit distribution approaches that of 2pass. */ double overflow = 1; - + double lqmin = MIN_QPSCALE, lqmax = MAX_MAX_QPSCALE; m_shortTermCplxSum *= 0.5; m_shortTermCplxCount *= 0.5; m_shortTermCplxSum += m_currentSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION); @@ -1623,7 +1517,6 @@ { if (m_param->rc.rateControlMode != X265_RC_CRF) { - double lqmin = 0, lqmax = 0; lqmin = m_lastQScaleFor[m_sliceType] / m_lstep; lqmax = m_lastQScaleFor[m_sliceType] * m_lstep; if (!m_partialResidualFrames) @@ -1643,12 +1536,16 @@ else if (m_framesDone == 0 && !m_isVbv && m_param->rc.rateControlMode == X265_RC_ABR) { /* for ABR alone, clip the first I frame qp */ - double lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep; + lqmax = x265_qp2qScale(ABR_INIT_QP_MAX) * m_lstep; q = X265_MIN(lqmax, q); } q = x265_clip3(MIN_QPSCALE, MAX_MAX_QPSCALE, q); rce->qpNoVbv = x265_qScale2qp(q); q = clipQscale(curFrame, rce, q); + /* clip qp to permissible range after vbv-lookahead estimation to avoid possible + * mispredictions by initial frame size predictors */ + if (!m_2pass && m_isVbv && m_pred[m_sliceType].count == 1) + q = 
x265_clip3(lqmin, lqmax, q); } m_lastQScaleFor[m_sliceType] = q; if ((m_curSlice->m_poc == 0 || m_lastQScaleFor[P_SLICE] < q) && !(m_2pass && !m_isVbv)) @@ -1680,9 +1577,23 @@ /* previous I still had a residual; roll it into the new loan */ if (m_partialResidualFrames) rce->rowTotalBits += m_partialResidualCost * m_partialResidualFrames; - - m_partialResidualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax); - m_partialResidualCost = (int)((rce->rowTotalBits * m_amortizeFraction) /m_partialResidualFrames); + if ((m_param->totalFrames != 0) && (m_amortizeFrames > (m_param->totalFrames - m_framesDone))) + { + m_amortizeFrames = 0; + m_amortizeFraction = 0; + } + else + { + double depreciateRate = 1.1; + m_amortizeFrames = (int)(m_amortizeFrames / depreciateRate); + m_amortizeFraction /= depreciateRate; + m_amortizeFrames = X265_MAX(m_amortizeFrames, MIN_AMORTIZE_FRAME); + m_amortizeFraction = X265_MAX(m_amortizeFraction, MIN_AMORTIZE_FRACTION); + } + rce->amortizeFrames = m_amortizeFrames; + rce->amortizeFraction = m_amortizeFraction; + m_partialResidualFrames = X265_MIN((int)rce->amortizeFrames, m_param->keyframeMax); + m_partialResidualCost = (int)((rce->rowTotalBits * rce->amortizeFraction) / m_partialResidualFrames); rce->rowTotalBits -= m_partialResidualCost * m_partialResidualFrames; } else if (m_partialResidualFrames) @@ -1701,10 +1612,13 @@ /* do not allow the next frame to enter rateControlStart() until this * frame has updated its mid-frame statistics */ - m_startEndOrder.incr(); + if (m_param->rc.rateControlMode == X265_RC_ABR || m_isVbv) + { + m_startEndOrder.incr(); - if (rce->encodeOrder < m_param->frameNumThreads - 1) - m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames + if (rce->encodeOrder < m_param->frameNumThreads - 1) + m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames + } } void RateControl::checkAndResetABR(RateControlEntry* rce, bool isFrameDone) @@ -1714,9 +1628,11 @@ // Check if current 
Slice is a scene cut that follows low detailed/blank frames if (rce->lastSatd > 4 * rce->movingAvgSum) { - if (!m_isAbrReset && rce->movingAvgSum > 0) + if (!m_isAbrReset && rce->movingAvgSum > 0 + && (m_isPatternPresent || !m_param->bframes)) { - int64_t shrtTermWantedBits = (int64_t) (X265_MIN(m_sliderPos, s_slidingWindowFrames) * m_bitrate * m_frameDuration); + int pos = X265_MAX(m_sliderPos - m_param->frameNumThreads, 0); + int64_t shrtTermWantedBits = (int64_t) (X265_MIN(pos, s_slidingWindowFrames) * m_bitrate * m_frameDuration); int64_t shrtTermTotalBitsSum = 0; // Reset ABR if prev frames are blank to prevent further sudden overflows/ high bit rate spikes. for (int i = 0; i < s_slidingWindowFrames ; i++) @@ -1725,14 +1641,14 @@ const double epsilon = 0.0001f; if (underflow < epsilon && !isFrameDone) { - init(m_curSlice->m_sps); + init(*m_curSlice->m_sps); m_shortTermCplxSum = rce->lastSatd / (CLIP_DURATION(m_frameDuration) / BASE_FRAME_DURATION); m_shortTermCplxCount = 1; m_isAbrReset = true; m_lastAbrResetPoc = rce->poc; } } - else + else if (m_isAbrReset && isFrameDone) { // Clear flag to reset ABR and continue as usual. 
m_isAbrReset = false; @@ -2299,8 +2215,8 @@ /* previous I still had a residual; roll it into the new loan */ if (m_residualFrames) bits += m_residualCost * m_residualFrames; - m_residualFrames = X265_MIN(m_amortizeFrames, m_param->keyframeMax); - m_residualCost = (int)((bits * m_amortizeFraction) / m_residualFrames); + m_residualFrames = X265_MIN((int)rce->amortizeFrames, m_param->keyframeMax); + m_residualCost = (int)((bits * rce->amortizeFraction) / m_residualFrames); bits -= m_residualCost * m_residualFrames; } else if (m_residualFrames) diff -Nru x265-1.5/source/encoder/ratecontrol.h x265-1.6/source/encoder/ratecontrol.h --- x265-1.5/source/encoder/ratecontrol.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/ratecontrol.h 2015-04-02 16:46:36.000000000 +0000 @@ -34,14 +34,16 @@ class Encoder; class Frame; -struct SPS; class SEIBufferingPeriod; +struct SPS; #define BASE_FRAME_DURATION 0.04 /* Arbitrary limitations as a sanity check. */ #define MAX_FRAME_DURATION 1.00 #define MIN_FRAME_DURATION 0.01 +#define MIN_AMORTIZE_FRAME 10 +#define MIN_AMORTIZE_FRACTION 0.2 #define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f) /* Current frame stats for 2 pass */ @@ -79,46 +81,50 @@ struct RateControlEntry { - int64_t lastSatd; /* Contains the picture cost of the previous frame, required for resetAbr and VBV */ - int sliceType; - int bframes; - int poc; - int encodeOrder; - int64_t leadingNoBSatd; - bool bLastMiniGopBFrame; - double blurredComplexity; - double qpaRc; - double qpAq; - double qRceq; - double frameSizePlanned; /* frame Size decided by RateCotrol before encoding the frame */ - double bufferRate; - double movingAvgSum; - double rowCplxrSum; - int64_t rowTotalBits; /* update cplxrsum and totalbits at the end of 2 rows */ - double qpNoVbv; - double bufferFill; - double frameDuration; - double clippedDuration; - Predictor rowPreds[3][2]; + Predictor rowPreds[3][2]; Predictor* rowPred[2]; - double frameSizeEstimated; /* hold 
frameSize, updated from cu level vbv rc */ - double frameSizeMaximum; /* max frame Size according to minCR restrictions and level of the video */ - bool isActive; - SEIPictureTiming *picTimingSEI; - HRDTiming *hrdTiming; + + int64_t lastSatd; /* Contains the picture cost of the previous frame, required for resetAbr and VBV */ + int64_t leadingNoBSatd; + int64_t rowTotalBits; /* update cplxrsum and totalbits at the end of 2 rows */ + double blurredComplexity; + double qpaRc; + double qpAq; + double qRceq; + double frameSizePlanned; /* frame Size decided by RateCotrol before encoding the frame */ + double bufferRate; + double movingAvgSum; + double rowCplxrSum; + double qpNoVbv; + double bufferFill; + double frameDuration; + double clippedDuration; + double frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */ + double frameSizeMaximum; /* max frame Size according to minCR restrictions and level of the video */ + int sliceType; + int bframes; + int poc; + int encodeOrder; + bool bLastMiniGopBFrame; + bool isActive; + double amortizeFrames; + double amortizeFraction; /* Required in 2-pass rate control */ - double iCuCount; - double pCuCount; - double skipCuCount; - bool keptAsRef; - double expectedVbv; - double qScale; - double newQScale; - double newQp; - int mvBits; - int miscBits; - int coeffBits; uint64_t expectedBits; /* total expected bits up to the current frame (current one excluded) */ + double iCuCount; + double pCuCount; + double skipCuCount; + double expectedVbv; + double qScale; + double newQScale; + double newQp; + int mvBits; + int miscBits; + int coeffBits; + bool keptAsRef; + + SEIPictureTiming *picTimingSEI; + HRDTiming *hrdTiming; }; class RateControl @@ -139,7 +145,7 @@ bool m_isAbrReset; int m_lastAbrResetPoc; - double m_rateTolerance; + double m_rateTolerance; double m_frameDuration; /* current frame duration in seconds */ double m_bitrate; double m_rateFactorConstant; @@ -154,33 +160,38 @@ Predictor m_pred[5]; Predictor 
m_predBfromP; - int m_leadingBframes; - int64_t m_bframeBits; - int64_t m_currentSatd; - int m_qpConstant[3]; - double m_ipOffset; - double m_pbOffset; - - int m_lastNonBPictType; - int64_t m_leadingNoBSatd; - - double m_cplxrSum; /* sum of bits*qscale/rceq */ - double m_wantedBitsWindow; /* target bitrate * window */ - double m_accumPQp; /* for determining I-frame quant */ - double m_accumPNorm; - double m_lastQScaleFor[3]; /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */ - double m_lstep; - double m_shortTermCplxSum; - double m_shortTermCplxCount; - double m_lastRceq; - double m_qCompress; - int64_t m_totalBits; /* total bits used for already encoded frames (after ammortization) */ - int m_framesDone; /* # of frames passed through RateCotrol already */ - int64_t m_encodedBits; /* bits used for encoded frames (without ammortization) */ - double m_fps; - int64_t m_satdCostWindow[50]; - int m_sliderPos; - int64_t m_encodedBitsWindow[50]; + int64_t m_leadingNoBSatd; + double m_ipOffset; + double m_pbOffset; + int64_t m_bframeBits; + int64_t m_currentSatd; + int m_leadingBframes; + int m_qpConstant[3]; + int m_lastNonBPictType; + int m_framesDone; /* # of frames passed through RateCotrol already */ + + double m_cplxrSum; /* sum of bits*qscale/rceq */ + double m_wantedBitsWindow; /* target bitrate * window */ + double m_accumPQp; /* for determining I-frame quant */ + double m_accumPNorm; + double m_lastQScaleFor[3]; /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */ + double m_lstep; + double m_shortTermCplxSum; + double m_shortTermCplxCount; + double m_lastRceq; + double m_qCompress; + int64_t m_totalBits; /* total bits used for already encoded frames (after ammortization) */ + int64_t m_encodedBits; /* bits used for encoded frames (without ammortization) */ + double m_fps; + int64_t m_satdCostWindow[50]; + int64_t m_encodedBitsWindow[50]; + int m_sliderPos; + + /* To detect a pattern of low detailed static 
frames in single pass ABR using satdcosts */ + int64_t m_lastBsliceSatdCost; + int m_numBframesInPattern; + bool m_isPatternPresent; + /* a common variable on which rateControlStart, rateControlEnd and rateControUpdateStats waits to * sync the calls to these functions. For example * -F2: @@ -194,24 +205,25 @@ * rceUpdate 12 * rceEnd 11 */ ThreadSafeInteger m_startEndOrder; - int m_finalFrameCount; /* set when encoder begins flushing */ - bool m_bTerminated; /* set true when encoder is closing */ + int m_finalFrameCount; /* set when encoder begins flushing */ + bool m_bTerminated; /* set true when encoder is closing */ /* hrd stuff */ SEIBufferingPeriod m_bufPeriodSEI; - double m_nominalRemovalTime; - double m_prevCpbFinalAT; + double m_nominalRemovalTime; + double m_prevCpbFinalAT; /* 2 pass */ - bool m_2pass; - FILE* m_statFileOut; - FILE* m_cutreeStatFileOut; - FILE* m_cutreeStatFileIn; - int m_numEntries; - RateControlEntry *m_rce2Pass; - double m_lastAccumPNorm; - int64_t m_predictedBits; - double m_expectedBitsSum; /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */ + bool m_2pass; + int m_numEntries; + FILE* m_statFileOut; + FILE* m_cutreeStatFileOut; + FILE* m_cutreeStatFileIn; + double m_lastAccumPNorm; + double m_expectedBitsSum; /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */ + int64_t m_predictedBits; + RateControlEntry* m_rce2Pass; + struct { uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */ @@ -219,55 +231,54 @@ * This value is the current position (0 or 1). 
*/ } m_cuTreeStats; - RateControl(x265_param *p); + RateControl(x265_param& p); + bool init(const SPS& sps); + void initHRD(SPS& sps); + void setFinalFrameCount(int count); void terminate(); /* un-block all waiting functions so encoder may close */ void destroy(); // to be called for each curFrame to process RateControl and set QP - int rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc); - void calcAdaptiveQuantFrame(Frame *curFrame); + int rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc); void rateControlUpdateStats(RateControlEntry* rce); - int rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats); - int rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv); - void hrdFullness(SEIBufferingPeriod* sei); - bool init(const SPS* sps); - void initHRD(SPS* sps); - int rateControlSliceType(int frameNum); + int rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce, FrameStats* stats); + int rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv); + int rateControlSliceType(int frameNum); bool cuTreeReadFor2Pass(Frame* curFrame); - double tuneAbrQScaleFromFeedback(double qScale); + void hrdFullness(SEIBufferingPeriod* sei); protected: - static const int s_slidingWindowFrames; - static const char *s_defaultStatFileName; + static const int s_slidingWindowFrames; + static const char* s_defaultStatFileName; - int m_residualFrames; - int m_partialResidualFrames; - int m_residualCost; - int m_partialResidualCost; - int m_amortizeFrames; double m_amortizeFraction; + int m_amortizeFrames; + int m_residualFrames; + int m_partialResidualFrames; + int m_residualCost; + int m_partialResidualCost; x265_zone* getZone(); double getQScale(RateControlEntry *rce, double rateFactor); double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR - void accumPQpUpdate(); - 
uint32_t acEnergyCu(Frame* pic, uint32_t block_x, uint32_t block_y); + double tuneAbrQScaleFromFeedback(double qScale); + void accumPQpUpdate(); - void updateVbv(int64_t bits, RateControlEntry* rce); - void updatePredictor(Predictor *p, double q, double var, double bits); + void updateVbv(int64_t bits, RateControlEntry* rce); + void updatePredictor(Predictor *p, double q, double var, double bits); double clipQscale(Frame* pic, RateControlEntry* rce, double q); - void updateVbvPlan(Encoder* enc); + void updateVbvPlan(Encoder* enc); double predictSize(Predictor *p, double q, double var); - void checkAndResetABR(RateControlEntry* rce, bool isFrameDone); + void checkAndResetABR(RateControlEntry* rce, bool isFrameDone); double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits); - bool initPass2(); + bool initPass2(); double getDiffLimitedQScale(RateControlEntry *rce, double q); double countExpectedBits(); - bool vbv2Pass(uint64_t allAvailableBits); - bool findUnderflow(double *fills, int *t0, int *t1, int over); - bool fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax); + bool vbv2Pass(uint64_t allAvailableBits); + bool findUnderflow(double *fills, int *t0, int *t1, int over); + bool fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax); }; } #endif // ifndef X265_RATECONTROL_H diff -Nru x265-1.5/source/encoder/sao.cpp x265-1.6/source/encoder/sao.cpp --- x265-1.5/source/encoder/sao.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/sao.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -261,6 +261,8 @@ int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1; int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1; + memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */ + { const pixel* recR = &rec[ctuWidth - 1]; for (int i = 0; i < ctuHeight + 1; i++) diff -Nru x265-1.5/source/encoder/search.cpp 
x265-1.6/source/encoder/search.cpp --- x265-1.5/source/encoder/search.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/search.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -30,6 +30,9 @@ #include "entropy.h" #include "rdcost.h" +#include "analysis.h" // TLD +#include "framedata.h" + using namespace x265; #if _MSC_VER @@ -40,10 +43,9 @@ #define MVP_IDX_BITS 1 -ALIGN_VAR_32(const pixel, Search::zeroPixel[MAX_CU_SIZE]) = { 0 }; ALIGN_VAR_32(const int16_t, Search::zeroShort[MAX_CU_SIZE]) = { 0 }; -Search::Search() : JobProvider(NULL) +Search::Search() { memset(m_rqt, 0, sizeof(m_rqt)); @@ -54,25 +56,30 @@ } m_numLayers = 0; + m_intraPred = NULL; + m_intraPredAngs = NULL; + m_fencScaled = NULL; + m_fencTransposed = NULL; + m_tsCoeff = NULL; + m_tsResidual = NULL; + m_tsRecon = NULL; m_param = NULL; m_slice = NULL; m_frame = NULL; - m_bJobsQueued = false; - m_totalNumME = m_numAcquiredME = m_numCompletedME = 0; } bool Search::initSearch(const x265_param& param, ScalingList& scalingList) { uint32_t maxLog2CUSize = g_log2Size[param.maxCUSize]; m_param = &param; - m_bEnableRDOQ = param.rdLevel >= 4; + m_bEnableRDOQ = !!param.rdoqLevel; m_bFrameParallel = param.frameNumThreads > 1; m_numLayers = g_log2Size[param.maxCUSize] - 2; m_rdCost.setPsyRdScale(param.psyRd); m_me.init(param.searchMethod, param.subpelRefine, param.internalCsp); - bool ok = m_quant.init(m_bEnableRDOQ, param.psyRdoq, scalingList, m_entropyCoder); + bool ok = m_quant.init(param.rdoqLevel, param.psyRdoq, scalingList, m_entropyCoder); if (m_param->noiseReductionIntra || m_param->noiseReductionInter) ok &= m_quant.allocNoiseReduction(param); @@ -116,6 +123,15 @@ m_qtTempTransformSkipFlag[1] = m_qtTempTransformSkipFlag[0] + numPartitions; m_qtTempTransformSkipFlag[2] = m_qtTempTransformSkipFlag[0] + numPartitions * 2; + CHECKED_MALLOC(m_intraPred, pixel, (32 * 32) * (33 + 3)); + m_fencScaled = m_intraPred + 32 * 32; + m_fencTransposed = m_fencScaled + 32 * 32; + m_intraPredAngs = 
m_fencTransposed + 
32 * 32; + + CHECKED_MALLOC(m_tsCoeff, coeff_t, MAX_TS_SIZE * MAX_TS_SIZE); + CHECKED_MALLOC(m_tsResidual, int16_t, MAX_TS_SIZE * MAX_TS_SIZE); + CHECKED_MALLOC(m_tsRecon, pixel, MAX_TS_SIZE * MAX_TS_SIZE); + return ok; fail: @@ -141,6 +157,10 @@ X265_FREE(m_qtTempCbf[0]); X265_FREE(m_qtTempTransformSkipFlag[0]); + X265_FREE(m_intraPred); + X265_FREE(m_tsCoeff); + X265_FREE(m_tsResidual); + X265_FREE(m_tsRecon); } void Search::setQP(const Slice& slice, int qp) @@ -421,7 +441,7 @@ } // set reconstruction for next intra prediction blocks if full TU prediction won - pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); intptr_t picStride = m_frame->m_reconPic->m_stride; primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride); @@ -477,17 +497,14 @@ if (m_bEnableRDOQ) m_entropyCoder.estBit(m_entropyCoder.m_estBitsSbac, log2TrSize, true); - ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]); - ALIGN_VAR_32(pixel, tsReconY[MAX_TS_SIZE * MAX_TS_SIZE]); - int checkTransformSkip = 1; for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++) { uint64_t tmpCost; uint32_t tmpEnergy = 0; - coeff_t* coeff = (useTSkip ? tsCoeffY : coeffY); - pixel* tmpRecon = (useTSkip ? tsReconY : reconQt); + coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffY); + pixel* tmpRecon = (useTSkip ? m_tsRecon : reconQt); uint32_t tmpReconStride = (useTSkip ? 
MAX_TS_SIZE : reconQtStride); primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); @@ -578,8 +595,8 @@ if (bTSkip) { - memcpy(coeffY, tsCoeffY, sizeof(coeff_t) << (log2TrSize * 2)); - primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, tsReconY, tuSize); + memcpy(coeffY, m_tsCoeff, sizeof(coeff_t) << (log2TrSize * 2)); + primitives.cu[sizeIdx].copy_pp(reconQt, reconQtStride, m_tsRecon, tuSize); } else if (checkTransformSkip) { @@ -589,7 +606,7 @@ } // set reconstruction for next intra prediction blocks - pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); intptr_t picStride = m_frame->m_reconPic->m_stride; primitives.cu[sizeIdx].copy_pp(picReconY, picStride, reconQt, reconQtStride); @@ -639,7 +656,7 @@ uint32_t sizeIdx = log2TrSize - 2; primitives.cu[sizeIdx].calcresidual(fenc, pred, residual, stride); - pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + pixel* picReconY = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); intptr_t picStride = m_frame->m_reconPic->m_stride; uint32_t numSig = m_quant.transformNxN(cu, fenc, stride, residual, stride, coeffY, log2TrSize, TEXT_LUMA, absPartIdx, false); @@ -799,7 +816,7 @@ coeff_t* coeffC = m_rqt[qtLayer].coeffRQT[chromaId] + coeffOffsetC; pixel* reconQt = m_rqt[qtLayer].reconQtYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t reconQtStride = m_rqt[qtLayer].reconQtYuv.m_csize; - pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); intptr_t picStride = m_frame->m_reconPic->m_strideC; uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; @@ -812,7 +829,7 @@ initAdiPatternChroma(cu, cuGeom, absPartIdxC, 
intraNeighbors, chromaId); // get prediction signal - predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp); + predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); cu.setTransformSkipPartRange(0, ttype, absPartIdxC, tuIterator.absPartIdxStep); primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); @@ -864,9 +881,6 @@ * condition as it arrived, and to do all bit estimates from the same state. */ m_entropyCoder.store(m_rqt[fullDepth].rqtRoot); - ALIGN_VAR_32(coeff_t, tskipCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); - ALIGN_VAR_32(pixel, tskipReconC[MAX_TS_SIZE * MAX_TS_SIZE]); - uint32_t curPartNum = cuGeom.numPartitions >> tuDepthC * 2; const SplitType splitType = (m_csp == X265_CSP_I422) ? VERTICAL_SPLIT : DONT_SPLIT; @@ -903,7 +917,7 @@ chromaPredMode = g_chroma422IntraAngleMappingTable[chromaPredMode]; // get prediction signal - predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp); + predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); uint64_t bCost = MAX_INT64; uint32_t bDist = 0; @@ -914,8 +928,8 @@ int checkTransformSkip = 1; for (int useTSkip = 0; useTSkip <= checkTransformSkip; useTSkip++) { - coeff_t* coeff = (useTSkip ? tskipCoeffC : coeffC); - pixel* recon = (useTSkip ? tskipReconC : reconQt); + coeff_t* coeff = (useTSkip ? m_tsCoeff : coeffC); + pixel* recon = (useTSkip ? m_tsRecon : reconQt); uint32_t reconStride = (useTSkip ? 
MAX_TS_SIZE : reconQtStride); primitives.cu[sizeIdxC].calcresidual(fenc, pred, residual, stride); @@ -972,14 +986,14 @@ if (bTSkip) { - memcpy(coeffC, tskipCoeffC, sizeof(coeff_t) << (log2TrSizeC * 2)); - primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, tskipReconC, MAX_TS_SIZE); + memcpy(coeffC, m_tsCoeff, sizeof(coeff_t) << (log2TrSizeC * 2)); + primitives.cu[sizeIdxC].copy_pp(reconQt, reconQtStride, m_tsRecon, MAX_TS_SIZE); } cu.setCbfPartRange(bCbf << tuDepth, ttype, absPartIdxC, tuIterator.absPartIdxStep); cu.setTransformSkipPartRange(bTSkip, ttype, absPartIdxC, tuIterator.absPartIdxStep); - pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + pixel* reconPicC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); intptr_t picStride = m_frame->m_reconPic->m_strideC; primitives.cu[sizeIdxC].copy_pp(reconPicC, picStride, reconQt, reconQtStride); @@ -1089,7 +1103,7 @@ int16_t* residual = resiYuv.getChromaAddr(chromaId, absPartIdxC); uint32_t coeffOffsetC = absPartIdxC << (LOG2_UNIT_SIZE * 2 - (m_hChromaShift + m_vChromaShift)); coeff_t* coeffC = cu.m_trCoeff[ttype] + coeffOffsetC; - pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.encodeIdx + absPartIdxC); + pixel* picReconC = m_frame->m_reconPic->getChromaAddr(chromaId, cu.m_cuAddr, cuGeom.absPartIdx + absPartIdxC); intptr_t picStride = m_frame->m_reconPic->m_strideC; uint32_t chromaPredMode = cu.m_chromaIntraDir[absPartIdxC]; @@ -1102,7 +1116,7 @@ initAdiPatternChroma(cu, cuGeom, absPartIdxC, intraNeighbors, chromaId); // get prediction signal - predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC, m_csp); + predIntraChromaAng(chromaPredMode, pred, stride, log2TrSizeC); X265_CHECK(!cu.m_transformSkip[ttype][0], "transform skip not supported at low RD levels\n"); @@ -1131,7 +1145,7 @@ } } -void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize 
partSize, uint8_t* sharedModes) +void Search::checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes) { CUData& cu = intraMode.cu; @@ -1144,7 +1158,7 @@ intraMode.initCosts(); intraMode.distortion += estIntraPredQT(intraMode, cuGeom, tuDepthRange, sharedModes); - intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); + intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom, sharedChromaModes); m_entropyCoder.resetBits(); if (m_slice->m_pps->bTransquantBypassEnabled) @@ -1170,14 +1184,16 @@ const Yuv* fencYuv = intraMode.fencYuv; intraMode.psyEnergy = m_rdCost.psyCost(cuGeom.log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, intraMode.reconYuv.m_buf[0], intraMode.reconYuv.m_size); } - updateModeCost(intraMode); + checkDQP(cu, cuGeom); } /* Note that this function does not save the best intra prediction, it must * be generated later. It records the best mode in the cu */ void Search::checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom) { + ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis); + CUData& cu = intraMode.cu; uint32_t depth = cuGeom.depth; @@ -1202,9 +1218,6 @@ uint64_t cost, bcost; // 33 Angle modes once - ALIGN_VAR_32(pixel, bufScale[32 * 32]); - ALIGN_VAR_32(pixel, bufTrans[32 * 32]); - ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); int scaleTuSize = tuSize; int scaleStride = stride; int costShift = 0; @@ -1212,15 +1225,15 @@ if (tuSize > 32) { - // origin is 64x64, we scale to 32x32 and setup required parameters - primitives.scale2D_64to32(bufScale, fenc, stride); - fenc = bufScale; + // CU is 64x64, we scale to 32x32 and adjust required parameters + primitives.scale2D_64to32(m_fencScaled, fenc, stride); + fenc = m_fencScaled; pixel nScale[129]; intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0]; primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1, 0); - //TO DO: primitive + // we do not estimate filtering for downscaled samples for (int x = 
1; x < 65; x++) { intraNeighbourBuf[0][x] = nScale[x]; // Top pixel @@ -1245,14 +1258,14 @@ * pred[1], pred[2] - less probable, slightly more cost * non-mpm modes - all cost the same (rbits) */ uint64_t mpms; - uint32_t preds[3]; - uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); + uint32_t mpmModes[3]; + uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms); // DC - primitives.cu[sizeIdx].intra_pred[DC_IDX](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); - bsad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPredAngs, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); + bsad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift; bmode = mode = DC_IDX; - bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + bbits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; bcost = m_rdCost.calcRdSADCost(bsad, bbits); // PLANAR @@ -1260,18 +1273,18 @@ if (tuSize & (8 | 16 | 32)) planar = intraNeighbourBuf[1]; - primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](tmp, scaleStride, planar, 0, 0); - sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; + primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPredAngs, scaleStride, planar, 0, 0); + sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleStride) << costShift; mode = PLANAR_IDX; - bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; + bits = (mpms & ((uint64_t)1 << mode)) ? 
m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; cost = m_rdCost.calcRdSADCost(sad, bits); COPY4_IF_LT(bcost, cost, bmode, mode, bsad, sad, bbits, bits); bool allangs = true; if (primitives.cu[sizeIdx].intra_pred_allangs) { - primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride); - primitives.cu[sizeIdx].intra_pred_allangs(tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); + primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride); + primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); } else allangs = false; @@ -1279,16 +1292,16 @@ #define TRY_ANGLE(angle) \ if (allangs) { \ if (angle < 18) \ - sad = sa8d(bufTrans, scaleTuSize, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ + sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \ else \ - sad = sa8d(fenc, scaleStride, &tmp[(angle - 2) * predsize], scaleTuSize) << costShift; \ - bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ + sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(angle - 2) * predsize], scaleTuSize) << costShift; \ + bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \ cost = m_rdCost.calcRdSADCost(sad, bits); \ } else { \ int filter = !!(g_intraFilterFlags[angle] & scaleTuSize); \ - primitives.cu[sizeIdx].intra_pred[angle](tmp, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \ - sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; \ - bits = (mpms & ((uint64_t)1 << angle)) ? m_entropyCoder.bitsIntraModeMPM(preds, angle) : rbits; \ + primitives.cu[sizeIdx].intra_pred[angle](m_intraPredAngs, scaleTuSize, intraNeighbourBuf[filter], angle, scaleTuSize <= 16); \ + sad = sa8d(fenc, scaleStride, m_intraPredAngs, scaleTuSize) << costShift; \ + bits = (mpms & ((uint64_t)1 << angle)) ? 
m_entropyCoder.bitsIntraModeMPM(mpmModes, angle) : rbits; \ cost = m_rdCost.calcRdSADCost(sad, bits); \ } @@ -1343,10 +1356,13 @@ intraMode.distortion = bsad; intraMode.sa8dCost = bcost; intraMode.sa8dBits = bbits; + X265_CHECK(intraMode.ok(), "intra mode is not ok"); } void Search::encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom) { + ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); + CUData& cu = intraMode.cu; Yuv* reconYuv = &intraMode.reconYuv; @@ -1365,7 +1381,7 @@ extractIntraResultQT(cu, *reconYuv, 0, 0); intraMode.distortion = icosts.distortion; - intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom); + intraMode.distortion += estIntraPredChromaQT(intraMode, cuGeom, NULL); m_entropyCoder.resetBits(); if (m_slice->m_pps->bTransquantBypassEnabled) @@ -1389,6 +1405,7 @@ m_entropyCoder.store(intraMode.contexts); updateModeCost(intraMode); + checkDQP(intraMode.cu, cuGeom); } uint32_t Search::estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes) @@ -1419,123 +1436,123 @@ bmode = sharedModes[puIdx]; else { - // Reference sample smoothing - IntraNeighbors intraNeighbors; - initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors); - initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX); - - // determine set of modes to be tested (using prediction signal only) - const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); - uint32_t stride = predYuv->m_size; - - // 33 Angle modes once - ALIGN_VAR_32(pixel, bufTrans[32 * 32]); - ALIGN_VAR_32(pixel, tmp[33 * 32 * 32]); - - int scaleTuSize = tuSize; - int scaleStride = stride; - int costShift = 0; + uint64_t candCostList[MAX_RD_INTRA_MODES]; + uint32_t rdModeList[MAX_RD_INTRA_MODES]; + uint64_t bcost; + int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1); - if (tuSize > 32) { - // origin is 64x64, we scale to 32x32 and setup required parameters - ALIGN_VAR_32(pixel, 
bufScale[32 * 32]); - primitives.scale2D_64to32(bufScale, fenc, stride); - fenc = bufScale; + ProfileCUScope(intraMode.cu, intraAnalysisElapsedTime, countIntraAnalysis); - pixel nScale[129]; - intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0]; - primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1, 0); + // Reference sample smoothing + IntraNeighbors intraNeighbors; + initIntraNeighbors(cu, absPartIdx, initTuDepth, true, &intraNeighbors); + initAdiPattern(cu, cuGeom, absPartIdx, intraNeighbors, ALL_IDX); - // TO DO: primitive - for (int x = 1; x < 65; x++) - { - intraNeighbourBuf[0][x] = nScale[x]; // Top pixel - intraNeighbourBuf[0][x + 64] = nScale[x + 64]; // Left pixel - intraNeighbourBuf[1][x] = nScale[x]; // Top pixel - intraNeighbourBuf[1][x + 64] = nScale[x + 64]; // Left pixel - } + // determine set of modes to be tested (using prediction signal only) + const pixel* fenc = fencYuv->getLumaAddr(absPartIdx); + uint32_t stride = predYuv->m_size; - scaleTuSize = 32; - scaleStride = 32; - costShift = 2; - sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 - } + int scaleTuSize = tuSize; + int scaleStride = stride; + int costShift = 0; - m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + if (tuSize > 32) + { + // origin is 64x64, we scale to 32x32 and setup required parameters + primitives.scale2D_64to32(m_fencScaled, fenc, stride); + fenc = m_fencScaled; - /* there are three cost tiers for intra modes: - * pred[0] - mode probable, least cost - * pred[1], pred[2] - less probable, slightly more cost - * non-mpm modes - all cost the same (rbits) */ - uint64_t mpms; - uint32_t preds[3]; - uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, preds, mpms); + pixel nScale[129]; + intraNeighbourBuf[1][0] = intraNeighbourBuf[0][0]; + primitives.scale1D_128to64(nScale + 1, intraNeighbourBuf[0] + 1, 0); - pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d; - uint64_t modeCosts[35]; - uint64_t bcost; + // TO DO: primitive + for (int x = 1; x < 65; x++) + { + 
intraNeighbourBuf[0][x] = nScale[x]; // Top pixel + intraNeighbourBuf[0][x + 64] = nScale[x + 64]; // Left pixel + intraNeighbourBuf[1][x] = nScale[x]; // Top pixel + intraNeighbourBuf[1][x + 64] = nScale[x + 64]; // Left pixel + } - // DC - primitives.cu[sizeIdx].intra_pred[DC_IDX](tmp, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); - uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, DC_IDX) : rbits; - uint32_t sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; - modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); - - // PLANAR - pixel* planar = intraNeighbourBuf[0]; - if (tuSize >= 8 && tuSize <= 32) - planar = intraNeighbourBuf[1]; - - primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](tmp, scaleStride, planar, 0, 0); - bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? m_entropyCoder.bitsIntraModeMPM(preds, PLANAR_IDX) : rbits; - sad = sa8d(fenc, scaleStride, tmp, scaleStride) << costShift; - modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits); - COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]); - - // angular predictions - if (primitives.cu[sizeIdx].intra_pred_allangs) - { - primitives.cu[sizeIdx].transpose(bufTrans, fenc, scaleStride); - primitives.cu[sizeIdx].intra_pred_allangs(tmp, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); - for (int mode = 2; mode < 35; mode++) - { - bits = (mpms & ((uint64_t)1 << mode)) ? 
m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; - if (mode < 18) - sad = sa8d(bufTrans, scaleTuSize, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; - else - sad = sa8d(fenc, scaleStride, &tmp[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; - modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); - COPY1_IF_LT(bcost, modeCosts[mode]); + scaleTuSize = 32; + scaleStride = 32; + costShift = 2; + sizeIdx = 5 - 2; // log2(scaleTuSize) - 2 + } + + m_entropyCoder.loadIntraDirModeLuma(m_rqt[depth].cur); + + /* there are three cost tiers for intra modes: + * pred[0] - mode probable, least cost + * pred[1], pred[2] - less probable, slightly more cost + * non-mpm modes - all cost the same (rbits) */ + uint64_t mpms; + uint32_t mpmModes[3]; + uint32_t rbits = getIntraRemModeBits(cu, absPartIdx, mpmModes, mpms); + + pixelcmp_t sa8d = primitives.cu[sizeIdx].sa8d; + uint64_t modeCosts[35]; + + // DC + primitives.cu[sizeIdx].intra_pred[DC_IDX](m_intraPred, scaleStride, intraNeighbourBuf[0], 0, (scaleTuSize <= 16)); + uint32_t bits = (mpms & ((uint64_t)1 << DC_IDX)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, DC_IDX) : rbits; + uint32_t sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift; + modeCosts[DC_IDX] = bcost = m_rdCost.calcRdSADCost(sad, bits); + + // PLANAR + pixel* planar = intraNeighbourBuf[0]; + if (tuSize >= 8 && tuSize <= 32) + planar = intraNeighbourBuf[1]; + + primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](m_intraPred, scaleStride, planar, 0, 0); + bits = (mpms & ((uint64_t)1 << PLANAR_IDX)) ? 
m_entropyCoder.bitsIntraModeMPM(mpmModes, PLANAR_IDX) : rbits; + sad = sa8d(fenc, scaleStride, m_intraPred, scaleStride) << costShift; + modeCosts[PLANAR_IDX] = m_rdCost.calcRdSADCost(sad, bits); + COPY1_IF_LT(bcost, modeCosts[PLANAR_IDX]); + + // angular predictions + if (primitives.cu[sizeIdx].intra_pred_allangs) + { + primitives.cu[sizeIdx].transpose(m_fencTransposed, fenc, scaleStride); + primitives.cu[sizeIdx].intra_pred_allangs(m_intraPredAngs, intraNeighbourBuf[0], intraNeighbourBuf[1], (scaleTuSize <= 16)); + for (int mode = 2; mode < 35; mode++) + { + bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; + if (mode < 18) + sad = sa8d(m_fencTransposed, scaleTuSize, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; + else + sad = sa8d(fenc, scaleStride, &m_intraPredAngs[(mode - 2) * (scaleTuSize * scaleTuSize)], scaleTuSize) << costShift; + modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); + COPY1_IF_LT(bcost, modeCosts[mode]); + } } - } - else - { - for (int mode = 2; mode < 35; mode++) + else { - bits = (mpms & ((uint64_t)1 << mode)) ? m_entropyCoder.bitsIntraModeMPM(preds, mode) : rbits; - int filter = !!(g_intraFilterFlags[mode] & scaleTuSize); - primitives.cu[sizeIdx].intra_pred[mode](tmp, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16); - sad = sa8d(fenc, scaleStride, tmp, scaleTuSize) << costShift; - modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); - COPY1_IF_LT(bcost, modeCosts[mode]); + for (int mode = 2; mode < 35; mode++) + { + bits = (mpms & ((uint64_t)1 << mode)) ? 
m_entropyCoder.bitsIntraModeMPM(mpmModes, mode) : rbits; + int filter = !!(g_intraFilterFlags[mode] & scaleTuSize); + primitives.cu[sizeIdx].intra_pred[mode](m_intraPred, scaleTuSize, intraNeighbourBuf[filter], mode, scaleTuSize <= 16); + sad = sa8d(fenc, scaleStride, m_intraPred, scaleTuSize) << costShift; + modeCosts[mode] = m_rdCost.calcRdSADCost(sad, bits); + COPY1_IF_LT(bcost, modeCosts[mode]); + } } - } - - /* Find the top maxCandCount candidate modes with cost within 25% of best - * or among the most probable modes. maxCandCount is derived from the - * rdLevel and depth. In general we want to try more modes at slower RD - * levels and at higher depths */ - uint64_t candCostList[MAX_RD_INTRA_MODES]; - uint32_t rdModeList[MAX_RD_INTRA_MODES]; - int maxCandCount = 2 + m_param->rdLevel + ((depth + initTuDepth) >> 1); - for (int i = 0; i < maxCandCount; i++) - candCostList[i] = MAX_INT64; - uint64_t paddedBcost = bcost + (bcost >> 3); // 1.12% - for (int mode = 0; mode < 35; mode++) - if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode))) - updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList); + /* Find the top maxCandCount candidate modes with cost within 25% of best + * or among the most probable modes. maxCandCount is derived from the + * rdLevel and depth. 
In general we want to try more modes at slower RD + * levels and at higher depths */ + for (int i = 0; i < maxCandCount; i++) + candCostList[i] = MAX_INT64; + + uint64_t paddedBcost = bcost + (bcost >> 3); // 1.12% + for (int mode = 0; mode < 35; mode++) + if (modeCosts[mode] < paddedBcost || (mpms & ((uint64_t)1 << mode))) + updateCandList(mode, modeCosts[mode], maxCandCount, rdModeList, candCostList); + } /* measure best candidates using simple RDO (no TU splits) */ bcost = MAX_INT64; @@ -1543,6 +1560,9 @@ { if (candCostList[i] == MAX_INT64) break; + + ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); + m_entropyCoder.load(m_rqt[depth].cur); cu.setLumaIntraDirSubParts(rdModeList[i], absPartIdx, depth + initTuDepth); @@ -1555,6 +1575,8 @@ } } + ProfileCUScope(intraMode.cu, intraRDOElapsedTime[cuGeom.depth], countIntraRDO[cuGeom.depth]); + /* remeasure best mode, allowing TU splits */ cu.setLumaIntraDirSubParts(bmode, absPartIdx, depth + initTuDepth); m_entropyCoder.load(m_rqt[depth].cur); @@ -1575,7 +1597,7 @@ * output recon picture, so it cannot proceed in parallel with anything else when doing INTRA_NXN. Also * it is not updating m_rdContexts[depth].cur for the later PUs which I suspect is slightly wrong. 
I think * that the contexts should be tracked through each PU */ - pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + absPartIdx); + pixel* dst = m_frame->m_reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + absPartIdx); uint32_t dststride = m_frame->m_reconPic->m_stride; const pixel* src = reconYuv->getLumaAddr(absPartIdx); uint32_t srcstride = reconYuv->m_size; @@ -1641,7 +1663,7 @@ pixel* pred = predYuv->m_buf[chromaId]; Predict::initAdiPatternChroma(cu, cuGeom, 0, intraNeighbors, chromaId); // get prediction signal - predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC, m_csp); + predIntraChromaAng(chromaPredMode, pred, fencYuv->m_csize, log2TrSizeC); cost += primitives.cu[log2TrSizeC - 2].sa8d(fenc, predYuv->m_csize, pred, fencYuv->m_csize) << costShift; } @@ -1655,7 +1677,7 @@ cu.setChromIntraDirSubParts(bestMode, 0, cuGeom.depth); } -uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom) +uint32_t Search::estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes) { CUData& cu = intraMode.cu; Yuv& reconYuv = intraMode.reconYuv; @@ -1683,7 +1705,14 @@ uint32_t maxMode = NUM_CHROMA_MODE; uint32_t modeList[NUM_CHROMA_MODE]; - cu.getAllowedChromaDir(absPartIdxC, modeList); + if (sharedChromaModes && !initTuDepth) + { + for (uint32_t l = 0; l < NUM_CHROMA_MODE; l++) + modeList[l] = sharedChromaModes[0]; + maxMode = 1; + } + else + cu.getAllowedChromaDir(absPartIdxC, modeList); // check chroma modes for (uint32_t mode = minMode; mode < maxMode; mode++) @@ -1733,7 +1762,7 @@ if (!tuIterator.isLastSection()) { - uint32_t zorder = cuGeom.encodeIdx + absPartIdxC; + uint32_t zorder = cuGeom.absPartIdx + absPartIdxC; uint32_t dststride = m_frame->m_reconPic->m_strideC; const pixel* src; pixel* dst; @@ -1779,22 +1808,24 @@ return totalDistortion; } -/* estimation of best merge coding of an inter PU (not a merge CU) */ -uint32_t Search::mergeEstimation(CUData& cu, const 
CUGeom& cuGeom, int puIdx, MergeData& m) +/* estimation of best merge coding of an inter PU (2Nx2N merge PUs are evaluated as their own mode) */ +uint32_t Search::mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m) { - X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "merge tested on non-2Nx2N partition\n"); + X265_CHECK(cu.m_partSize[0] != SIZE_2Nx2N, "mergeEstimation() called for 2Nx2N\n"); - m.maxNumMergeCand = cu.getInterMergeCandidates(m.absPartIdx, puIdx, m.mvFieldNeighbours, m.interDirNeighbours); + MVField candMvField[MRG_MAX_NUM_CANDS][2]; + uint8_t candDir[MRG_MAX_NUM_CANDS]; + uint32_t numMergeCand = cu.getInterMergeCandidates(pu.puAbsPartIdx, puIdx, candMvField, candDir); if (cu.isBipredRestriction()) { - /* in 8x8 CUs do not allow bidir merge candidates if not 2Nx2N */ - for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand) + /* do not allow bidir merge candidates if PU is smaller than 8x8, drop L1 reference */ + for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand) { - if (m.interDirNeighbours[mergeCand] == 3) + if (candDir[mergeCand] == 3) { - m.interDirNeighbours[mergeCand] = 1; - m.mvFieldNeighbours[mergeCand][1].refIdx = REF_NOT_VALID; + candDir[mergeCand] = 1; + candMvField[mergeCand][1].refIdx = REF_NOT_VALID; } } } @@ -1802,27 +1833,26 @@ Yuv& tempYuv = m_rqt[cuGeom.depth].tmpPredYuv; uint32_t outCost = MAX_UINT; - for (uint32_t mergeCand = 0; mergeCand < m.maxNumMergeCand; ++mergeCand) + for (uint32_t mergeCand = 0; mergeCand < numMergeCand; ++mergeCand) { /* Prevent TMVP candidates from using unavailable reference pixels */ if (m_bFrameParallel && - (m.mvFieldNeighbours[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 || - m.mvFieldNeighbours[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)) + (candMvField[mergeCand][0].mv.y >= (m_param->searchRange + 1) * 4 || + candMvField[mergeCand][1].mv.y >= (m_param->searchRange + 1) * 4)) continue; - 
cu.m_mv[0][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][0].mv; - cu.m_refIdx[0][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][0].refIdx; - cu.m_mv[1][m.absPartIdx] = m.mvFieldNeighbours[mergeCand][1].mv; - cu.m_refIdx[1][m.absPartIdx] = (int8_t)m.mvFieldNeighbours[mergeCand][1].refIdx; + cu.m_mv[0][pu.puAbsPartIdx] = candMvField[mergeCand][0].mv; + cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][0].refIdx; + cu.m_mv[1][pu.puAbsPartIdx] = candMvField[mergeCand][1].mv; + cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)candMvField[mergeCand][1].refIdx; - prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(tempYuv, true, m_me.bChromaSATD); + motionCompensation(cu, pu, tempYuv, true, m_me.bChromaSATD); - uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(m.absPartIdx), tempYuv.m_size); + uint32_t costCand = m_me.bufSATD(tempYuv.getLumaAddr(pu.puAbsPartIdx), tempYuv.m_size); if (m_me.bChromaSATD) - costCand += m_me.bufChromaSATD(tempYuv, m.absPartIdx); + costCand += m_me.bufChromaSATD(tempYuv, pu.puAbsPartIdx); - uint32_t bitsCand = getTUBits(mergeCand, m.maxNumMergeCand); + uint32_t bitsCand = getTUBits(mergeCand, numMergeCand); costCand = costCand + m_rdCost.getCost(bitsCand); if (costCand < outCost) { @@ -1832,23 +1862,75 @@ } } - m.mvField[0] = m.mvFieldNeighbours[m.index][0]; - m.mvField[1] = m.mvFieldNeighbours[m.index][1]; - m.interDir = m.interDirNeighbours[m.index]; + m.mvField[0] = candMvField[m.index][0]; + m.mvField[1] = candMvField[m.index][1]; + m.dir = candDir[m.index]; return outCost; } -/* this function assumes the caller has configured its MotionEstimation engine with the - * correct source plane and source PU, and has called prepMotionCompensation() to set - * m_puAbsPartIdx, m_puWidth, and m_puHeight */ -void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref) +void Search::PME::processTasks(int workerThreadId) +{ +#if DETAILED_CU_STATS + int fe = 
mode.cu.m_encData->m_frameEncoderID; + master.m_stats[fe].countPMETasks++; + ScopedElapsedTime pmeTime(master.m_stats[fe].pmeTime); +#endif + ProfileScopeEvent(pme); + master.processPME(*this, master.m_tld[workerThreadId].analysis); +} + +void Search::processPME(PME& pme, Search& slave) +{ + /* acquire a motion estimation job, else exit early */ + int meId; + pme.m_lock.acquire(); + if (pme.m_jobTotal > pme.m_jobAcquired) + { + meId = pme.m_jobAcquired++; + pme.m_lock.release(); + } + else + { + pme.m_lock.release(); + return; + } + + /* Setup slave Search instance for ME for master's CU */ + if (&slave != this) + { + slave.setQP(*m_slice, m_rdCost.m_qp); + slave.m_slice = m_slice; + slave.m_frame = m_frame; + + slave.m_me.setSourcePU(*pme.mode.fencYuv, pme.pu.ctuAddr, pme.pu.cuAbsPartIdx, pme.pu.puAbsPartIdx, pme.pu.width, pme.pu.height); + } + + /* Perform ME, repeat until no more work is available */ + do + { + if (meId < m_slice->m_numRefIdx[0]) + slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.pu, pme.puIdx, 0, meId); + else + slave.singleMotionEstimation(*this, pme.mode, pme.cuGeom, pme.pu, pme.puIdx, 1, meId - m_slice->m_numRefIdx[0]); + + meId = -1; + pme.m_lock.acquire(); + if (pme.m_jobTotal > pme.m_jobAcquired) + meId = pme.m_jobAcquired++; + pme.m_lock.release(); + } + while (meId >= 0); +} + +void Search::singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, const PredictionUnit& pu, + int part, int list, int ref) { uint32_t bits = master.m_listSelBits[list] + MVP_IDX_BITS; bits += getTUBits(ref, m_slice->m_numRefIdx[list]); MV mvc[(MD_ABOVE_LEFT + 1) * 2 + 1]; - int numMvc = interMode.cu.fillMvpCand(part, m_puAbsPartIdx, list, ref, interMode.amvpCand[list][ref], mvc); + int numMvc = interMode.cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc); int mvpIdx = 0; int merange = m_param->searchRange; @@ -1868,8 +1950,8 @@ interMode.cu.clipMv(mvCand); Yuv& tmpPredYuv = 
m_rqt[cuGeom.depth].tmpPredYuv; - predInterLumaPixel(tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + predInterLumaPixel(pu, tmpPredYuv, *m_slice->m_refPicList[list][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size); if (bestCost > cost) { @@ -1905,10 +1987,11 @@ } } -/* search of the best candidate for inter prediction - * returns true if predYuv was filled with a motion compensated prediction */ -bool Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D) +/* find the best inter prediction for each PU of specified mode */ +void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChromaSA8D) { + ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate); + CUData& cu = interMode.cu; Yuv* predYuv = &interMode.predYuv; @@ -1920,90 +2003,82 @@ const int* numRefIdx = slice->m_numRefIdx; uint32_t lastMode = 0; int totalmebits = 0; - bool bDistributed = m_param->bDistributeMotionEstimation && (numRefIdx[0] + numRefIdx[1]) > 2; + int numME = numRefIdx[0] + numRefIdx[1]; + bool bTryDistributed = m_param->bDistributeMotionEstimation && numME > 2; MV mvzero(0, 0); Yuv& tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv; MergeData merge; + uint32_t mrgCost; memset(&merge, 0, sizeof(merge)); for (int puIdx = 0; puIdx < numPart; puIdx++) { MotionData* bestME = interMode.bestME[puIdx]; + PredictionUnit pu(cu, cuGeom, puIdx); - /* sets m_puAbsPartIdx, m_puWidth, m_puHeight */ - initMotionCompensation(cu, cuGeom, puIdx); - - m_me.setSourcePU(*interMode.fencYuv, cu.m_cuAddr, cuGeom.encodeIdx, m_puAbsPartIdx, m_puWidth, m_puHeight); - - uint32_t mrgCost = MAX_UINT; + m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height); /* find best cost merge candidate. 
note: 2Nx2N merge and bidir are handled as separate modes */ if (cu.m_partSize[0] != SIZE_2Nx2N) { - merge.absPartIdx = m_puAbsPartIdx; - merge.width = m_puWidth; - merge.height = m_puHeight; - mrgCost = mergeEstimation(cu, cuGeom, puIdx, merge); - - if (bMergeOnly) - { - if (mrgCost == MAX_UINT) - { - /* No valid merge modes were found, there is no possible way to - * perform a valid motion compensation prediction, so early-exit */ - return false; - } - // set merge result - cu.m_mergeFlag[m_puAbsPartIdx] = true; - cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx - cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx); - cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx); + mrgCost = mergeEstimation(cu, cuGeom, pu, puIdx, merge); + + if (bMergeOnly && mrgCost != MAX_UINT) + { + cu.m_mergeFlag[pu.puAbsPartIdx] = true; + cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx + cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx); + cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx); + cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx); totalmebits += merge.bits; - prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(*predYuv, true, bChromaSA8D); + motionCompensation(cu, pu, *predYuv, true, bChromaSA8D); continue; } } + else + mrgCost = MAX_UINT; bestME[0].cost = MAX_UINT; bestME[1].cost = MAX_UINT; getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits); + bool bDoUnidir = true; + + cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours); /* Uni-directional prediction */ - 
if (m_param->analysisMode == X265_ANALYSIS_LOAD) + if (m_param->analysisMode == X265_ANALYSIS_LOAD && bestME[0].ref >= 0) { - for (int l = 0; l < numPredDir; l++) + for (int list = 0; list < numPredDir; list++) { - int ref = bestME[l].ref; - uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; - bits += getTUBits(ref, numRefIdx[l]); + int ref = bestME[list].ref; + uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS; + bits += getTUBits(ref, numRefIdx[list]); - int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc); + int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc); // Pick the best possible MVP from AMVP candidates based on least residual int mvpIdx = 0; int merange = m_param->searchRange; - if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1]) + if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1]) { uint32_t bestCost = MAX_INT; for (int i = 0; i < AMVP_NUM_CANDS; i++) { - MV mvCand = interMode.amvpCand[l][ref][i]; + MV mvCand = interMode.amvpCand[list][ref][i]; // NOTE: skip mvCand if Y is > merange and -FN>1 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) continue; cu.clipMv(mvCand); - predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + predInterLumaPixel(pu, tmpPredYuv, *slice->m_refPicList[list][ref]->m_reconPic, mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size); if (bestCost > cost) { @@ -2013,111 +2088,80 @@ } } - MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx]; + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx]; int satdCost; setSearchRange(cu, mvp, merange, mvmin, mvmax); - satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); + satdCost = 
m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); /* Get total cost of partition, but only include MV bit cost once */ bits += m_me.bitcost(outmv); uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ - checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost); - if (cost < bestME[l].cost) + if (cost < bestME[list].cost) { - bestME[l].mv = outmv; - bestME[l].mvp = mvp; - bestME[l].mvpIdx = mvpIdx; - bestME[l].cost = cost; - bestME[l].bits = bits; + bestME[list].mv = outmv; + bestME[list].mvp = mvp; + bestME[list].mvpIdx = mvpIdx; + bestME[list].cost = cost; + bestME[list].bits = bits; } } + bDoUnidir = false; } - else if (bDistributed) + else if (bTryDistributed) { - m_meLock.acquire(); - m_curInterMode = &interMode; - m_curGeom = &cuGeom; - m_curPart = puIdx; - m_totalNumME = 0; - m_numAcquiredME = 1; - m_numCompletedME = 0; - m_totalNumME = numRefIdx[0] + numRefIdx[1]; - m_meLock.release(); + PME pme(*this, interMode, cuGeom, pu, puIdx); + pme.m_jobTotal = numME; + pme.m_jobAcquired = 1; /* reserve L0-0 */ - if (!m_bJobsQueued) - JobProvider::enqueue(); - - for (int i = 1; i < m_totalNumME; i++) - m_pool->pokeIdleThread(); - - do + if (pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, numME - 1)) { - m_meLock.acquire(); - if (m_totalNumME > m_numAcquiredME) - { - int id = m_numAcquiredME++; - m_meLock.release(); - - if (id < numRefIdx[0]) - singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, id); - else - singleMotionEstimation(*this, interMode, cuGeom, puIdx, 1, id - numRefIdx[0]); - - m_meLock.acquire(); - m_numCompletedME++; - m_meLock.release(); - } - else - m_meLock.release(); - } - while (m_totalNumME > m_numAcquiredME); + processPME(pme, *this); - if (!m_bJobsQueued) - JobProvider::dequeue(); + singleMotionEstimation(*this, 
interMode, cuGeom, pu, puIdx, 0, 0); /* L0-0 */ - /* we saved L0-0 for ourselves */ - singleMotionEstimation(*this, interMode, cuGeom, puIdx, 0, 0); + bDoUnidir = false; - m_meLock.acquire(); - if (++m_numCompletedME == m_totalNumME) - m_meCompletionEvent.trigger(); - m_meLock.release(); + ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters); + pme.waitForExit(); + } - m_meCompletionEvent.wait(); + /* if no peer threads were bonded, fall back to doing unidirectional + * searches ourselves without overhead of singleMotionEstimation() */ } - else + if (bDoUnidir) { - for (int l = 0; l < numPredDir; l++) + for (int list = 0; list < numPredDir; list++) { - for (int ref = 0; ref < numRefIdx[l]; ref++) + for (int ref = 0; ref < numRefIdx[list]; ref++) { - uint32_t bits = m_listSelBits[l] + MVP_IDX_BITS; - bits += getTUBits(ref, numRefIdx[l]); + uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS; + bits += getTUBits(ref, numRefIdx[list]); - int numMvc = cu.fillMvpCand(puIdx, m_puAbsPartIdx, l, ref, interMode.amvpCand[l][ref], mvc); + int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc); // Pick the best possible MVP from AMVP candidates based on least residual int mvpIdx = 0; int merange = m_param->searchRange; - if (interMode.amvpCand[l][ref][0] != interMode.amvpCand[l][ref][1]) + if (interMode.amvpCand[list][ref][0] != interMode.amvpCand[list][ref][1]) { uint32_t bestCost = MAX_INT; for (int i = 0; i < AMVP_NUM_CANDS; i++) { - MV mvCand = interMode.amvpCand[l][ref][i]; + MV mvCand = interMode.amvpCand[list][ref][i]; // NOTE: skip mvCand if Y is > merange and -FN>1 if (m_bFrameParallel && (mvCand.y >= (merange + 1) * 4)) continue; cu.clipMv(mvCand); - predInterLumaPixel(tmpPredYuv, *slice->m_refPicList[l][ref]->m_reconPic, mvCand); - uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size); + predInterLumaPixel(pu, tmpPredYuv, *slice->m_refPicList[list][ref]->m_reconPic, 
mvCand); + uint32_t cost = m_me.bufSAD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size); if (bestCost > cost) { @@ -2127,26 +2171,26 @@ } } - MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[l][ref][mvpIdx]; + MV mvmin, mvmax, outmv, mvp = interMode.amvpCand[list][ref][mvpIdx]; setSearchRange(cu, mvp, merange, mvmin, mvmax); - int satdCost = m_me.motionEstimate(&slice->m_mref[l][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); + int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, merange, outmv); /* Get total cost of partition, but only include MV bit cost once */ bits += m_me.bitcost(outmv); uint32_t cost = (satdCost - m_me.mvcost(outmv)) + m_rdCost.getCost(bits); /* Refine MVP selection, updates: mvp, mvpIdx, bits, cost */ - checkBestMVP(interMode.amvpCand[l][ref], outmv, mvp, mvpIdx, bits, cost); + checkBestMVP(interMode.amvpCand[list][ref], outmv, mvp, mvpIdx, bits, cost); - if (cost < bestME[l].cost) + if (cost < bestME[list].cost) { - bestME[l].mv = outmv; - bestME[l].mvp = mvp; - bestME[l].mvpIdx = mvpIdx; - bestME[l].ref = ref; - bestME[l].cost = cost; - bestME[l].bits = bits; + bestME[list].mv = outmv; + bestME[list].mvp = mvp; + bestME[list].mvpIdx = mvpIdx; + bestME[list].ref = ref; + bestME[list].cost = cost; + bestME[list].bits = bits; } } } @@ -2158,7 +2202,7 @@ int bidirBits = 0; if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */ - cu.m_partSize[m_puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */ + cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */ bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT) { bidir[0] = bestME[0]; @@ -2168,16 +2212,14 @@ if (m_me.bChromaSATD) { - cu.m_mv[0][m_puAbsPartIdx] = bidir[0].mv; - cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref; - cu.m_mv[1][m_puAbsPartIdx] = bidir[1].mv; - cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref; + 
cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv; + cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref; + cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv; + cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref; + motionCompensation(cu, pu, tmpPredYuv, true, true); - prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(tmpPredYuv, true, true); - - satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) + - m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx); + satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) + + m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx); } else { @@ -2186,11 +2228,11 @@ Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv; /* Generate reference subpels */ - predInterLumaPixel(bidirYuv[0], *refPic0, bestME[0].mv); - predInterLumaPixel(bidirYuv[1], *refPic1, bestME[1].mv); + predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv); + predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv); - primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(m_puAbsPartIdx), bidirYuv[0].m_size, - bidirYuv[1].getLumaAddr(m_puAbsPartIdx), bidirYuv[1].m_size, 32); + primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size, + bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32); satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size); } @@ -2217,21 +2259,19 @@ /* coincident blocks of the two reference pictures */ if (m_me.bChromaSATD) { - cu.m_mv[0][m_puAbsPartIdx] = mvzero; - cu.m_refIdx[0][m_puAbsPartIdx] = (int8_t)bidir[0].ref; - cu.m_mv[1][m_puAbsPartIdx] = mvzero; - cu.m_refIdx[1][m_puAbsPartIdx] = (int8_t)bidir[1].ref; - - prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(tmpPredYuv, true, true); + cu.m_mv[0][pu.puAbsPartIdx] = mvzero; + cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref; + cu.m_mv[1][pu.puAbsPartIdx] = 
mvzero; + cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref; + motionCompensation(cu, pu, tmpPredYuv, true, true); - satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(m_puAbsPartIdx), tmpPredYuv.m_size) + - m_me.bufChromaSATD(tmpPredYuv, m_puAbsPartIdx); + satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) + + m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx); } else { - const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); - const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(cu.m_cuAddr, cuGeom.encodeIdx + m_puAbsPartIdx); + const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx); + const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx); intptr_t refStride = slice->m_mref[0][0].lumaStride; primitives.pu[m_me.partEnum].pixelavg_pp(tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32); @@ -2269,13 +2309,13 @@ /* select best option and store into CU */ if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost) { - cu.m_mergeFlag[m_puAbsPartIdx] = true; - cu.m_mvpIdx[0][m_puAbsPartIdx] = merge.index; // merge candidate ID is stored in L0 MVP idx - cu.setPUInterDir(merge.interDir, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, merge.mvField[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, merge.mvField[0].refIdx, m_puAbsPartIdx, puIdx); - cu.setPUMv(1, merge.mvField[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, merge.mvField[1].refIdx, m_puAbsPartIdx, puIdx); + cu.m_mergeFlag[pu.puAbsPartIdx] = true; + cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */ + cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx); + cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx); + cu.setPUMv(1, 
merge.mvField[1].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx); totalmebits += merge.bits; } @@ -2283,17 +2323,17 @@ { lastMode = 2; - cu.m_mergeFlag[m_puAbsPartIdx] = false; - cu.setPUInterDir(3, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, bidir[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[0][m_puAbsPartIdx] = bidir[0].mv - bidir[0].mvp; - cu.m_mvpIdx[0][m_puAbsPartIdx] = bidir[0].mvpIdx; - - cu.setPUMv(1, bidir[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[1][m_puAbsPartIdx] = bidir[1].mv - bidir[1].mvp; - cu.m_mvpIdx[1][m_puAbsPartIdx] = bidir[1].mvpIdx; + cu.m_mergeFlag[pu.puAbsPartIdx] = false; + cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx); + cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx); + cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp; + cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx; + + cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx); + cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp; + cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx; totalmebits += bidirBits; } @@ -2301,15 +2341,15 @@ { lastMode = 0; - cu.m_mergeFlag[m_puAbsPartIdx] = false; - cu.setPUInterDir(1, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, bestME[0].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(0, bestME[0].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[0][m_puAbsPartIdx] = bestME[0].mv - bestME[0].mvp; - cu.m_mvpIdx[0][m_puAbsPartIdx] = bestME[0].mvpIdx; + cu.m_mergeFlag[pu.puAbsPartIdx] = false; + cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx); + cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx); + cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp; + cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx; - cu.setPURefIdx(1, 
REF_NOT_VALID, m_puAbsPartIdx, puIdx); - cu.setPUMv(1, mvzero, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx); + cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx); totalmebits += bestME[0].bits; } @@ -2317,25 +2357,23 @@ { lastMode = 1; - cu.m_mergeFlag[m_puAbsPartIdx] = false; - cu.setPUInterDir(2, m_puAbsPartIdx, puIdx); - cu.setPUMv(1, bestME[1].mv, m_puAbsPartIdx, puIdx); - cu.setPURefIdx(1, bestME[1].ref, m_puAbsPartIdx, puIdx); - cu.m_mvd[1][m_puAbsPartIdx] = bestME[1].mv - bestME[1].mvp; - cu.m_mvpIdx[1][m_puAbsPartIdx] = bestME[1].mvpIdx; + cu.m_mergeFlag[pu.puAbsPartIdx] = false; + cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx); + cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx); + cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx); + cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp; + cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx; - cu.setPURefIdx(0, REF_NOT_VALID, m_puAbsPartIdx, puIdx); - cu.setPUMv(0, mvzero, m_puAbsPartIdx, puIdx); + cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx); + cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx); totalmebits += bestME[1].bits; } - prepMotionCompensation(cu, cuGeom, puIdx); - motionCompensation(*predYuv, true, bChromaSA8D); + motionCompensation(cu, pu, *predYuv, true, bChromaSA8D); } - + X265_CHECK(interMode.ok(), "inter mode is not ok"); interMode.sa8dBits += totalmebits; - return true; } void Search::getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t blockBit[3]) @@ -2472,7 +2510,7 @@ interMode.coeffBits = 0; interMode.totalBits = interMode.mvBits; if (m_rdCost.m_psyRd) - interMode.psyEnergy = m_rdCost.psyCost(cu.m_log2CUSize[0] - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); updateModeCost(interMode); m_entropyCoder.store(interMode.contexts); @@ -2482,6 +2520,8 @@ * Note: 
this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */ void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom) { + ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]); + CUData& cu = interMode.cu; Yuv* reconYuv = &interMode.reconYuv; Yuv* predYuv = &interMode.predYuv; @@ -2584,13 +2624,14 @@ bestDist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize)); bestDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize)); if (m_rdCost.m_psyRd) - interMode.psyEnergy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); + interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size); interMode.totalBits = bits; interMode.distortion = bestDist; interMode.coeffBits = coeffBits; interMode.mvBits = bits - coeffBits; updateModeCost(interMode); + checkDQP(interMode.cu, cuGeom); } void Search::residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]) @@ -2918,8 +2959,6 @@ numSig[chromaId][tuIterator.section] = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, coeffCurC + subTUOffset, log2TrSizeC, (TextType)chromaId, absPartIdxC, false); cbfFlag[chromaId][tuIterator.section] = !!numSig[chromaId][tuIterator.section]; - //Coding cbf flags has been removed from here -// m_entropyCoder.codeQtCbfChroma(cbfFlag[chromaId][tuIterator.section], tuDepth); if (cbfFlag[chromaId][tuIterator.section]) m_entropyCoder.codeCoeffNxN(cu, coeffCurC + subTUOffset, absPartIdxC, log2TrSizeC, (TextType)chromaId); uint32_t newBits = m_entropyCoder.getNumberOfWrittenBits(); @@ -3002,9 +3041,6 @@ 
uint32_t nonZeroPsyEnergyY = 0; uint64_t singleCostY = MAX_INT64; - ALIGN_VAR_32(coeff_t, tsCoeffY[MAX_TS_SIZE * MAX_TS_SIZE]); - ALIGN_VAR_32(int16_t, tsResiY[MAX_TS_SIZE * MAX_TS_SIZE]); - m_entropyCoder.load(m_rqt[depth].rqtRoot); cu.setTransformSkipSubParts(1, TEXT_LUMA, absPartIdx, depth); @@ -3014,22 +3050,22 @@ fenc = fencYuv->getLumaAddr(absPartIdx); resi = resiYuv.getLumaAddr(absPartIdx); - uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, tsCoeffY, log2TrSize, TEXT_LUMA, absPartIdx, true); + uint32_t numSigTSkipY = m_quant.transformNxN(cu, fenc, fencYuv->m_size, resi, resiYuv.m_size, m_tsCoeff, log2TrSize, TEXT_LUMA, absPartIdx, true); if (numSigTSkipY) { m_entropyCoder.resetBits(); m_entropyCoder.codeQtCbfLuma(!!numSigTSkipY, tuDepth); - m_entropyCoder.codeCoeffNxN(cu, tsCoeffY, absPartIdx, log2TrSize, TEXT_LUMA); + m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdx, log2TrSize, TEXT_LUMA); const uint32_t skipSingleBitsY = m_entropyCoder.getNumberOfWrittenBits(); - m_quant.invtransformNxN(tsResiY, trSize, tsCoeffY, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY); + m_quant.invtransformNxN(m_tsResidual, trSize, m_tsCoeff, log2TrSize, TEXT_LUMA, false, true, numSigTSkipY); - nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize); + nonZeroDistY = primitives.cu[partSize].sse_ss(resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize); if (m_rdCost.m_psyRd) { - nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, tsResiY, trSize); + nonZeroPsyEnergyY = m_rdCost.psyCost(partSize, resiYuv.getLumaAddr(absPartIdx), resiYuv.m_size, m_tsResidual, trSize); singleCostY = m_rdCost.calcPsyRdCost(nonZeroDistY, skipSingleBitsY, nonZeroPsyEnergyY); } else @@ -3045,8 +3081,8 @@ cbfFlag[TEXT_LUMA][0] = !!numSigTSkipY; bestTransformMode[TEXT_LUMA][0] = 1; uint32_t numCoeffY = 1 << (log2TrSize << 1); - 
memcpy(coeffCurY, tsCoeffY, sizeof(coeff_t) * numCoeffY); - primitives.cu[partSize].copy_ss(curResiY, strideResiY, tsResiY, trSize); + memcpy(coeffCurY, m_tsCoeff, sizeof(coeff_t) * numCoeffY); + primitives.cu[partSize].copy_ss(curResiY, strideResiY, m_tsResidual, trSize); } cu.setCbfSubParts(cbfFlag[TEXT_LUMA][0] << tuDepth, TEXT_LUMA, absPartIdx, depth); @@ -3073,9 +3109,6 @@ int16_t* curResiC = m_rqt[qtLayer].resiQtYuv.getChromaAddr(chromaId, absPartIdxC); - ALIGN_VAR_32(coeff_t, tsCoeffC[MAX_TS_SIZE * MAX_TS_SIZE]); - ALIGN_VAR_32(int16_t, tsResiC[MAX_TS_SIZE * MAX_TS_SIZE]); - cu.setTransformSkipPartRange(1, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); if (m_bEnableRDOQ && (chromaId != TEXT_CHROMA_V)) @@ -3083,7 +3116,7 @@ fenc = fencYuv->getChromaAddr(chromaId, absPartIdxC); resi = resiYuv.getChromaAddr(chromaId, absPartIdxC); - uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, tsCoeffC, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); + uint32_t numSigTSkipC = m_quant.transformNxN(cu, fenc, fencYuv->m_csize, resi, resiYuv.m_csize, m_tsCoeff, log2TrSizeC, (TextType)chromaId, absPartIdxC, true); m_entropyCoder.resetBits(); singleBits[chromaId][tuIterator.section] = 0; @@ -3091,16 +3124,16 @@ if (numSigTSkipC) { m_entropyCoder.codeQtCbfChroma(!!numSigTSkipC, tuDepth); - m_entropyCoder.codeCoeffNxN(cu, tsCoeffC, absPartIdxC, log2TrSizeC, (TextType)chromaId); + m_entropyCoder.codeCoeffNxN(cu, m_tsCoeff, absPartIdxC, log2TrSizeC, (TextType)chromaId); singleBits[chromaId][tuIterator.section] = m_entropyCoder.getNumberOfWrittenBits(); - m_quant.invtransformNxN(tsResiC, trSizeC, tsCoeffC, + m_quant.invtransformNxN(m_tsResidual, trSizeC, m_tsCoeff, log2TrSizeC, (TextType)chromaId, false, true, numSigTSkipC); - uint32_t dist = primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); + uint32_t dist = 
primitives.cu[partSizeC].sse_ss(resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC); nonZeroDistC = m_rdCost.scaleChromaDist(chromaId, dist); if (m_rdCost.m_psyRd) { - nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, tsResiC, trSizeC); + nonZeroPsyEnergyC = m_rdCost.psyCost(partSizeC, resiYuv.getChromaAddr(chromaId, absPartIdxC), resiYuv.m_csize, m_tsResidual, trSizeC); singleCostC = m_rdCost.calcPsyRdCost(nonZeroDistC, singleBits[chromaId][tuIterator.section], nonZeroPsyEnergyC); } else @@ -3116,8 +3149,8 @@ cbfFlag[chromaId][tuIterator.section] = !!numSigTSkipC; bestTransformMode[chromaId][tuIterator.section] = 1; uint32_t numCoeffC = 1 << (log2TrSizeC << 1); - memcpy(coeffCurC + subTUOffset, tsCoeffC, sizeof(coeff_t) * numCoeffC); - primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, tsResiC, trSizeC); + memcpy(coeffCurC + subTUOffset, m_tsCoeff, sizeof(coeff_t) * numCoeffC); + primitives.cu[partSizeC].copy_ss(curResiC, strideResiC, m_tsResidual, trSizeC); } cu.setCbfPartRange(cbfFlag[chromaId][tuIterator.section] << tuDepth, (TextType)chromaId, absPartIdxC, tuIterator.absPartIdxStep); @@ -3165,8 +3198,8 @@ } // In split mode, we need only coeffBits. The reason is encoding chroma cbfs is different from luma. - // In case of chroma, if any one of the splitted block's cbf is 1, then we need to encode cbf 1, and then for - // four splitted block's individual cbf value. This is not known before analysis of four splitted blocks. + // In case of chroma, if any one of the split block's cbf is 1, then we need to encode cbf 1, and then for + // four split block's individual cbf value. This is not known before analysis of four split blocks. // For that reason, I am collecting individual coefficient bits only. fullCost.bits = bSplitPresentFlag ? 
cbfBits + coeffBits : coeffBits; @@ -3196,7 +3229,7 @@ Cost splitCost; if (bSplitPresentFlag && (log2TrSize <= depthRange[1] && log2TrSize > depthRange[0])) { - // Subdiv flag can be encoded at the start of anlysis of splitted blocks. + // Subdiv flag can be encoded at the start of analysis of split blocks. m_entropyCoder.resetBits(); m_entropyCoder.codeTransformSubdivFlag(1, 5 - log2TrSize); splitCost.bits = m_entropyCoder.getNumberOfWrittenBits(); @@ -3381,13 +3414,13 @@ /* returns the number of bits required to signal a non-most-probable mode. * on return mpms contains bitmap of most probable modes */ -uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const +uint32_t Search::getIntraRemModeBits(CUData& cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const { - cu.getIntraDirLumaPredictor(absPartIdx, preds); + cu.getIntraDirLumaPredictor(absPartIdx, mpmModes); mpms = 0; for (int i = 0; i < 3; ++i) - mpms |= ((uint64_t)1 << preds[i]); + mpms |= ((uint64_t)1 << mpmModes[i]); return m_entropyCoder.bitsIntraModeNonMPM(); } @@ -3414,3 +3447,43 @@ candModeList[maxIndex] = mode; } } + +void Search::checkDQP(CUData& cu, const CUGeom& cuGeom) +{ + if (cu.m_slice->m_pps->bUseDQP && cuGeom.depth <= cu.m_slice->m_pps->maxCuDQPDepth) + { + if (cu.getQtRootCbf(0)) + { + /* When analysing RDO with DQP bits, the entropy encoder should add the cost of DQP bits here + * i.e Encode QP */ + } + else + cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); + } +} + +void Search::checkDQPForSplitPred(CUData& cu, const CUGeom& cuGeom) +{ + if ((cuGeom.depth == cu.m_slice->m_pps->maxCuDQPDepth) && cu.m_slice->m_pps->bUseDQP) + { + bool hasResidual = false; + + /* Check if any sub-CU has a non-zero QP */ + for (uint32_t blkIdx = 0; blkIdx < cuGeom.numPartitions; blkIdx++) + { + if (cu.getQtRootCbf(blkIdx)) + { + hasResidual = true; + break; + } + } + if (hasResidual) + /* TODO: Encode QP, and recalculate RD cost of 
splitPred */ + /* For all zero CBF sub-CUs, reset QP to RefQP (so that deltaQP is not signalled). + When the non-zero CBF sub-CU is found, stop */ + cu.setQPSubCUs(cu.getRefQP(0), 0, cuGeom.depth); + else + /* No residual within this CU or subCU, so reset QP to RefQP */ + cu.setQPSubParts(cu.getRefQP(0), 0, cuGeom.depth); + } +} diff -Nru x265-1.5/source/encoder/search.h x265-1.6/source/encoder/search.h --- x265-1.5/source/encoder/search.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/search.h 2015-04-02 16:46:36.000000000 +0000 @@ -28,6 +28,7 @@ #include "predict.h" #include "quant.h" #include "bitcost.h" +#include "framedata.h" #include "yuv.h" #include "threadpool.h" @@ -35,6 +36,18 @@ #include "entropy.h" #include "motion.h" +#if DETAILED_CU_STATS +#define ProfileCUScopeNamed(name, cu, acc, count) \ + m_stats[cu.m_encData->m_frameEncoderID].count++; \ + ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc) +#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count) +#define ProfileCounter(cu, count) m_stats[cu.m_encData->m_frameEncoderID].count++; +#else +#define ProfileCUScopeNamed(name, cu, acc, count) +#define ProfileCUScope(cu, acc, count) +#define ProfileCounter(cu, count) +#endif + namespace x265 { // private namespace @@ -88,6 +101,10 @@ MotionData bestME[MAX_INTER_PARTS][2]; MV amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS]; + // Neighbour MVs of the current partition. 5 spatial candidates and the + // temporal candidate. 
+ InterNeighbourMV interNeighbours[6]; + uint64_t rdCost; // sum of partition (psy) RD costs (sse(fenc, recon) + lambda2 * bits) uint64_t sa8dCost; // sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits) uint32_t sa8dBits; // signal bits used in sa8dCost calculation @@ -109,8 +126,35 @@ coeffBits = 0; } + void invalidate() + { + /* set costs to invalid data, catch uninitialized re-use */ + rdCost = UINT64_MAX / 2; + sa8dCost = UINT64_MAX / 2; + sa8dBits = MAX_UINT / 2; + psyEnergy = MAX_UINT / 2; + distortion = MAX_UINT / 2; + totalBits = MAX_UINT / 2; + mvBits = MAX_UINT / 2; + coeffBits = MAX_UINT / 2; + } + + bool ok() const + { + return !(rdCost >= UINT64_MAX / 2 || + sa8dCost >= UINT64_MAX / 2 || + sa8dBits >= MAX_UINT / 2 || + psyEnergy >= MAX_UINT / 2 || + distortion >= MAX_UINT / 2 || + totalBits >= MAX_UINT / 2 || + mvBits >= MAX_UINT / 2 || + coeffBits >= MAX_UINT / 2); + } + void addSubCosts(const Mode& subMode) { + X265_CHECK(subMode.ok(), "sub-mode not initialized"); + rdCost += subMode.rdCost; sa8dCost += subMode.sa8dCost; sa8dBits += subMode.sa8dBits; @@ -122,16 +166,89 @@ } }; +#if DETAILED_CU_STATS +/* This structure is intended for performance debugging and we make no attempt + * to handle dynamic range overflows. Care should be taken to avoid long encodes + * if you care about the accuracy of these elapsed times and counters. 
This + * profiling is orthogonal to PPA/VTune and can be enabled independently from + * either of them */ +struct CUStats +{ + int64_t intraRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in intra RDO per CU depth + int64_t interRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in inter RDO per CU depth + int64_t intraAnalysisElapsedTime; // elapsed worker time in intra sa8d analysis + int64_t motionEstimationElapsedTime; // elapsed worker time in predInterSearch() + int64_t loopFilterElapsedTime; // elapsed worker time in deblock and SAO and PSNR/SSIM + int64_t pmeTime; // elapsed worker time processing ME slave jobs + int64_t pmeBlockTime; // elapsed worker time blocked for pme batch completion + int64_t pmodeTime; // elapsed worker time processing pmode slave jobs + int64_t pmodeBlockTime; // elapsed worker time blocked for pmode batch completion + int64_t weightAnalyzeTime; // elapsed worker time analyzing reference weights + int64_t totalCTUTime; // elapsed worker time in compressCTU (includes pmode master) + + uint64_t countIntraRDO[NUM_CU_DEPTH]; + uint64_t countInterRDO[NUM_CU_DEPTH]; + uint64_t countIntraAnalysis; + uint64_t countMotionEstimate; + uint64_t countLoopFilter; + uint64_t countPMETasks; + uint64_t countPMEMasters; + uint64_t countPModeTasks; + uint64_t countPModeMasters; + uint64_t countWeightAnalyze; + uint64_t totalCTUs; + + CUStats() { clear(); } + + void clear() + { + memset(this, 0, sizeof(*this)); + } + + void accumulate(CUStats& other) + { + for (uint32_t i = 0; i <= g_maxCUDepth; i++) + { + intraRDOElapsedTime[i] += other.intraRDOElapsedTime[i]; + interRDOElapsedTime[i] += other.interRDOElapsedTime[i]; + countIntraRDO[i] += other.countIntraRDO[i]; + countInterRDO[i] += other.countInterRDO[i]; + } + + intraAnalysisElapsedTime += other.intraAnalysisElapsedTime; + motionEstimationElapsedTime += other.motionEstimationElapsedTime; + loopFilterElapsedTime += other.loopFilterElapsedTime; + pmeTime += other.pmeTime; + pmeBlockTime += 
other.pmeBlockTime; + pmodeTime += other.pmodeTime; + pmodeBlockTime += other.pmodeBlockTime; + weightAnalyzeTime += other.weightAnalyzeTime; + totalCTUTime += other.totalCTUTime; + + countIntraAnalysis += other.countIntraAnalysis; + countMotionEstimate += other.countMotionEstimate; + countLoopFilter += other.countLoopFilter; + countPMETasks += other.countPMETasks; + countPMEMasters += other.countPMEMasters; + countPModeTasks += other.countPModeTasks; + countPModeMasters += other.countPModeMasters; + countWeightAnalyze += other.countWeightAnalyze; + totalCTUs += other.totalCTUs; + + other.clear(); + } +}; +#endif + inline int getTUBits(int idx, int numIdx) { return idx + (idx < numIdx - 1); } -class Search : public JobProvider, public Predict +class Search : public Predict { public: - static const pixel zeroPixel[MAX_CU_SIZE]; static const int16_t zeroShort[MAX_CU_SIZE]; MotionEstimate m_me; @@ -147,11 +264,25 @@ uint8_t* m_qtTempCbf[3]; uint8_t* m_qtTempTransformSkipFlag[3]; + pixel* m_fencScaled; /* 32x32 buffer for down-scaled version of 64x64 CU fenc */ + pixel* m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */ + pixel* m_intraPred; /* 32x32 buffer for individual intra predictions */ + pixel* m_intraPredAngs; /* allocation for 33 consecutive (all angular) 32x32 intra predictions */ + + coeff_t* m_tsCoeff; /* transform skip coeff 32x32 */ + int16_t* m_tsResidual; /* transform skip residual 32x32 */ + pixel* m_tsRecon; /* transform skip reconstructed pixels 32x32 */ + bool m_bFrameParallel; bool m_bEnableRDOQ; uint32_t m_numLayers; uint32_t m_refLagPixels; +#if DETAILED_CU_STATS + /* Accumulate CU statistics separately for each frame encoder */ + CUStats m_stats[X265_MAX_FRAME_THREADS]; +#endif + Search(); ~Search(); @@ -162,7 +293,7 @@ void invalidateContexts(int fromDepth); // full RD search of intra modes. 
if sharedModes is not NULL, it directly uses them - void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes); + void checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes); // select best intra mode using only sa8d costs, cannot measure NxN intra void checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom); @@ -170,7 +301,7 @@ void encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom); // estimation inter prediction (non-skip) - bool predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma); + void predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bMergeOnly, bool bChroma); // encode residual and compute rd-cost for inter mode void encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom); @@ -184,21 +315,39 @@ // pick be chroma mode from available using just sa8d costs void getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom); + /* update CBF flags and QP values to be internally consistent */ + void checkDQP(CUData& cu, const CUGeom& cuGeom); + void checkDQPForSplitPred(CUData& cu, const CUGeom& cuGeom); + + class PME : public BondedTaskGroup + { + public: + + Search& master; + Mode& mode; + const CUGeom& cuGeom; + const PredictionUnit& pu; + int puIdx; + + PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {} + + void processTasks(int workerThreadId); + + protected: + + PME operator=(const PME&); + }; + + void processPME(PME& pme, Search& slave); + void singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, const PredictionUnit& pu, int part, int list, int ref); + protected: /* motion estimation distribution */ ThreadLocalData* m_tld; - Mode* m_curInterMode; - const CUGeom* m_curGeom; - int m_curPart; + uint32_t m_listSelBits[3]; - int m_totalNumME; - volatile int m_numAcquiredME; - volatile int 
m_numCompletedME; - Event m_meCompletionEvent; Lock m_meLock; - bool m_bJobsQueued; - void singleMotionEstimation(Search& master, Mode& interMode, const CUGeom& cuGeom, int part, int list, int ref); void saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth); @@ -206,7 +355,7 @@ uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes); // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned - uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom); + uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes); void codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx); void codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]); @@ -237,21 +386,11 @@ // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks void offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx); + /* output of mergeEstimation, best merge candidate */ struct MergeData { - /* merge candidate data, cached between calls to mergeEstimation */ - MVField mvFieldNeighbours[MRG_MAX_NUM_CANDS][2]; - uint8_t interDirNeighbours[MRG_MAX_NUM_CANDS]; - uint32_t maxNumMergeCand; - - /* data updated for each partition */ - uint32_t absPartIdx; - int width; - int height; - - /* outputs */ MVField mvField[2]; - uint32_t interDir; + uint32_t dir; uint32_t index; uint32_t bits; }; @@ -259,15 +398,15 @@ /* inter/ME helper functions */ void checkBestMVP(MV* amvpCand, MV cMv, MV& mvPred, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const; void setSearchRange(const CUData& cu, MV mvp, int merange, MV& mvmin, MV& mvmax) const; - uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, int partIdx, MergeData& m); - static void getBlkBits(PartSize cuMode, bool bPSlice, int partIdx, uint32_t lastMode, uint32_t 
blockBit[3]); + uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m); + static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]); /* intra helper functions */ enum { MAX_RD_INTRA_MODES = 16 }; static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList); // get most probable luma modes for CU part, and bit cost of all non mpm modes - uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t preds[3], uint64_t& mpms) const; + uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const; void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); } }; diff -Nru x265-1.5/source/encoder/slicetype.cpp x265-1.6/source/encoder/slicetype.cpp --- x265-1.5/source/encoder/slicetype.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/slicetype.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -34,11 +34,17 @@ #include "motion.h" #include "ratecontrol.h" -#define NUM_CUS (m_widthInCU > 2 && m_heightInCU > 2 ? 
(m_widthInCU - 2) * (m_heightInCU - 2) : m_widthInCU * m_heightInCU) +#if DETAILED_CU_STATS +#define ProfileLookaheadTime(elapsed, count) ScopedElapsedTime _scope(elapsed); count++ +#else +#define ProfileLookaheadTime(elapsed, count) +#endif using namespace x265; -static inline int16_t median(int16_t a, int16_t b, int16_t c) +namespace { + +inline int16_t median(int16_t a, int16_t b, int16_t c) { int16_t t = (a - b) & ((a - b) >> 31); @@ -49,55 +55,531 @@ return b; } -static inline void median_mv(MV &dst, MV a, MV b, MV c) +inline void median_mv(MV &dst, MV a, MV b, MV c) { dst.x = median(a.x, b.x, c.x); dst.y = median(a.y, b.y, c.y); } +/* Compute variance to derive AC energy of each block */ +inline uint32_t acEnergyVar(Frame *curFrame, uint64_t sum_ssd, int shift, int plane) +{ + uint32_t sum = (uint32_t)sum_ssd; + uint32_t ssd = (uint32_t)(sum_ssd >> 32); + + curFrame->m_lowres.wp_sum[plane] += sum; + curFrame->m_lowres.wp_ssd[plane] += ssd; + return ssd - ((uint64_t)sum * sum >> shift); +} + +/* Find the energy of each block in Y/Cb/Cr plane */ +inline uint32_t acEnergyPlane(Frame *curFrame, pixel* src, intptr_t srcStride, int plane, int colorFormat) +{ + if ((colorFormat != X265_CSP_I444) && plane) + { + ALIGN_VAR_8(pixel, pix[8 * 8]); + primitives.cu[BLOCK_8x8].copy_pp(pix, 8, src, srcStride); + return acEnergyVar(curFrame, primitives.cu[BLOCK_8x8].var(pix, 8), 6, plane); + } + else + return acEnergyVar(curFrame, primitives.cu[BLOCK_16x16].var(src, srcStride), 8, plane); +} + +} // end anonymous namespace + +/* Find the total AC energy of each block in all planes */ +uint32_t LookaheadTLD::acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp) +{ + intptr_t stride = curFrame->m_fencPic->m_stride; + intptr_t cStride = curFrame->m_fencPic->m_strideC; + intptr_t blockOffsetLuma = blockX + (blockY * stride); + int hShift = CHROMA_H_SHIFT(csp); + int vShift = CHROMA_V_SHIFT(csp); + intptr_t blockOffsetChroma = (blockX >> hShift) + ((blockY >> 
vShift) * cStride); + + uint32_t var; + + var = acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[0] + blockOffsetLuma, stride, 0, csp); + var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[1] + blockOffsetChroma, cStride, 1, csp); + var += acEnergyPlane(curFrame, curFrame->m_fencPic->m_picOrg[2] + blockOffsetChroma, cStride, 2, csp); + x265_emms(); + return var; +} + +void LookaheadTLD::calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param) +{ + /* Actual adaptive quantization */ + int maxCol = curFrame->m_fencPic->m_picWidth; + int maxRow = curFrame->m_fencPic->m_picHeight; + int blockWidth = ((param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int blockHeight = ((param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + int blockCount = blockWidth * blockHeight; + + for (int y = 0; y < 3; y++) + { + curFrame->m_lowres.wp_ssd[y] = 0; + curFrame->m_lowres.wp_sum[y] = 0; + } + + /* Calculate Qp offset for each 16x16 block in the frame */ + int blockXY = 0; + int blockX = 0, blockY = 0; + double strength = 0.f; + if (param->rc.aqMode == X265_AQ_NONE || param->rc.aqStrength == 0) + { + /* Need to init it anyways for CU tree */ + int cuCount = widthInCU * heightInCU; + + if (param->rc.aqMode && param->rc.aqStrength == 0) + { + memset(curFrame->m_lowres.qpCuTreeOffset, 0, cuCount * sizeof(double)); + memset(curFrame->m_lowres.qpAqOffset, 0, cuCount * sizeof(double)); + for (int cuxy = 0; cuxy < cuCount; cuxy++) + curFrame->m_lowres.invQscaleFactor[cuxy] = 256; + } + + /* Need variance data for weighted prediction */ + if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) + { + for (blockY = 0; blockY < maxRow; blockY += 16) + for (blockX = 0; blockX < maxCol; blockX += 16) + acEnergyCu(curFrame, blockX, blockY, param->internalCsp); + } + } + else + { + blockXY = 0; + double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0; + if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE) + { + double 
bit_depth_correction = pow(1 << (X265_DEPTH - 8), 0.5); + for (blockY = 0; blockY < maxRow; blockY += 16) + { + for (blockX = 0; blockX < maxCol; blockX += 16) + { + uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp); + qp_adj = pow(energy + 1, 0.1); + curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj; + avg_adj += qp_adj; + avg_adj_pow2 += qp_adj * qp_adj; + blockXY++; + } + } + + avg_adj /= blockCount; + avg_adj_pow2 /= blockCount; + strength = param->rc.aqStrength * avg_adj / bit_depth_correction; + avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (11.f * bit_depth_correction)) / avg_adj; + } + else + strength = param->rc.aqStrength * 1.0397f; + + blockXY = 0; + for (blockY = 0; blockY < maxRow; blockY += 16) + { + for (blockX = 0; blockX < maxCol; blockX += 16) + { + if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE) + { + qp_adj = curFrame->m_lowres.qpCuTreeOffset[blockXY]; + qp_adj = strength * (qp_adj - avg_adj); + } + else + { + uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp); + qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (14.427f + 2 * (X265_DEPTH - 8))); + } + curFrame->m_lowres.qpAqOffset[blockXY] = qp_adj; + curFrame->m_lowres.qpCuTreeOffset[blockXY] = qp_adj; + curFrame->m_lowres.invQscaleFactor[blockXY] = x265_exp2fix8(qp_adj); + blockXY++; + } + } + } + + if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) + { + int hShift = CHROMA_H_SHIFT(param->internalCsp); + int vShift = CHROMA_V_SHIFT(param->internalCsp); + maxCol = ((maxCol + 8) >> 4) << 4; + maxRow = ((maxRow + 8) >> 4) << 4; + int width[3] = { maxCol, maxCol >> hShift, maxCol >> hShift }; + int height[3] = { maxRow, maxRow >> vShift, maxRow >> vShift }; + + for (int i = 0; i < 3; i++) + { + uint64_t sum, ssd; + sum = curFrame->m_lowres.wp_sum[i]; + ssd = curFrame->m_lowres.wp_ssd[i]; + curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]); + } + } +} + +void 
LookaheadTLD::lowresIntraEstimate(Lowres& fenc) +{ + ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); + pixel fencIntra[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]; + pixel neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1]; + pixel* samples = neighbours[0], *filtered = neighbours[1]; + + const int lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP]; + const int intraPenalty = 5 * lookAheadLambda; + const int lowresPenalty = 4; /* fixed CU cost overhead */ + + const int cuSize = X265_LOWRES_CU_SIZE; + const int cuSize2 = cuSize << 1; + const int sizeIdx = X265_LOWRES_CU_BITS - 2; + + pixelcmp_t satd = primitives.pu[sizeIdx].satd; + int planar = !!(cuSize >= 8); + + int costEst = 0, costEstAq = 0; + + for (int cuY = 0; cuY < heightInCU; cuY++) + { + fenc.rowSatds[0][0][cuY] = 0; + + for (int cuX = 0; cuX < widthInCU; cuX++) + { + const int cuXY = cuX + cuY * widthInCU; + const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc.lumaStride; + pixel *pixCur = fenc.lowresPlane[0] + pelOffset; + + /* copy fenc pixels */ + primitives.cu[sizeIdx].copy_pp(fencIntra, cuSize, pixCur, fenc.lumaStride); + + /* collect reference sample pixels */ + pixCur -= fenc.lumaStride + 1; + memcpy(samples, pixCur, (2 * cuSize + 1) * sizeof(pixel)); /* top */ + for (int i = 1; i <= 2 * cuSize; i++) + samples[cuSize2 + i] = pixCur[i * fenc.lumaStride]; /* left */ + + primitives.cu[sizeIdx].intra_filter(samples, filtered); + + int cost, icost = me.COST_MAX; + uint32_t ilowmode = 0; + + /* DC and planar */ + primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, samples, 0, cuSize <= 16); + cost = satd(fencIntra, cuSize, prediction, cuSize); + COPY2_IF_LT(icost, cost, ilowmode, DC_IDX); + + primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, neighbours[planar], 0, 0); + cost = satd(fencIntra, cuSize, prediction, cuSize); + COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX); + + /* scan angular predictions */ + int filter, acost = me.COST_MAX; + 
uint32_t mode, alowmode = 4; + for (mode = 5; mode < 35; mode += 5) + { + filter = !!(g_intraFilterFlags[mode] & cuSize); + primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16); + cost = satd(fencIntra, cuSize, prediction, cuSize); + COPY2_IF_LT(acost, cost, alowmode, mode); + } + for (uint32_t dist = 2; dist >= 1; dist--) + { + int minusmode = alowmode - dist; + int plusmode = alowmode + dist; + + mode = minusmode; + filter = !!(g_intraFilterFlags[mode] & cuSize); + primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16); + cost = satd(fencIntra, cuSize, prediction, cuSize); + COPY2_IF_LT(acost, cost, alowmode, mode); + + mode = plusmode; + filter = !!(g_intraFilterFlags[mode] & cuSize); + primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16); + cost = satd(fencIntra, cuSize, prediction, cuSize); + COPY2_IF_LT(acost, cost, alowmode, mode); + } + COPY2_IF_LT(icost, acost, ilowmode, alowmode); + + icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ + + fenc.lowresCosts[0][0][cuXY] = (uint16_t)(X265_MIN(icost, LOWRES_COST_MASK) | (0 << LOWRES_COST_SHIFT)); + fenc.intraCost[cuXY] = icost; + fenc.intraMode[cuXY] = (uint8_t)ilowmode; + + /* do not include edge blocks in the frame cost estimates, they are not very accurate */ + const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 && + cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2; + + int icostAq = (bFrameScoreCU && fenc.invQscaleFactor) ? 
((icost * fenc.invQscaleFactor[cuXY] + 128) >> 8) : icost; + + if (bFrameScoreCU) + { + costEst += icost; + costEstAq += icostAq; + } + + fenc.rowSatds[0][0][cuY] += icostAq; + } + } + + fenc.costEst[0][0] = costEst; + fenc.costEstAq[0][0] = costEstAq; +} + +uint32_t LookaheadTLD::weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp) +{ + pixel *src = ref.fpelPlane[0]; + intptr_t stride = fenc.lumaStride; + + if (wp.bPresentFlag) + { + int offset = wp.inputOffset << (X265_DEPTH - 8); + int scale = wp.inputWeight; + int denom = wp.log2WeightDenom; + int round = denom ? 1 << (denom - 1) : 0; + int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth + int widthHeight = (int)stride; + + primitives.weight_pp(ref.buffer[0], wbuffer[0], stride, widthHeight, paddedLines, + scale, round << correction, denom + correction, offset); + src = weightedRef.fpelPlane[0]; + } + + uint32_t cost = 0; + intptr_t pixoff = 0; + int mb = 0; + + for (int y = 0; y < fenc.lines; y += 8, pixoff = y * stride) + { + for (int x = 0; x < fenc.width; x += 8, mb++, pixoff += 8) + { + int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc.fpelPlane[0] + pixoff, stride); + cost += X265_MIN(satd, fenc.intraCost[mb]); + } + } + + return cost; +} + +bool LookaheadTLD::allocWeightedRef(Lowres& fenc) +{ + intptr_t planesize = fenc.buffer[1] - fenc.buffer[0]; + intptr_t padoffset = fenc.lowresPlane[0] - fenc.buffer[0]; + paddedLines = (int)(planesize / fenc.lumaStride); + + wbuffer[0] = X265_MALLOC(pixel, 4 * planesize); + if (wbuffer[0]) + { + wbuffer[1] = wbuffer[0] + planesize; + wbuffer[2] = wbuffer[1] + planesize; + wbuffer[3] = wbuffer[2] + planesize; + } + else + return false; + + for (int i = 0; i < 4; i++) + weightedRef.lowresPlane[i] = wbuffer[i] + padoffset; + + weightedRef.fpelPlane[0] = weightedRef.lowresPlane[0]; + weightedRef.lumaStride = fenc.lumaStride; + weightedRef.isLowres = true; + weightedRef.isWeighted = false; + + return true; +} + +void 
LookaheadTLD::weightsAnalyse(Lowres& fenc, Lowres& ref) +{ + static const float epsilon = 1.f / 128.f; + int deltaIndex = fenc.frameNum - ref.frameNum; + + WeightParam wp; + wp.bPresentFlag = false; + + if (!wbuffer[0]) + { + if (!allocWeightedRef(fenc)) + return; + } + + /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ + float guessScale, fencMean, refMean; + x265_emms(); + if (fenc.wp_ssd[0] && ref.wp_ssd[0]) + guessScale = sqrtf((float)fenc.wp_ssd[0] / ref.wp_ssd[0]); + else + guessScale = 1.0f; + fencMean = (float)fenc.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8)); + refMean = (float)ref.wp_sum[0] / (fenc.lines * fenc.width) / (1 << (X265_DEPTH - 8)); + + /* Early termination */ + if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon) + return; + + int minoff = 0, minscale, mindenom; + unsigned int minscore = 0, origscore = 1; + int found = 0; + + wp.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true); + mindenom = wp.log2WeightDenom; + minscale = wp.inputWeight; + + origscore = minscore = weightCostLuma(fenc, ref, wp); + + if (!minscore) + return; + + unsigned int s = 0; + int curScale = minscale; + int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f); + if (curOffset < -128 || curOffset > 127) + { + /* Rescale considering the constraints on curOffset. We do it in this order + * because scale has a much wider range than offset (because of denom), so + * it should almost never need to be clamped. 
*/ + curOffset = x265_clip3(-128, 127, curOffset); + curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f); + curScale = x265_clip3(0, 127, curScale); + } + SET_WEIGHT(wp, true, curScale, mindenom, curOffset); + s = weightCostLuma(fenc, ref, wp); + COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1); + + /* Use a smaller denominator if possible */ + while (mindenom > 0 && !(minscale & 1)) + { + mindenom--; + minscale >>= 1; + } + + if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f) + return; + else + { + SET_WEIGHT(wp, true, minscale, mindenom, minoff); + + // set weighted delta cost + fenc.weightedCostDelta[deltaIndex] = minscore / origscore; + + int offset = wp.inputOffset << (X265_DEPTH - 8); + int scale = wp.inputWeight; + int denom = wp.log2WeightDenom; + int round = denom ? 1 << (denom - 1) : 0; + int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth + intptr_t stride = ref.lumaStride; + int widthHeight = (int)stride; + + for (int i = 0; i < 4; i++) + primitives.weight_pp(ref.buffer[i], wbuffer[i], stride, widthHeight, paddedLines, + scale, round << correction, denom + correction, offset); + + weightedRef.isWeighted = true; + } +} + Lookahead::Lookahead(x265_param *param, ThreadPool* pool) - : JobProvider(pool) - , m_est(pool) { - m_bReady = false; - m_bBusy = false; m_param = param; - m_lastKeyframe = -m_param->keyframeMax; + m_pool = pool; + m_lastNonB = NULL; - m_bFilled = false; - m_bFlushed = false; - m_bFlush = false; - m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; - m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; - m_scratch = (int*)x265_malloc(m_widthInCU * sizeof(int)); + m_scratch = NULL; + m_tld = NULL; + m_filled = false; + m_outputSignalRequired = false; + m_isActive = true; + + m_8x8Height = ((m_param->sourceHeight / 2) + 
X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + m_8x8Width = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; + m_8x8Blocks = m_8x8Width > 2 && m_8x8Height > 2 ? (m_8x8Width - 2) * (m_8x8Height - 2) : m_8x8Width * m_8x8Height; + + m_lastKeyframe = -m_param->keyframeMax; + memset(m_preframes, 0, sizeof(m_preframes)); + m_preTotal = m_preAcquired = m_preCompleted = 0; + m_sliceTypeBusy = false; + m_fullQueueSize = X265_MAX(1, m_param->lookaheadDepth); + m_bAdaptiveQuant = m_param->rc.aqMode || m_param->bEnableWeightedPred || m_param->bEnableWeightedBiPred; + + /* If we have a thread pool and are using --b-adapt 2, it is generally + * preferable to perform all motion searches for each lowres frame in large + * batched; this will create one job per --bframe per lowres frame, and + * these jobs are performed by workers bonded to the thread running + * slicetypeDecide() */ + m_bBatchMotionSearch = m_pool && m_param->bFrameAdaptive == X265_B_ADAPT_TRELLIS; + + /* It is also beneficial to pre-calculate all possible frame cost estimates + * using worker threads bonded to the worker thread running + * slicetypeDecide(). This creates bframes * bframes jobs which take less + * time than the motion search batches but there are many of them. 
This may + * do much unnecessary work, some frame cost estimates are not needed, so if + * the thread pool is small we disable this feature after the initial burst + * of work */ + m_bBatchFrameCosts = m_bBatchMotionSearch; + + if (m_param->lookaheadSlices && !m_pool) + m_param->lookaheadSlices = 0; + + if (m_param->lookaheadSlices > 1) + { + m_numRowsPerSlice = m_8x8Height / m_param->lookaheadSlices; + m_numRowsPerSlice = X265_MAX(m_numRowsPerSlice, 10); // at least 10 rows per slice + m_numRowsPerSlice = X265_MIN(m_numRowsPerSlice, m_8x8Height); // but no more than the full picture + m_numCoopSlices = m_8x8Height / m_numRowsPerSlice; + m_param->lookaheadSlices = m_numCoopSlices; // report actual final slice count + } + else + { + m_numRowsPerSlice = m_8x8Height; + m_numCoopSlices = 1; + } + +#if DETAILED_CU_STATS + m_slicetypeDecideElapsedTime = 0; + m_preLookaheadElapsedTime = 0; + m_countSlicetypeDecide = 0; + m_countPreLookahead = 0; +#endif + memset(m_histogram, 0, sizeof(m_histogram)); } -Lookahead::~Lookahead() { } - -void Lookahead::init() +#if DETAILED_CU_STATS +void Lookahead::getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount) { - if (m_pool && m_pool->getThreadCount() >= 4 && - ((m_param->bFrameAdaptive && m_param->bframes) || - m_param->rc.cuTree || m_param->scenecutThreshold || - (m_param->lookaheadDepth && m_param->rc.vbvBufferSize))) + batchElapsedTime = coopSliceElapsedTime = 0; + coopSliceCount = batchCount = 0; + int tldCount = m_pool ? m_pool->m_numWorkers : 1; + for (int i = 0; i < tldCount; i++) { - JobProvider::enqueue(); + batchElapsedTime += m_tld[i].batchElapsedTime; + coopSliceElapsedTime += m_tld[i].coopSliceElapsedTime; + batchCount += m_tld[i].countBatches; + coopSliceCount += m_tld[i].countCoopSlices; } - else - m_pool = NULL; /* disable use of worker thread */ +} +#endif + +bool Lookahead::create() +{ + int numTLD = 1 + (m_pool ? 
m_pool->m_numWorkers : 0); + m_tld = new LookaheadTLD[numTLD]; + for (int i = 0; i < numTLD; i++) + m_tld[i].init(m_8x8Width, m_8x8Height, m_8x8Blocks); + m_scratch = X265_MALLOC(int, m_tld[0].widthInCU); + + return m_tld && m_scratch; } void Lookahead::stop() { - /* do not allow slicetypeDecide() to get started again */ - m_bReady = false; - m_bFlushed = false; - m_bFlush = false; - m_bBusy = false; + if (m_pool && !m_inputQueue.empty()) + { + m_preLookaheadLock.acquire(); + m_isActive = false; + bool wait = m_outputSignalRequired = m_sliceTypeBusy; + m_preLookaheadLock.release(); - if (m_pool) - JobProvider::flush(); // flush will dequeue, if it is necessary + if (wait) + m_outputSignal.wait(); + } } void Lookahead::destroy() @@ -117,132 +599,161 @@ delete curFrame; } - x265_free(m_scratch); + X265_FREE(m_scratch); + + delete [] m_tld; } +/* The synchronization of slicetypeDecide is managed here. The findJob() method + * polls the occupancy of the input queue. If the queue is + * full, it will run slicetypeDecide() and output a mini-gop of frames to the + * output queue. If the flush() method has been called (implying no new pictures + * will be received) then the input queue is considered full if it has even one + * picture left. getDecidedPicture() removes pictures from the output queue and + * only blocks as a last resort. It does not start removing pictures until + * m_filled is true, which occurs after *more than* the lookahead depth of + * pictures have been input so slicetypeDecide() should have started prior to + * output pictures being withdrawn. 
The first slicetypeDecide() will obviously + * still require a blocking wait, but after this slicetypeDecide() will maintain + * its lead over the encoder (because one picture is added to the input queue + * each time one is removed from the output) and decides slice types of pictures + * just ahead of when the encoder needs them */ + /* Called by API thread */ -void Lookahead::addPicture(Frame *curFrame, int sliceType) +void Lookahead::addPicture(Frame& curFrame, int sliceType) { - { - ProfileScopeEvent(prelookahead); - PicYuv *orig = curFrame->m_fencPic; - curFrame->m_lowres.init(orig, curFrame->m_poc, sliceType); - } - - m_inputQueueLock.acquire(); - m_inputQueue.pushBack(*curFrame); - - if (m_inputQueue.size() >= m_param->lookaheadDepth) - { - if (m_pool) - { - m_bReady = !m_bBusy; - m_inputQueueLock.release(); - m_pool->pokeIdleThread(); - } - else - slicetypeDecide(); - } - else - m_inputQueueLock.release(); + curFrame.m_lowres.sliceType = sliceType; /* determine if the lookahead is (over) filled enough for frames to begin to * be consumed by frame encoders */ - if (!m_bFilled) + if (!m_filled) { if (!m_param->bframes & !m_param->lookaheadDepth) - m_bFilled = true; /* zero-latency */ - else if (curFrame->m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes) - m_bFilled = true; /* full capacity plus mini-gop lag */ + m_filled = true; /* zero-latency */ + else if (curFrame.m_poc >= m_param->lookaheadDepth + 2 + m_param->bframes) + m_filled = true; /* full capacity plus mini-gop lag */ } + + m_preLookaheadLock.acquire(); + + m_inputLock.acquire(); + m_inputQueue.pushBack(curFrame); + m_inputLock.release(); + + m_preframes[m_preTotal++] = &curFrame; + X265_CHECK(m_preTotal <= X265_LOOKAHEAD_MAX, "prelookahead overflow\n"); + + m_preLookaheadLock.release(); + + if (m_pool) + tryWakeOne(); } /* Called by API thread */ void Lookahead::flush() { - m_bFlush = true; - m_bFilled = true; - - /* just in case the input queue is never allowed to fill */ - 
m_inputQueueLock.acquire(); - if (m_inputQueue.empty()) - { - m_bFlushed = true; - m_inputQueueLock.release(); - } - else - { - if (m_pool) - { - m_bReady = !m_bBusy; - m_inputQueueLock.release(); - m_pool->pokeIdleThread(); - } - else - slicetypeDecide(); - } + /* force slicetypeDecide to run until the input queue is empty */ + m_fullQueueSize = 1; + m_filled = true; } -/* Called by API thread. If the lookahead queue has not yet been filled the - * first time, it immediately returns NULL. Else the function blocks until - * outputs are available and then pops the first frame from the output queue. If - * flush() has been called and the output queue is empty, NULL is returned. */ -Frame* Lookahead::getDecidedPicture() +void Lookahead::findJob(int workerThreadID) { - if (!m_bFilled) - return NULL; + Frame* preFrame; + bool doDecide; - m_outputQueueLock.acquire(); - Frame *fenc = m_outputQueue.popFront(); - m_outputQueueLock.release(); + if (!m_isActive) + return; - if (fenc || m_bFlushed) - return fenc; + int tld = workerThreadID; + if (workerThreadID < 0) + tld = m_pool ? 
m_pool->m_numWorkers : 0; + m_preLookaheadLock.acquire(); do { - m_outputAvailable.wait(); + preFrame = NULL; + doDecide = false; - m_outputQueueLock.acquire(); - fenc = m_outputQueue.popFront(); - m_outputQueueLock.release(); - } - while (!fenc); + if (m_preTotal > m_preAcquired) + preFrame = m_preframes[m_preAcquired++]; + else + { + if (m_preTotal == m_preCompleted) + m_preAcquired = m_preTotal = m_preCompleted = 0; - return fenc; + /* the worker thread that performs the last pre-lookahead will generally get to run + * slicetypeDecide() */ + m_inputLock.acquire(); + if (!m_sliceTypeBusy && !m_preTotal && m_inputQueue.size() >= m_fullQueueSize && m_isActive) + doDecide = m_sliceTypeBusy = true; + else + m_helpWanted = false; + m_inputLock.release(); + } + m_preLookaheadLock.release(); + + if (preFrame) + { + ProfileLookaheadTime(m_preLookaheadElapsedTime, m_countPreLookahead); + ProfileScopeEvent(prelookahead); + + preFrame->m_lowres.init(preFrame->m_fencPic, preFrame->m_poc); + if (m_param->rc.bStatRead && m_param->rc.cuTree && IS_REFERENCED(preFrame)) + /* cu-tree offsets were read from stats file */; + else if (m_bAdaptiveQuant) + m_tld[tld].calcAdaptiveQuantFrame(preFrame, m_param); + m_tld[tld].lowresIntraEstimate(preFrame->m_lowres); + + m_preLookaheadLock.acquire(); /* re-acquire for next pass */ + m_preCompleted++; + } + else if (doDecide) + { + ProfileLookaheadTime(m_slicetypeDecideElapsedTime, m_countSlicetypeDecide); + ProfileScopeEvent(slicetypeDecideEV); + + slicetypeDecide(); + + m_preLookaheadLock.acquire(); /* re-acquire for next pass */ + if (m_outputSignalRequired) + { + m_outputSignal.trigger(); + m_outputSignalRequired = false; + } + m_sliceTypeBusy = false; + } + } + while (preFrame || doDecide); } -/* Called by pool worker threads */ -bool Lookahead::findJob(int) +/* Called by API thread */ +Frame* Lookahead::getDecidedPicture() { - if (!m_bReady) - return false; - - m_inputQueueLock.acquire(); - if (!m_bReady) + if (m_filled) { - 
m_inputQueueLock.release(); - return false; - } + m_outputLock.acquire(); + Frame *out = m_outputQueue.popFront(); + m_outputLock.release(); - m_bReady = false; - m_bBusy = true; + if (out) + return out; - do - { - slicetypeDecide(); // releases input queue lock + /* process all pending pre-lookahead frames and run slicetypeDecide() if + * necessary */ + findJob(-1); - m_inputQueueLock.acquire(); + m_preLookaheadLock.acquire(); + bool wait = m_outputSignalRequired = m_sliceTypeBusy || m_preTotal; + m_preLookaheadLock.release(); - if (!m_bBusy) - break; - } - while (m_inputQueue.size() >= m_param->lookaheadDepth || - (m_bFlush && m_inputQueue.size())); + if (wait) + m_outputSignal.wait(); - m_bBusy = false; - m_inputQueueLock.release(); - return true; + return m_outputQueue.popFront(); + } + else + return NULL; } /* Called by rate-control to calculate the estimated SATD cost for a given @@ -284,6 +795,8 @@ return; } + X265_CHECK(curFrame->m_lowres.costEst[b - p0][p1 - b] > 0, "Slice cost not estimated\n") + if (m_param->rc.cuTree && !m_param->rc.bStatRead) /* update row satds based on cutree offsets */ curFrame->m_lowres.satdCost = frameCostRecalculate(frames, p0, p1, b); @@ -299,7 +812,7 @@ uint32_t lowresRow = 0, lowresCol = 0, lowresCuIdx = 0, sum = 0; uint32_t scale = m_param->maxCUSize / (2 * X265_LOWRES_CU_SIZE); uint32_t numCuInHeight = (m_param->sourceHeight + g_maxCUSize - 1) / g_maxCUSize; - uint32_t widthInLowresCu = (uint32_t)m_widthInCU, heightInLowresCu = (uint32_t)m_heightInCU; + uint32_t widthInLowresCu = (uint32_t)m_8x8Width, heightInLowresCu = (uint32_t)m_8x8Height; double *qp_offset = 0; /* Factor in qpoffsets based on Aq/Cutree in CU costs */ if (m_param->rc.aqMode) @@ -333,8 +846,6 @@ /* called by API thread or worker thread with inputQueueLock acquired */ void Lookahead::slicetypeDecide() { - ProfileScopeEvent(slicetypeDecideEV); - Lowres *frames[X265_LOOKAHEAD_MAX]; Frame *list[X265_LOOKAHEAD_MAX]; int maxSearch = 
X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); @@ -342,6 +853,7 @@ memset(frames, 0, sizeof(frames)); memset(list, 0, sizeof(list)); { + ScopedLock lock(m_inputLock); Frame *curFrame = m_inputQueue.first(); int j; for (j = 0; j < m_param->bframes + 2; j++) @@ -357,16 +869,12 @@ { if (!curFrame) break; frames[j + 1] = &curFrame->m_lowres; + X265_CHECK(curFrame->m_lowres.costEst[0][0] > 0, "prelookahead not completed for input picture\n"); curFrame = curFrame->m_next; } maxSearch = j; - } - - m_inputQueueLock.release(); - - if (!m_est.m_rows && list[0]) - m_est.init(m_param, list[0]); + } if (m_lastNonB && !m_param->rc.bStatRead && ((m_param->bFrameAdaptive && m_param->bframes) || @@ -389,7 +897,7 @@ } /* pyramid with multiple B-refs needs a big enough dpb that the preceding P-frame stays available. - smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it.*/ + * smaller dpb could be supported by smart enough use of mmco, but it's easier just to forbid it. */ else if (frm.sliceType == X265_TYPE_BREF && m_param->bBPyramid && brefs && m_param->maxNumReferences <= (brefs + 3)) { @@ -398,7 +906,7 @@ frm.sliceType, m_param->maxNumReferences); } - if ( /*(!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) + if (/* (!param->intraRefresh || frm.frameNum == 0) && */ frm.frameNum - m_lastKeyframe >= m_param->keyframeMax) { if (frm.sliceType == X265_TYPE_AUTO || frm.sliceType == X265_TYPE_I) frm.sliceType = m_param->bOpenGOP && m_lastKeyframe >= 0 ? X265_TYPE_I : X265_TYPE_IDR; @@ -474,7 +982,10 @@ /* estimate new non-B cost */ p1 = b = bframes + 1; p0 = (IS_X265_TYPE_I(frames[bframes + 1]->sliceType)) ? 
b : 0; - m_est.estimateFrameCost(frames, p0, p1, b, 0); + + CostEstimateGroup estGroup(*this, frames); + + estGroup.singleCost(p0, p1, b); if (bframes) { @@ -487,7 +998,7 @@ else p1 = bframes + 1; - m_est.estimateFrameCost(frames, p0, p1, b, 0); + estGroup.singleCost(p0, p1, b); if (frames[b]->sliceType == X265_TYPE_BREF) p0 = b; @@ -495,8 +1006,7 @@ } } - m_inputQueueLock.acquire(); - + m_inputLock.acquire(); /* dequeue all frames from inputQueue that are about to be enqueued * in the output queue. The order is important because Frame can * only be in one list at a time */ @@ -508,10 +1018,9 @@ pts[i] = curFrame->m_pts; maxSearch--; } + m_inputLock.release(); - m_inputQueueLock.release(); - - m_outputQueueLock.acquire(); + m_outputLock.acquire(); /* add non-B to output queue */ int idx = 0; list[bframes]->m_reorderedPts = pts[idx++]; @@ -533,7 +1042,7 @@ /* add B frames to output queue */ for (int i = 0; i < bframes; i++) { - /* push all the B frames into output queue except B-ref, which already pushed into output queue*/ + /* push all the B frames into output queue except B-ref, which already pushed into output queue */ if (list[i]->m_lowres.sliceType != X265_TYPE_BREF) { list[i]->m_reorderedPts = pts[idx++]; @@ -544,7 +1053,7 @@ bool isKeyFrameAnalyse = (m_param->rc.cuTree || (m_param->rc.vbvBufferSize && m_param->lookaheadDepth)) && !m_param->rc.bStatRead; if (isKeyFrameAnalyse && IS_X265_TYPE_I(m_lastNonB->sliceType)) { - m_inputQueueLock.acquire(); + m_inputLock.acquire(); Frame *curFrame = m_inputQueue.first(); frames[0] = m_lastNonB; int j; @@ -553,14 +1062,12 @@ frames[j + 1] = &curFrame->m_lowres; curFrame = curFrame->m_next; } + m_inputLock.release(); frames[j + 1] = NULL; - m_inputQueueLock.release(); slicetypeAnalyse(frames, true); } - - m_outputQueueLock.release(); - m_outputAvailable.trigger(); + m_outputLock.release(); } void Lookahead::vbvLookahead(Lowres **frames, int numFrames, int keyframe) @@ -570,7 +1077,9 @@ curNonB++; int nextNonB = keyframe 
? prevNonB : curNonB; int nextB = prevNonB + 1; - int nextBRef = 0; + int nextBRef = 0, curBRef = 0; + if (m_param->bBPyramid && curNonB - prevNonB > 1) + curBRef = (prevNonB + curNonB + 1) / 2; int miniGopEnd = keyframe ? prevNonB : curNonB; while (curNonB < numFrames + !keyframe) { @@ -580,6 +1089,7 @@ int p0 = IS_X265_TYPE_I(frames[curNonB]->sliceType) ? curNonB : prevNonB; frames[nextNonB]->plannedSatd[idx] = vbvFrameCost(frames, p0, curNonB, curNonB); frames[nextNonB]->plannedType[idx] = frames[curNonB]->sliceType; + /* Save the nextNonB Cost in each B frame of the current miniGop */ if (curNonB > miniGopEnd) { @@ -587,18 +1097,19 @@ { frames[j]->plannedSatd[frames[j]->indB] = frames[nextNonB]->plannedSatd[idx]; frames[j]->plannedType[frames[j]->indB++] = frames[nextNonB]->plannedType[idx]; - } } idx++; } + /* Handle the B-frames: coded order */ if (m_param->bBPyramid && curNonB - prevNonB > 1) nextBRef = (prevNonB + curNonB + 1) / 2; for (int i = prevNonB + 1; i < curNonB; i++, idx++) { - int64_t satdCost = 0; int type = X265_TYPE_B; + int64_t satdCost = 0; + int type = X265_TYPE_B; if (nextBRef) { if (i == nextBRef) @@ -612,19 +1123,19 @@ satdCost = vbvFrameCost(frames, nextBRef, curNonB, i); } else - satdCost = vbvFrameCost(frames, prevNonB, nextNonB, i); + satdCost = vbvFrameCost(frames, prevNonB, curNonB, i); frames[nextNonB]->plannedSatd[idx] = satdCost; frames[nextNonB]->plannedType[idx] = type; /* Save the nextB Cost in each B frame of the current miniGop */ for (int j = nextB; j < miniGopEnd; j++) { - if (nextBRef && i == nextBRef) + if (curBRef && curBRef == i) break; if (j >= i && j !=nextBRef) continue; frames[j]->plannedSatd[frames[j]->indB] = satdCost; - frames[j]->plannedType[frames[j]->indB++] = X265_TYPE_B; + frames[j]->plannedType[frames[j]->indB++] = type; } } prevNonB = curNonB; @@ -638,7 +1149,8 @@ int64_t Lookahead::vbvFrameCost(Lowres **frames, int p0, int p1, int b) { - int64_t cost = m_est.estimateFrameCost(frames, p0, p1, b, 0); + 
CostEstimateGroup estGroup(*this, frames); + int64_t cost = estGroup.singleCost(p0, p1, b); if (m_param->rc.aqMode) { @@ -647,6 +1159,7 @@ else return frames[b]->costEstAq[b - p0][p1 - b]; } + return cost; } @@ -654,7 +1167,7 @@ { int numFrames, origNumFrames, keyintLimit, framecnt; int maxSearch = X265_MIN(m_param->lookaheadDepth, X265_LOOKAHEAD_MAX); - int cuCount = NUM_CUS; + int cuCount = m_8x8Blocks; int resetStart; bool bIsVbvLookahead = m_param->rc.vbvBufferSize && m_param->lookaheadDepth; @@ -688,6 +1201,76 @@ return; } + if (m_bBatchMotionSearch) + { + /* pre-calculate all motion searches, using many worker threads */ + CostEstimateGroup estGroup(*this, frames); + for (int b = 2; b < numFrames; b++) + { + for (int i = 1; i <= m_param->bframes + 1; i++) + { + int p0 = b - i; + if (p0 < 0) + continue; + + /* Skip search if already done */ + if (frames[b]->lowresMvs[0][i - 1][0].x != 0x7FFF) + continue; + + /* perform search to p1 at same distance, if possible */ + int p1 = b + i; + if (p1 >= numFrames || frames[b]->lowresMvs[1][i - 1][0].x != 0x7FFF) + p1 = b; + + estGroup.add(p0, p1, b); + } + } + /* auto-disable after the first batch if pool is small */ + m_bBatchMotionSearch &= m_pool->m_numWorkers >= 4; + estGroup.finishBatch(); + + if (m_bBatchFrameCosts) + { + /* pre-calculate all frame cost estimates, using many worker threads */ + for (int b = 2; b < numFrames; b++) + { + for (int i = 1; i <= m_param->bframes + 1; i++) + { + if (b < i) + continue; + + /* only measure frame cost in this pass if motion searches + * are already done */ + if (frames[b]->lowresMvs[0][i - 1][0].x == 0x7FFF) + continue; + + int p0 = b - i; + + for (int j = 0; j <= m_param->bframes; j++) + { + int p1 = b + j; + if (p1 >= numFrames) + break; + + /* ensure P1 search is done */ + if (j && frames[b]->lowresMvs[1][j - 1][0].x == 0x7FFF) + continue; + + /* ensure frame cost is not done */ + if (frames[b]->costEst[i][j] >= 0) + continue; + + estGroup.add(p0, p1, b); + } + } + } + + 
/* auto-disable after the first batch if the pool is not large */ + m_bBatchFrameCosts &= m_pool->m_numWorkers > 12; + estGroup.finishBatch(); + } + } + int numBFrames = 0; int numAnalyzed = numFrames; if (m_param->scenecutThreshold && scenecut(frames, 0, 1, true, origNumFrames, maxSearch)) @@ -705,29 +1288,27 @@ char best_paths[X265_BFRAME_MAX + 1][X265_LOOKAHEAD_MAX + 1] = { "", "P" }; int best_path_index = numFrames % (X265_BFRAME_MAX + 1); - /* Perform the frametype analysis. */ + /* Perform the frame type analysis. */ for (int j = 2; j <= numFrames; j++) - { slicetypePath(frames, j, best_paths); - } numBFrames = (int)strspn(best_paths[best_path_index], "B"); /* Load the results of the analysis into the frame types. */ for (int j = 1; j < numFrames; j++) - { frames[j]->sliceType = best_paths[best_path_index][j - 1] == 'B' ? X265_TYPE_B : X265_TYPE_P; - } } frames[numFrames]->sliceType = X265_TYPE_P; } else if (m_param->bFrameAdaptive == X265_B_ADAPT_FAST) { + CostEstimateGroup estGroup(*this, frames); + int64_t cost1p0, cost2p0, cost1b1, cost2p1; for (int i = 0; i <= numFrames - 2; ) { - cost2p1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 2, 1); + cost2p1 = estGroup.singleCost(i + 0, i + 2, i + 2, true); if (frames[i + 2]->intraMbs[2] > cuCount / 2) { frames[i + 1]->sliceType = X265_TYPE_P; @@ -736,9 +1317,9 @@ continue; } - cost1b1 = m_est.estimateFrameCost(frames, i + 0, i + 2, i + 1, 0); - cost1p0 = m_est.estimateFrameCost(frames, i + 0, i + 1, i + 1, 0); - cost2p0 = m_est.estimateFrameCost(frames, i + 1, i + 2, i + 2, 0); + cost1b1 = estGroup.singleCost(i + 0, i + 2, i + 1); + cost1p0 = estGroup.singleCost(i + 0, i + 1, i + 1); + cost2p0 = estGroup.singleCost(i + 1, i + 2, i + 2); if (cost1p0 + cost2p0 < cost1b1 + cost2p1) { @@ -756,7 +1337,7 @@ for (j = i + 2; j <= X265_MIN(i + m_param->bframes, numFrames - 1); j++) { int64_t pthresh = X265_MAX(INTER_THRESH - P_SENS_BIAS * (j - i - 1), INTER_THRESH / 10); - int64_t pcost = 
m_est.estimateFrameCost(frames, i + 0, j + 1, j + 1, 1); + int64_t pcost = estGroup.singleCost(i + 0, j + 1, j + 1, true); if (pcost > pthresh * cuCount || frames[j + 1]->intraMbs[j - i + 1] > cuCount / 3) break; frames[j]->sliceType = X265_TYPE_B; @@ -768,20 +1349,17 @@ frames[numFrames]->sliceType = X265_TYPE_P; numBFrames = 0; while (numBFrames < numFrames && frames[numBFrames + 1]->sliceType == X265_TYPE_B) - { numBFrames++; - } } else { numBFrames = X265_MIN(numFrames - 1, m_param->bframes); for (int j = 1; j < numFrames; j++) - { frames[j]->sliceType = (j % (numBFrames + 1)) ? X265_TYPE_B : X265_TYPE_P; - } frames[numFrames]->sliceType = X265_TYPE_P; } + /* Check scenecut on the first minigop. */ for (int j = 1; j < numBFrames + 1; j++) { @@ -798,9 +1376,7 @@ else { for (int j = 1; j <= numFrames; j++) - { frames[j]->sliceType = X265_TYPE_P; - } resetStart = bKeyframe ? 1 : 2; } @@ -818,11 +1394,9 @@ if (bIsVbvLookahead) vbvLookahead(frames, numFrames, bKeyframe); - /* Restore frametypes for all frames that haven't actually been decided yet. */ + /* Restore frame types for all frames that haven't actually been decided yet. */ for (int j = resetStart; j <= numFrames; j++) - { frames[j]->sliceType = X265_TYPE_AUTO; - } } bool Lookahead::scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch) @@ -847,9 +1421,7 @@ if (!scenecutInternal(frames, p0, cp1, false)) /* Any frame in between p0 and cur_p1 cannot be a real scenecut. 
*/ for (int i = cp1; i > p0; i--) - { frames[i]->bScenecut = false; - } } /* Where A-F are scenes: AAAAABBCCDDEEFFFFFF @@ -875,7 +1447,8 @@ { Lowres *frame = frames[p1]; - m_est.estimateFrameCost(frames, p0, p1, p1, 0); + CostEstimateGroup estGroup(*this, frames); + estGroup.singleCost(p0, p1, p1); int64_t icost = frame->costEst[0][0]; int64_t pcost = frame->costEst[p1 - p0][0]; @@ -904,7 +1477,7 @@ if (res && bRealScenecut) { int imb = frame->intraMbs[p1 - p0]; - int pmb = NUM_CUS - imb; + int pmb = m_8x8Blocks - imb; x265_log(m_param, X265_LOG_DEBUG, "scene cut at %d Icost:%d Pcost:%d ratio:%.4f bias:%.4f gop:%d (imb:%d pmb:%d)\n", frame->frameNum, icost, pcost, 1. - (double)pcost / icost, bias, gopSize, imb, pmb); } @@ -946,18 +1519,19 @@ int loc = 1; int cur_p = 0; + CostEstimateGroup estGroup(*this, frames); + path--; /* Since the 1st path element is really the second frame */ while (path[loc]) { int next_p = loc; /* Find the location of the next P-frame. */ while (path[next_p] != 'P') - { next_p++; - } /* Add the cost of the P-frame found above */ - cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_p, 0); + cost += estGroup.singleCost(cur_p, next_p, next_p); + /* Early terminate if the cost we have found is larger than the best path cost so far */ if (cost > threshold) break; @@ -965,23 +1539,18 @@ if (m_param->bBPyramid && next_p - cur_p > 2) { int middle = cur_p + (next_p - cur_p) / 2; - cost += m_est.estimateFrameCost(frames, cur_p, next_p, middle, 0); + cost += estGroup.singleCost(cur_p, next_p, middle); + for (int next_b = loc; next_b < middle && cost < threshold; next_b++) - { - cost += m_est.estimateFrameCost(frames, cur_p, middle, next_b, 0); - } + cost += estGroup.singleCost(cur_p, middle, next_b); for (int next_b = middle + 1; next_b < next_p && cost < threshold; next_b++) - { - cost += m_est.estimateFrameCost(frames, middle, next_p, next_b, 0); - } + cost += estGroup.singleCost(middle, next_p, next_b); } else { for (int next_b = loc; 
next_b < next_p && cost < threshold; next_b++) - { - cost += m_est.estimateFrameCost(frames, cur_p, next_p, next_b, 0); - } + cost += estGroup.singleCost(cur_p, next_p, next_b); } loc = next_p + 1; @@ -1005,10 +1574,7 @@ double averageDuration = totalDuration / (numframes + 1); int i = numframes; - int cuCount = m_widthInCU * m_heightInCU; - - if (bIntra) - m_est.estimateFrameCost(frames, 0, 0, 0, 0); + int cuCount = m_8x8Width * m_8x8Height; while (i > 0 && frames[i]->sliceType == X265_TYPE_B) i--; @@ -1036,6 +1602,8 @@ memset(frames[lastnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); } + CostEstimateGroup estGroup(*this, frames); + while (i-- > idx) { curnonb = i; @@ -1045,13 +1613,14 @@ if (curnonb < idx) break; - m_est.estimateFrameCost(frames, curnonb, lastnonb, lastnonb, 0); + estGroup.singleCost(curnonb, lastnonb, lastnonb); + memset(frames[curnonb]->propagateCost, 0, cuCount * sizeof(uint16_t)); bframes = lastnonb - curnonb - 1; if (m_param->bBPyramid && bframes > 1) { int middle = (bframes + 1) / 2 + curnonb; - m_est.estimateFrameCost(frames, curnonb, lastnonb, middle, 0); + estGroup.singleCost(curnonb, lastnonb, middle); memset(frames[middle]->propagateCost, 0, cuCount * sizeof(uint16_t)); while (i > curnonb) { @@ -1059,7 +1628,7 @@ int p1 = i < middle ? 
middle : lastnonb; if (i != middle) { - m_est.estimateFrameCost(frames, p0, p1, i, 0); + estGroup.singleCost(p0, p1, i); estimateCUPropagate(frames, averageDuration, p0, p1, i, 0); } i--; @@ -1071,7 +1640,7 @@ { while (i > curnonb) { - m_est.estimateFrameCost(frames, curnonb, lastnonb, i, 0); + estGroup.singleCost(curnonb, lastnonb, i); estimateCUPropagate(frames, averageDuration, curnonb, lastnonb, i, 0); i--; } @@ -1082,7 +1651,7 @@ if (!m_param->lookaheadDepth) { - m_est.estimateFrameCost(frames, 0, lastnonb, lastnonb, 0); + estGroup.singleCost(0, lastnonb, lastnonb); estimateCUPropagate(frames, averageDuration, 0, lastnonb, lastnonb, 1); std::swap(frames[lastnonb]->propagateCost, frames[0]->propagateCost); } @@ -1097,31 +1666,32 @@ uint16_t *refCosts[2] = { frames[p0]->propagateCost, frames[p1]->propagateCost }; int32_t distScaleFactor = (((b - p0) << 8) + ((p1 - p0) >> 1)) / (p1 - p0); int32_t bipredWeight = m_param->bEnableWeightedBiPred ? 64 - (distScaleFactor >> 2) : 32; - MV *mvs[2] = { frames[b]->lowresMvs[0][b - p0 - 1], frames[b]->lowresMvs[1][p1 - b - 1] }; int32_t bipredWeights[2] = { bipredWeight, 64 - bipredWeight }; + int listDist[2] = { b - p0 - 1, p1 - b - 1 }; - memset(m_scratch, 0, m_widthInCU * sizeof(int)); + memset(m_scratch, 0, m_8x8Width * sizeof(int)); uint16_t *propagateCost = frames[b]->propagateCost; x265_emms(); double fpsFactor = CLIP_DURATION((double)m_param->fpsDenom / m_param->fpsNum) / CLIP_DURATION(averageDuration); - /* For non-refferd frames the source costs are always zero, so just memset one row and re-use it. */ + /* For non-referred frames the source costs are always zero, so just memset one row and re-use it. 
*/ if (!referenced) - memset(frames[b]->propagateCost, 0, m_widthInCU * sizeof(uint16_t)); + memset(frames[b]->propagateCost, 0, m_8x8Width * sizeof(uint16_t)); - int32_t StrideInCU = m_widthInCU; - for (uint16_t blocky = 0; blocky < m_heightInCU; blocky++) + int32_t strideInCU = m_8x8Width; + for (uint16_t blocky = 0; blocky < m_8x8Height; blocky++) { - int cuIndex = blocky * StrideInCU; + int cuIndex = blocky * strideInCU; primitives.propagateCost(m_scratch, propagateCost, frames[b]->intraCost + cuIndex, frames[b]->lowresCosts[b - p0][p1 - b] + cuIndex, - frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_widthInCU); + frames[b]->invQscaleFactor + cuIndex, &fpsFactor, m_8x8Width); if (referenced) - propagateCost += m_widthInCU; - for (uint16_t blockx = 0; blockx < m_widthInCU; blockx++, cuIndex++) + propagateCost += m_8x8Width; + + for (uint16_t blockx = 0; blockx < m_8x8Width; blockx++, cuIndex++) { int32_t propagate_amount = m_scratch[blockx]; /* Don't propagate for an intra block. */ @@ -1140,21 +1710,23 @@ if (lists_used == 3) listamount = (listamount * bipredWeights[list] + 32) >> 6; + MV *mvs = frames[b]->lowresMvs[list][listDist[list]]; + /* Early termination for simple case of mv0. */ - if (!mvs[list][cuIndex].word) + if (!mvs[cuIndex].word) { CLIP_ADD(refCosts[list][cuIndex], listamount); continue; } - int32_t x = mvs[list][cuIndex].x; - int32_t y = mvs[list][cuIndex].y; + int32_t x = mvs[cuIndex].x; + int32_t y = mvs[cuIndex].y; int32_t cux = (x >> 5) + blockx; int32_t cuy = (y >> 5) + blocky; - int32_t idx0 = cux + cuy * StrideInCU; + int32_t idx0 = cux + cuy * strideInCU; int32_t idx1 = idx0 + 1; - int32_t idx2 = idx0 + StrideInCU; - int32_t idx3 = idx0 + StrideInCU + 1; + int32_t idx2 = idx0 + strideInCU; + int32_t idx3 = idx0 + strideInCU + 1; x &= 31; y &= 31; int32_t idx0weight = (32 - y) * (32 - x); @@ -1164,7 +1736,7 @@ /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't * be counted. 
*/ - if (cux < m_widthInCU - 1 && cuy < m_heightInCU - 1 && cux >= 0 && cuy >= 0) + if (cux < m_8x8Width - 1 && cuy < m_8x8Height - 1 && cux >= 0 && cuy >= 0) { CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); @@ -1173,13 +1745,13 @@ } else /* Check offsets individually */ { - if (cux < m_widthInCU && cuy < m_heightInCU && cux >= 0 && cuy >= 0) + if (cux < m_8x8Width && cuy < m_8x8Height && cux >= 0 && cuy >= 0) CLIP_ADD(refCosts[list][idx0], (listamount * idx0weight + 512) >> 10); - if (cux + 1 < m_widthInCU && cuy < m_heightInCU && cux + 1 >= 0 && cuy >= 0) + if (cux + 1 < m_8x8Width && cuy < m_8x8Height && cux + 1 >= 0 && cuy >= 0) CLIP_ADD(refCosts[list][idx1], (listamount * idx1weight + 512) >> 10); - if (cux < m_widthInCU && cuy + 1 < m_heightInCU && cux >= 0 && cuy + 1 >= 0) + if (cux < m_8x8Width && cuy + 1 < m_8x8Height && cux >= 0 && cuy + 1 >= 0) CLIP_ADD(refCosts[list][idx2], (listamount * idx2weight + 512) >> 10); - if (cux + 1 < m_widthInCU && cuy + 1 < m_heightInCU && cux + 1 >= 0 && cuy + 1 >= 0) + if (cux + 1 < m_8x8Width && cuy + 1 < m_8x8Height && cux + 1 >= 0 && cuy + 1 >= 0) CLIP_ADD(refCosts[list][idx3], (listamount * idx3weight + 512) >> 10); } } @@ -1200,10 +1772,10 @@ if (ref0Distance && frame->weightedCostDelta[ref0Distance - 1] > 0) weightdelta = (1.0 - frame->weightedCostDelta[ref0Distance - 1]); - /* Allow the strength to be adjusted via qcompress, since the two - * concepts are very similar. */ + /* Allow the strength to be adjusted via qcompress, since the two concepts + * are very similar. */ - int cuCount = m_widthInCU * m_heightInCU; + int cuCount = m_8x8Width * m_8x8Height; double strength = 5.0 * (1.0 - m_param->rc.qCompress); for (int cuIndex = 0; cuIndex < cuCount; cuIndex++) @@ -1222,24 +1794,27 @@ * re-running lookahead. 
*/ int64_t Lookahead::frameCostRecalculate(Lowres** frames, int p0, int p1, int b) { + if (frames[b]->sliceType == X265_TYPE_B) + return frames[b]->costEstAq[b - p0][p1 - b]; + int64_t score = 0; int *rowSatd = frames[b]->rowSatds[b - p0][p1 - b]; - double *qp_offset = (frames[b]->sliceType == X265_TYPE_B) ? frames[b]->qpAqOffset : frames[b]->qpCuTreeOffset; + double *qp_offset = frames[b]->qpCuTreeOffset; x265_emms(); - for (int cuy = m_heightInCU - 1; cuy >= 0; cuy--) + for (int cuy = m_8x8Height - 1; cuy >= 0; cuy--) { rowSatd[cuy] = 0; - for (int cux = m_widthInCU - 1; cux >= 0; cux--) + for (int cux = m_8x8Width - 1; cux >= 0; cux--) { - int cuxy = cux + cuy * m_widthInCU; + int cuxy = cux + cuy * m_8x8Width; int cuCost = frames[b]->lowresCosts[b - p0][p1 - b][cuxy] & LOWRES_COST_MASK; double qp_adj = qp_offset[cuxy]; cuCost = (cuCost * x265_exp2fix8(qp_adj) + 128) >> 8; rowSatd[cuy] += cuCost; - if ((cuy > 0 && cuy < m_heightInCU - 1 && - cux > 0 && cux < m_widthInCU - 1) || - m_widthInCU <= 2 || m_heightInCU <= 2) + if ((cuy > 0 && cuy < m_8x8Height - 1 && + cux > 0 && cux < m_8x8Width - 1) || + m_8x8Width <= 2 || m_8x8Height <= 2) { score += cuCost; } @@ -1249,542 +1824,307 @@ return score; } -CostEstimate::CostEstimate(ThreadPool *p) - : WaveFront(p) + +int64_t CostEstimateGroup::singleCost(int p0, int p1, int b, bool intraPenalty) { - m_param = NULL; - m_curframes = NULL; - m_wbuffer[0] = m_wbuffer[1] = m_wbuffer[2] = m_wbuffer[3] = 0; - m_rows = NULL; - m_paddedLines = m_widthInCU = m_heightInCU = 0; - m_bDoSearch[0] = m_bDoSearch[1] = false; - m_curb = m_curp0 = m_curp1 = 0; - m_bFrameCompleted = false; + LookaheadTLD& tld = m_lookahead.m_tld[m_lookahead.m_pool ? 
m_lookahead.m_pool->m_numWorkers : 0]; + return estimateFrameCost(tld, p0, p1, b, intraPenalty); } -CostEstimate::~CostEstimate() +void CostEstimateGroup::add(int p0, int p1, int b) { - for (int i = 0; i < 4; i++) - { - x265_free(m_wbuffer[i]); - } + X265_CHECK(m_batchMode || !m_jobTotal, "single CostEstimateGroup instance cannot mix batch modes\n"); + m_batchMode = true; - delete[] m_rows; + Estimate& e = m_estimates[m_jobTotal++]; + e.p0 = p0; + e.p1 = p1; + e.b = b; + + if (m_jobTotal == MAX_BATCH_SIZE) + finishBatch(); } -void CostEstimate::init(x265_param *_param, Frame *curFrame) +void CostEstimateGroup::finishBatch() { - m_param = _param; - m_widthInCU = ((m_param->sourceWidth / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; - m_heightInCU = ((m_param->sourceHeight / 2) + X265_LOWRES_CU_SIZE - 1) >> X265_LOWRES_CU_BITS; - - m_rows = new EstimateRow[m_heightInCU]; - for (int i = 0; i < m_heightInCU; i++) - { - m_rows[i].m_widthInCU = m_widthInCU; - m_rows[i].m_heightInCU = m_heightInCU; - m_rows[i].m_param = m_param; - } + if (m_lookahead.m_pool) + tryBondPeers(*m_lookahead.m_pool, m_jobTotal); + processTasks(-1); + waitForExit(); + m_jobTotal = m_jobAcquired = 0; +} - if (WaveFront::init(m_heightInCU)) - WaveFront::enableAllRows(); - else - m_pool = NULL; +void CostEstimateGroup::processTasks(int workerThreadID) +{ + ThreadPool* pool = m_lookahead.m_pool; + int id = workerThreadID; + if (workerThreadID < 0) + id = pool ? 
pool->m_numWorkers : 0; + LookaheadTLD& tld = m_lookahead.m_tld[id]; - if (m_param->bEnableWeightedPred) + m_lock.acquire(); + while (m_jobAcquired < m_jobTotal) { - PicYuv *orig = curFrame->m_fencPic; - m_paddedLines = curFrame->m_lowres.lines + 2 * orig->m_lumaMarginY; - intptr_t padoffset = curFrame->m_lowres.lumaStride * orig->m_lumaMarginY + orig->m_lumaMarginX; + int i = m_jobAcquired++; + m_lock.release(); - /* allocate weighted lowres buffers */ - for (int i = 0; i < 4; i++) + if (m_batchMode) + { + ProfileLookaheadTime(tld.batchElapsedTime, tld.countBatches); + ProfileScopeEvent(estCostSingle); + + Estimate& e = m_estimates[i]; + estimateFrameCost(tld, e.p0, e.p1, e.b, false); + } + else { - m_wbuffer[i] = (pixel*)x265_malloc(sizeof(pixel) * (curFrame->m_lowres.lumaStride * m_paddedLines)); - m_weightedRef.lowresPlane[i] = m_wbuffer[i] + padoffset; + ProfileLookaheadTime(tld.coopSliceElapsedTime, tld.countCoopSlices); + ProfileScopeEvent(estCostCoop); + + X265_CHECK(i < MAX_COOP_SLICES, "impossible number of coop slices\n"); + + int firstY = m_lookahead.m_numRowsPerSlice * i; + int lastY = (i == m_jobTotal - 1) ? 
m_lookahead.m_8x8Height - 1 : m_lookahead.m_numRowsPerSlice * (i + 1) - 1; + + bool lastRow = true; + for (int cuY = lastY; cuY >= firstY; cuY--) + { + m_frames[m_coop.b]->rowSatds[m_coop.b - m_coop.p0][m_coop.p1 - m_coop.b][cuY] = 0; + + for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--) + estimateCUCost(tld, cuX, cuY, m_coop.p0, m_coop.p1, m_coop.b, m_coop.bDoSearch, lastRow, i); + + lastRow = false; + } } - m_weightedRef.fpelPlane[0] = m_weightedRef.lowresPlane[0]; - m_weightedRef.lumaStride = curFrame->m_lowres.lumaStride; - m_weightedRef.isLowres = true; - m_weightedRef.isWeighted = false; + m_lock.acquire(); } + m_lock.release(); } -int64_t CostEstimate::estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty) +int64_t CostEstimateGroup::estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool bIntraPenalty) { - int64_t score = 0; - Lowres *fenc = frames[b]; + Lowres* fenc = m_frames[b]; + x265_param* param = m_lookahead.m_param; + int64_t score = 0; if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1) score = fenc->costEst[b - p0][p1 - b]; else { - m_weightedRef.isWeighted = false; - if (m_param->bEnableWeightedPred && b == p1 && b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF) - { - if (!fenc->bIntraCalculated) - estimateFrameCost(frames, b, b, b, 0); - weightsAnalyse(frames, b, p0); - } + X265_CHECK(p0 != b, "I frame estimates should always be pre-calculated\n"); - /* For each list, check to see whether we have lowres motion-searched this reference */ - m_bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF; - m_bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF; + bool bDoSearch[2]; + bDoSearch[0] = p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF; + bDoSearch[1] = p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF; + +#if CHECKED_BUILD + X265_CHECK(!(p0 < b && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFE), "motion search batch 
duplication L0\n"); + X265_CHECK(!(p1 > b && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFE), "motion search batch duplication L1\n"); + if (bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0x7FFE; + if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0x7FFE; +#endif + + tld.weightedRef.isWeighted = false; + if (param->bEnableWeightedPred && bDoSearch[0]) + tld.weightsAnalyse(*m_frames[b], *m_frames[p0]); - if (m_bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0; - if (m_bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0; - - m_curb = b; - m_curp0 = p0; - m_curp1 = p1; - m_curframes = frames; fenc->costEst[b - p0][p1 - b] = 0; fenc->costEstAq[b - p0][p1 - b] = 0; - for (int i = 0; i < m_heightInCU; i++) + if (!m_batchMode && m_lookahead.m_numCoopSlices > 1 && ((p1 > b) || bDoSearch[0] || bDoSearch[1])) { - m_rows[i].init(); - if (!fenc->bIntraCalculated) - fenc->rowSatds[0][0][i] = 0; - fenc->rowSatds[b - p0][p1 - b][i] = 0; - } + /* Use cooperative mode if a thread pool is available and the cost estimate is + * going to need motion searches or bidir measurements */ - m_bFrameCompleted = false; + memset(&m_slice, 0, sizeof(Slice) * m_lookahead.m_numCoopSlices); - if (m_pool) - { - WaveFront::enqueue(); + m_lock.acquire(); + X265_CHECK(!m_batchMode, "single CostEstimateGroup instance cannot mix batch modes\n"); + m_coop.p0 = p0; + m_coop.p1 = p1; + m_coop.b = b; + m_coop.bDoSearch[0] = bDoSearch[0]; + m_coop.bDoSearch[1] = bDoSearch[1]; + m_jobTotal = m_lookahead.m_numCoopSlices; + m_jobAcquired = 0; + m_lock.release(); + + tryBondPeers(*m_lookahead.m_pool, m_jobTotal); - // enableAllRows must be already called - enqueueRow(0); - while (!m_bFrameCompleted) - WaveFront::findJob(-1); + processTasks(-1); - WaveFront::dequeue(); + waitForExit(); + + for (int i = 0; i < m_lookahead.m_numCoopSlices; i++) + { + fenc->costEst[b - p0][p1 - b] += m_slice[i].costEst; + fenc->costEstAq[b - p0][p1 - b] += m_slice[i].costEstAq; + if (p1 == b) + fenc->intraMbs[b - p0] 
+= m_slice[i].intraMbs; + } } else { - for (int row = 0; row < m_heightInCU; row++) - processRow(row, -1); + bool lastRow = true; + for (int cuY = m_lookahead.m_8x8Height - 1; cuY >= 0; cuY--) + { + fenc->rowSatds[b - p0][p1 - b][cuY] = 0; - x265_emms(); - } + for (int cuX = m_lookahead.m_8x8Width - 1; cuX >= 0; cuX--) + estimateCUCost(tld, cuX, cuY, p0, p1, b, bDoSearch, lastRow, -1); - // Accumulate cost from each row - for (int row = 0; row < m_heightInCU; row++) - { - score += m_rows[row].m_costEst; - fenc->costEst[0][0] += m_rows[row].m_costIntra; - if (m_param->rc.aqMode) - { - fenc->costEstAq[0][0] += m_rows[row].m_costIntraAq; - fenc->costEstAq[b - p0][p1 - b] += m_rows[row].m_costEstAq; + lastRow = false; } - fenc->intraMbs[b - p0] += m_rows[row].m_intraMbs; } - fenc->bIntraCalculated = true; + score = fenc->costEst[b - p0][p1 - b]; if (b != p1) - score = (uint64_t)score * 100 / (130 + m_param->bFrameBias); - if (b != p0 || b != p1) //Not Intra cost - fenc->costEst[b - p0][p1 - b] = score; + score = score * 100 / (130 + param->bFrameBias); + + fenc->costEst[b - p0][p1 - b] = score; } if (bIntraPenalty) - { // arbitrary penalty for I-blocks after B-frames - int ncu = NUM_CUS; - score += (uint64_t)score * fenc->intraMbs[b - p0] / (ncu * 8); - } - return score; -} - -uint32_t CostEstimate::weightCostLuma(Lowres **frames, int b, int p0, WeightParam *wp) -{ - Lowres *fenc = frames[b]; - Lowres *ref = frames[p0]; - pixel *src = ref->fpelPlane[0]; - intptr_t stride = fenc->lumaStride; - - if (wp) - { - int offset = wp->inputOffset << (X265_DEPTH - 8); - int scale = wp->inputWeight; - int denom = wp->log2WeightDenom; - int round = denom ? 
1 << (denom - 1) : 0; - int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth - int widthHeight = (int)stride; - - primitives.weight_pp(ref->buffer[0], m_wbuffer[0], stride, widthHeight, m_paddedLines, - scale, round << correction, denom + correction, offset); - src = m_weightedRef.fpelPlane[0]; - } - - uint32_t cost = 0; - intptr_t pixoff = 0; - int mb = 0; - - for (int y = 0; y < fenc->lines; y += 8, pixoff = y * stride) - { - for (int x = 0; x < fenc->width; x += 8, mb++, pixoff += 8) - { - int satd = primitives.pu[LUMA_8x8].satd(src + pixoff, stride, fenc->fpelPlane[0] + pixoff, stride); - cost += X265_MIN(satd, fenc->intraCost[mb]); - } - } - - return cost; -} - -void CostEstimate::weightsAnalyse(Lowres **frames, int b, int p0) -{ - static const float epsilon = 1.f / 128.f; - Lowres *fenc, *ref; - - fenc = frames[b]; - ref = frames[p0]; - int deltaIndex = fenc->frameNum - ref->frameNum; - - /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */ - float guessScale, fencMean, refMean; - x265_emms(); - if (fenc->wp_ssd[0] && ref->wp_ssd[0]) - guessScale = sqrtf((float)fenc->wp_ssd[0] / ref->wp_ssd[0]); - else - guessScale = 1.0f; - fencMean = (float)fenc->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); - refMean = (float)ref->wp_sum[0] / (fenc->lines * fenc->width) / (1 << (X265_DEPTH - 8)); - - /* Early termination */ - if (fabsf(refMean - fencMean) < 0.5f && fabsf(1.f - guessScale) < epsilon) - return; - - int minoff = 0, minscale, mindenom; - unsigned int minscore = 0, origscore = 1; - int found = 0; - - m_w.setFromWeightAndOffset((int)(guessScale * 128 + 0.5f), 0, 7, true); - mindenom = m_w.log2WeightDenom; - minscale = m_w.inputWeight; - - origscore = minscore = weightCostLuma(frames, b, p0, NULL); - - if (!minscore) - return; - - unsigned int s = 0; - int curScale = minscale; - int curOffset = (int)(fencMean - refMean * curScale / (1 << mindenom) + 0.5f); - if (curOffset < 
-128 || curOffset > 127) - { - /* Rescale considering the constraints on curOffset. We do it in this order - * because scale has a much wider range than offset (because of denom), so - * it should almost never need to be clamped. */ - curOffset = x265_clip3(-128, 127, curOffset); - curScale = (int)((1 << mindenom) * (fencMean - curOffset) / refMean + 0.5f); - curScale = x265_clip3(0, 127, curScale); - } - SET_WEIGHT(m_w, 1, curScale, mindenom, curOffset); - s = weightCostLuma(frames, b, p0, &m_w); - COPY4_IF_LT(minscore, s, minscale, curScale, minoff, curOffset, found, 1); - - /* Use a smaller denominator if possible */ - while (mindenom > 0 && !(minscale & 1)) - { - mindenom--; - minscale >>= 1; - } - - if (!found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f) - return; - else - { - SET_WEIGHT(m_w, 1, minscale, mindenom, minoff); - // set weighted delta cost - fenc->weightedCostDelta[deltaIndex] = minscore / origscore; - - int offset = m_w.inputOffset << (X265_DEPTH - 8); - int scale = m_w.inputWeight; - int denom = m_w.log2WeightDenom; - int round = denom ? 1 << (denom - 1) : 0; - int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth - intptr_t stride = ref->lumaStride; - int widthHeight = (int)stride; - - for (int i = 0; i < 4; i++) - primitives.weight_pp(ref->buffer[i], m_wbuffer[i], stride, widthHeight, m_paddedLines, - scale, round << correction, denom + correction, offset); - - m_weightedRef.isWeighted = true; - } -} - -void CostEstimate::processRow(int row, int /*threadId*/) -{ - ProfileScopeEvent(costEstimateRow); - - int realrow = m_heightInCU - 1 - row; - Lowres **frames = m_curframes; - ReferencePlanes *wfref0 = m_weightedRef.isWeighted ? &m_weightedRef : frames[m_curp0]; - - /* Lowres lookahead goes backwards because the MVs are used as - * predictors in the main encode. This considerably improves MV - * prediction overall. 
*/ - for (int i = m_widthInCU - 1 - m_rows[row].m_completed; i >= 0; i--) - { - // TODO: use lowres MVs as motion candidates in full-res search - m_rows[row].estimateCUCost(frames, wfref0, i, realrow, m_curp0, m_curp1, m_curb, m_bDoSearch); - m_rows[row].m_completed++; - - if (m_rows[row].m_completed >= 2 && row < m_heightInCU - 1) - { - ScopedLock below(m_rows[row + 1].m_lock); - if (m_rows[row + 1].m_active == false && - m_rows[row + 1].m_completed + 2 <= m_rows[row].m_completed) - { - m_rows[row + 1].m_active = true; - enqueueRow(row + 1); - } - } - - ScopedLock self(m_rows[row].m_lock); - if (row > 0 && (int32_t)m_rows[row].m_completed < m_widthInCU - 1 && - m_rows[row - 1].m_completed < m_rows[row].m_completed + 2) - { - m_rows[row].m_active = false; - return; - } - } + score += score * fenc->intraMbs[b - p0] / (tld.ncu * 8); - if (row == m_heightInCU - 1) - m_bFrameCompleted = true; + return score; } -void EstimateRow::init() +void CostEstimateGroup::estimateCUCost(LookaheadTLD& tld, int cuX, int cuY, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice) { - m_costEst = 0; - m_costEstAq = 0; - m_costIntra = 0; - m_costIntraAq = 0; - m_intraMbs = 0; - m_active = false; - m_completed = 0; -} + Lowres *fref0 = m_frames[p0]; + Lowres *fref1 = m_frames[p1]; + Lowres *fenc = m_frames[b]; -void EstimateRow::estimateCUCost(Lowres **frames, ReferencePlanes *wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]) -{ - Lowres *fref1 = frames[p1]; - Lowres *fenc = frames[b]; + ReferencePlanes *wfref0 = tld.weightedRef.isWeighted ? &tld.weightedRef : fref0; + const int widthInCU = m_lookahead.m_8x8Width; + const int heightInCU = m_lookahead.m_8x8Height; const int bBidir = (b < p1); - const int cuXY = cux + cuy * m_widthInCU; + const int cuXY = cuX + cuY * widthInCU; const int cuSize = X265_LOWRES_CU_SIZE; - const intptr_t pelOffset = cuSize * cux + cuSize * cuy * fenc->lumaStride; - - // should this CU's cost contribute to the frame cost? 
- const bool bFrameScoreCU = (cux > 0 && cux < m_widthInCU - 1 && - cuy > 0 && cuy < m_heightInCU - 1) || m_widthInCU <= 2 || m_heightInCU <= 2; + const intptr_t pelOffset = cuSize * cuX + cuSize * cuY * fenc->lumaStride; - m_me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize); + if (bBidir || bDoSearch[0] || bDoSearch[1]) + tld.me.setSourcePU(fenc->lowresPlane[0], fenc->lumaStride, pelOffset, cuSize, cuSize); /* A small, arbitrary bias to avoid VBV problems caused by zero-residual lookahead blocks. */ int lowresPenalty = 4; - - MV(*fenc_mvs[2]) = { &fenc->lowresMvs[0][b - p0 - 1][cuXY], - &fenc->lowresMvs[1][p1 - b - 1][cuXY] }; - int(*fenc_costs[2]) = { &fenc->lowresMvCosts[0][b - p0 - 1][cuXY], - &fenc->lowresMvCosts[1][p1 - b - 1][cuXY] }; + int listDist[2] = { b - p0 - 1, p1 - b - 1 }; MV mvmin, mvmax; - int bcost = m_me.COST_MAX; + int bcost = tld.me.COST_MAX; int listused = 0; // establish search bounds that don't cross extended frame boundaries - mvmin.x = (int16_t)(-cux * cuSize - 8); - mvmin.y = (int16_t)(-cuy * cuSize - 8); - mvmax.x = (int16_t)((m_widthInCU - cux - 1) * cuSize + 8); - mvmax.y = (int16_t)((m_heightInCU - cuy - 1) * cuSize + 8); + mvmin.x = (int16_t)(-cuX * cuSize - 8); + mvmin.y = (int16_t)(-cuY * cuSize - 8); + mvmax.x = (int16_t)((widthInCU - cuX - 1) * cuSize + 8); + mvmax.y = (int16_t)((heightInCU - cuY - 1) * cuSize + 8); - if (p0 != p1) + for (int i = 0; i < 1 + bBidir; i++) { - for (int i = 0; i < 1 + bBidir; i++) - { - if (!bDoSearch[i]) - { - /* Use previously calculated cost */ - COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); - continue; - } - int numc = 0; - MV mvc[4], mvp; - MV *fenc_mv = fenc_mvs[i]; - - /* Reverse-order MV prediction. 
*/ - mvc[0] = 0; - mvc[2] = 0; -#define MVC(mv) mvc[numc++] = mv; - if (cux < m_widthInCU - 1) - MVC(fenc_mv[1]); - if (cuy < m_heightInCU - 1) - { - MVC(fenc_mv[m_widthInCU]); - if (cux > 0) - MVC(fenc_mv[m_widthInCU - 1]); - if (cux < m_widthInCU - 1) - MVC(fenc_mv[m_widthInCU + 1]); - } -#undef MVC - if (numc <= 1) - mvp = mvc[0]; - else - { - median_mv(mvp, mvc[0], mvc[1], mvc[2]); - } + int& fencCost = fenc->lowresMvCosts[i][listDist[i]][cuXY]; - *fenc_costs[i] = m_me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, m_merange, *fenc_mvs[i]); - COPY2_IF_LT(bcost, *fenc_costs[i], listused, i + 1); - } - if (bBidir) + if (!bDoSearch[i]) { - ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); - ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); - intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; - pixel *src0 = wfref0->lowresMC(pelOffset, *fenc_mvs[0], subpelbuf0, stride0); - pixel *src1 = fref1->lowresMC(pelOffset, *fenc_mvs[1], subpelbuf1, stride1); - - ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); - primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); - int bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); - COPY2_IF_LT(bcost, bicost, listused, 3); - - // Try 0,0 candidates - src0 = wfref0->lowresPlane[0] + pelOffset; - src1 = fref1->lowresPlane[0] + pelOffset; - primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, wfref0->lumaStride, src1, fref1->lumaStride, 32); - bicost = primitives.pu[LUMA_8x8].satd(fenc->lowresPlane[0] + pelOffset, fenc->lumaStride, ref, X265_LOWRES_CU_SIZE); - COPY2_IF_LT(bcost, bicost, listused, 3); + COPY2_IF_LT(bcost, fencCost, listused, i + 1); + continue; } - } - - if (!fenc->bIntraCalculated) - { - ALIGN_VAR_32(pixel, prediction[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); - pixel 
neighbours[2][X265_LOWRES_CU_SIZE * 4 + 1]; - const int sizeIdx = X265_LOWRES_CU_BITS - 2; // partition size - const int cuSize2 = cuSize << 1; - - pixel *pixCur = fenc->lowresPlane[0] + pelOffset; - // Copy Above - memcpy(neighbours[0], pixCur - 1 - fenc->lumaStride, (cuSize + 1) * sizeof(pixel)); - - // Copy Left - for (int i = 1; i < cuSize + 1; i++) - neighbours[0][i + cuSize2] = pixCur[-1 - fenc->lumaStride + i * fenc->lumaStride]; - - for (int i = 0; i < cuSize; i++) - { - // Copy above-last pixel - neighbours[0][i + cuSize + 1] = neighbours[0][cuSize]; //neighbours[0][i + 9] = neighbours[0][8] - // Copy left-last pixel - neighbours[0][i + cuSize2 + cuSize + 1] = neighbours[0][cuSize2 + cuSize]; //neighbours[0][i + 25] = neighbours[0][24] - } + int numc = 0; + MV mvc[4], mvp; - // Filter neighbour pixels with [1-2-1] - neighbours[1][0] = neighbours[0][0]; // Copy top-left pixel - neighbours[1][cuSize2] = neighbours[0][cuSize2]; //Copy top-right pixel - neighbours[1][cuSize2 << 1] = neighbours[0][cuSize2 << 1]; // Bottom-left pixel + MV* fencMV = &fenc->lowresMvs[i][listDist[i]][cuXY]; - neighbours[1][1] = (neighbours[0][0] + (neighbours[0][1] << 1) + neighbours[0][2] + 2) >> 2; - neighbours[1][cuSize2 + 1] = (neighbours[0][0] + (neighbours[0][cuSize2 + 1] << 1) + neighbours[0][cuSize2 + 1 + 1] + 2) >> 2; - for (int i = 2; i < cuSize2; i++) - { - neighbours[1][i] = (neighbours[0][i - 1] + (neighbours[0][i] << 1) + neighbours[0][i + 1] + 2) >> 2; - neighbours[1][cuSize2 + i] = (neighbours[0][cuSize2 + i - 1] + (neighbours[0][cuSize2 + i] << 1) + neighbours[0][cuSize2 + i + 1] + 2) >> 2; + /* Reverse-order MV prediction */ + mvc[0] = 0; + mvc[2] = 0; +#define MVC(mv) mvc[numc++] = mv; + if (cuX < widthInCU - 1) + MVC(fencMV[1]); + if (!lastRow) + { + MVC(fencMV[widthInCU]); + if (cuX > 0) + MVC(fencMV[widthInCU - 1]); + if (cuX < widthInCU - 1) + MVC(fencMV[widthInCU + 1]); } +#undef MVC + if (numc <= 1) + mvp = mvc[0]; + else + median_mv(mvp, mvc[0], mvc[1], 
mvc[2]); - int icost = m_me.COST_MAX, ilowmode; - primitives.cu[sizeIdx].intra_pred[DC_IDX](prediction, cuSize, neighbours[0], 0, (cuSize <= 16)); - int cost = m_me.bufSATD(prediction, cuSize); - COPY2_IF_LT(icost, cost, ilowmode, DC_IDX); - - pixel *planar = (cuSize >= 8) ? neighbours[1] : neighbours[0]; - primitives.cu[sizeIdx].intra_pred[PLANAR_IDX](prediction, cuSize, planar, 0, 0); - cost = m_me.bufSATD(prediction, cuSize); - COPY2_IF_LT(icost, cost, ilowmode, PLANAR_IDX); + fencCost = tld.me.motionEstimate(i ? fref1 : wfref0, mvmin, mvmax, mvp, numc, mvc, s_merange, *fencMV); + COPY2_IF_LT(bcost, fencCost, listused, i + 1); + } - uint32_t mode, lowmode = 4; - int acost = m_me.COST_MAX, filter; - for (mode = 5; mode < 35; mode += 5) - { - filter = !!(g_intraFilterFlags[mode] & cuSize); - primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16); - cost = m_me.bufSATD(prediction, cuSize); - COPY2_IF_LT(acost, cost, lowmode, mode); - } - for (uint32_t dist = 2; dist >= 1; dist--) - { - int minusmode = lowmode - dist; - int plusmode = lowmode + dist; + if (bBidir) /* B, also consider bidir */ + { + /* NOTE: the wfref0 (weightp) is not used for BIDIR */ - mode = minusmode; - filter = !!(g_intraFilterFlags[mode] & cuSize); - primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16); - cost = m_me.bufSATD(prediction, cuSize); - COPY2_IF_LT(acost, cost, lowmode, mode); + /* avg(l0-mv, l1-mv) candidate */ + ALIGN_VAR_32(pixel, subpelbuf0[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); + ALIGN_VAR_32(pixel, subpelbuf1[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); + intptr_t stride0 = X265_LOWRES_CU_SIZE, stride1 = X265_LOWRES_CU_SIZE; + pixel *src0 = fref0->lowresMC(pelOffset, fenc->lowresMvs[0][listDist[0]][cuXY], subpelbuf0, stride0); + pixel *src1 = fref1->lowresMC(pelOffset, fenc->lowresMvs[1][listDist[1]][cuXY], subpelbuf1, stride1); - mode = plusmode; - filter = 
!!(g_intraFilterFlags[mode] & cuSize); - primitives.cu[sizeIdx].intra_pred[mode](prediction, cuSize, neighbours[filter], mode, cuSize <= 16); - cost = m_me.bufSATD(prediction, cuSize); - COPY2_IF_LT(acost, cost, lowmode, mode); - } - COPY2_IF_LT(icost, acost, ilowmode, lowmode); + ALIGN_VAR_32(pixel, ref[X265_LOWRES_CU_SIZE * X265_LOWRES_CU_SIZE]); + primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, stride0, src1, stride1, 32); + int bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE); + COPY2_IF_LT(bcost, bicost, listused, 3); - const int intraPenalty = 5 * m_lookAheadLambda; - icost += intraPenalty + lowresPenalty; /* estimate intra signal cost */ - fenc->intraCost[cuXY] = icost; - fenc->intraMode[cuXY] = (uint8_t)ilowmode; + /* coloc candidate */ + src0 = fref0->lowresPlane[0] + pelOffset; + src1 = fref1->lowresPlane[0] + pelOffset; + primitives.pu[LUMA_8x8].pixelavg_pp(ref, X265_LOWRES_CU_SIZE, src0, fref0->lumaStride, src1, fref1->lumaStride, 32); + bicost = tld.me.bufSATD(ref, X265_LOWRES_CU_SIZE); + COPY2_IF_LT(bcost, bicost, listused, 3); - int icostAq = icost; - if (bFrameScoreCU) - { - m_costIntra += icost; - if (fenc->invQscaleFactor) - { - icostAq = (icost * fenc->invQscaleFactor[cuXY] + 128) >> 8; - m_costIntraAq += icostAq; - } - } - fenc->rowSatds[0][0][cuy] += icostAq; + bcost += lowresPenalty; } - bcost += lowresPenalty; - if (!bBidir) + else /* P, also consider intra */ { + bcost += lowresPenalty; + if (fenc->intraCost[cuXY] < bcost) { - if (bFrameScoreCU) m_intraMbs++; bcost = fenc->intraCost[cuXY]; listused = 0; } } - /* For I frames these costs were accumulated earlier */ - if (p0 != p1) + /* do not include edge blocks in the frame cost estimates, they are not very accurate */ + const bool bFrameScoreCU = (cuX > 0 && cuX < widthInCU - 1 && + cuY > 0 && cuY < heightInCU - 1) || widthInCU <= 2 || heightInCU <= 2; + + int bcostAq = (bFrameScoreCU && fenc->invQscaleFactor) ? 
((bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8) : bcost; + + if (bFrameScoreCU) { - int bcostAq = bcost; - if (bFrameScoreCU) + if (slice < 0) { - m_costEst += bcost; - if (fenc->invQscaleFactor) - { - bcostAq = (bcost * fenc->invQscaleFactor[cuXY] + 128) >> 8; - m_costEstAq += bcostAq; - } + fenc->costEst[b - p0][p1 - b] += bcost; + fenc->costEstAq[b - p0][p1 - b] += bcostAq; + if (!listused && !bBidir) + fenc->intraMbs[b - p0]++; + } + else + { + m_slice[slice].costEst += bcost; + m_slice[slice].costEstAq += bcostAq; + if (!listused && !bBidir) + m_slice[slice].intraMbs++; } - fenc->rowSatds[b - p0][p1 - b][cuy] += bcostAq; } + + fenc->rowSatds[b - p0][p1 - b][cuY] += bcostAq; fenc->lowresCosts[b - p0][p1 - b][cuXY] = (uint16_t)(X265_MIN(bcost, LOWRES_COST_MASK) | (listused << LOWRES_COST_SHIFT)); } diff -Nru x265-1.5/source/encoder/slicetype.h x265-1.6/source/encoder/slicetype.h --- x265-1.5/source/encoder/slicetype.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/slicetype.h 2015-04-02 16:46:36.000000000 +0000 @@ -28,141 +28,135 @@ #include "slice.h" #include "motion.h" #include "piclist.h" -#include "wavefront.h" +#include "threadpool.h" namespace x265 { // private namespace struct Lowres; class Frame; +class Lookahead; #define LOWRES_COST_MASK ((1 << 14) - 1) #define LOWRES_COST_SHIFT 14 -#define SET_WEIGHT(w, b, s, d, o) \ - { \ - (w).inputWeight = (s); \ - (w).log2WeightDenom = (d); \ - (w).inputOffset = (o); \ - (w).bPresentFlag = b; \ - } - -class EstimateRow +/* Thread local data for lookahead tasks */ +struct LookaheadTLD { -public: - x265_param* m_param; - MotionEstimate m_me; - Lock m_lock; - - volatile uint32_t m_completed; // Number of CUs in this row for which cost estimation is completed - volatile bool m_active; - - uint64_t m_costEst; // Estimated cost for all CUs in a row - uint64_t m_costEstAq; // Estimated weight Aq cost for all CUs in a row - uint64_t m_costIntraAq; // Estimated weighted Aq Intra cost for all CUs in a row 
- int m_intraMbs; // Number of Intra CUs - int m_costIntra; // Estimated Intra cost for all CUs in a row + MotionEstimate me; + ReferencePlanes weightedRef; + pixel* wbuffer[4]; + int widthInCU; + int heightInCU; + int ncu; + int paddedLines; + +#if DETAILED_CU_STATS + int64_t batchElapsedTime; + int64_t coopSliceElapsedTime; + uint64_t countBatches; + uint64_t countCoopSlices; +#endif - int m_merange; - int m_lookAheadLambda; - - int m_widthInCU; - int m_heightInCU; + LookaheadTLD() + { + me.setQP(X265_LOOKAHEAD_QP); + me.init(X265_HEX_SEARCH, 1, X265_CSP_I400); + for (int i = 0; i < 4; i++) + wbuffer[i] = NULL; + widthInCU = heightInCU = ncu = paddedLines = 0; + +#if DETAILED_CU_STATS + batchElapsedTime = 0; + coopSliceElapsedTime = 0; + countBatches = 0; + countCoopSlices = 0; +#endif + } - EstimateRow() + void init(int w, int h, int n) { - m_me.setQP(X265_LOOKAHEAD_QP); - m_me.init(X265_HEX_SEARCH, 1, X265_CSP_I400); - m_merange = 16; - m_lookAheadLambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP]; + widthInCU = w; + heightInCU = h; + ncu = n; } - void init(); + ~LookaheadTLD() { X265_FREE(wbuffer[0]); } - void estimateCUCost(Lowres * *frames, ReferencePlanes * wfref0, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2]); -}; + void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param); + void lowresIntraEstimate(Lowres& fenc); -/* CostEstimate manages the cost estimation of a single frame, ie: - * estimateFrameCost() and everything below it in the call graph */ -class CostEstimate : public WaveFront -{ -public: - CostEstimate(ThreadPool *p); - ~CostEstimate(); - void init(x265_param *, Frame *); - - x265_param *m_param; - EstimateRow *m_rows; - pixel *m_wbuffer[4]; - Lowres **m_curframes; - - ReferencePlanes m_weightedRef; - WeightParam m_w; - - int m_paddedLines; // number of lines in padded frame - int m_widthInCU; // width of lowres frame in downscale CUs - int m_heightInCU; // height of lowres frame in downscale CUs - - bool m_bDoSearch[2]; - 
volatile bool m_bFrameCompleted; - int m_curb, m_curp0, m_curp1; - - void processRow(int row, int threadId); - int64_t estimateFrameCost(Lowres **frames, int p0, int p1, int b, bool bIntraPenalty); + void weightsAnalyse(Lowres& fenc, Lowres& ref); protected: - void weightsAnalyse(Lowres **frames, int b, int p0); - uint32_t weightCostLuma(Lowres **frames, int b, int p0, WeightParam *w); + uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp); + uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp); + bool allocWeightedRef(Lowres& fenc); }; class Lookahead : public JobProvider { public: + PicList m_inputQueue; // input pictures in order received + PicList m_outputQueue; // pictures to be encoded, in encode order + Lock m_inputLock; + Lock m_outputLock; + + /* pre-lookahead */ + Frame* m_preframes[X265_LOOKAHEAD_MAX]; + int m_preTotal, m_preAcquired, m_preCompleted; + int m_fullQueueSize; + bool m_isActive; + bool m_sliceTypeBusy; + bool m_bAdaptiveQuant; + bool m_outputSignalRequired; + bool m_bBatchMotionSearch; + bool m_bBatchFrameCosts; + Lock m_preLookaheadLock; + Event m_outputSignal; + + LookaheadTLD* m_tld; + x265_param* m_param; + Lowres* m_lastNonB; + int* m_scratch; // temp buffer for cutree propagate + + int m_histogram[X265_BFRAME_MAX + 1]; + int m_lastKeyframe; + int m_8x8Width; + int m_8x8Height; + int m_8x8Blocks; + int m_numCoopSlices; + int m_numRowsPerSlice; + bool m_filled; + Lookahead(x265_param *param, ThreadPool *pool); - ~Lookahead(); - void init(); - void destroy(); - - CostEstimate m_est; // Frame cost estimator - PicList m_inputQueue; // input pictures in order received - PicList m_outputQueue; // pictures to be encoded, in encode order - - x265_param *m_param; - Lowres *m_lastNonB; - int *m_scratch; // temp buffer - - int m_widthInCU; // width of lowres frame in downscale CUs - int m_heightInCU; // height of lowres frame in downscale CUs - int m_lastKeyframe; - int m_histogram[X265_BFRAME_MAX + 1]; - 
- void addPicture(Frame*, int sliceType); - void flush(); - void stop(); - Frame* getDecidedPicture(); - void getEstimatedPictureCost(Frame *pic); +#if DETAILED_CU_STATS + int64_t m_slicetypeDecideElapsedTime; + int64_t m_preLookaheadElapsedTime; + uint64_t m_countSlicetypeDecide; + uint64_t m_countPreLookahead; + void getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount); +#endif + + bool create(); + void destroy(); + void stop(); + + void addPicture(Frame&, int sliceType); + void flush(); + Frame* getDecidedPicture(); + + void getEstimatedPictureCost(Frame *pic); -protected: +protected: - Lock m_inputQueueLock; - Lock m_outputQueueLock; - Event m_outputAvailable; - - bool m_bReady; /* input lock - slicetypeDecide() can be started */ - bool m_bBusy; /* input lock - slicetypeDecide() is running */ - bool m_bFilled; /* enough frames in lookahead for output to be available */ - bool m_bFlushed; /* all frames have been decided, lookahead is finished */ - bool m_bFlush; /* no more frames will be received, empty the input queue */ - - bool findJob(int); - - /* called by addPicture() or flush() to trigger slice decisions */ - void slicetypeDecide(); - void slicetypeAnalyse(Lowres **frames, bool bKeyframe); + void findJob(int workerThreadID); + void slicetypeDecide(); + void slicetypeAnalyse(Lowres **frames, bool bKeyframe); /* called by slicetypeAnalyse() to make slice decisions */ bool scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames, int maxSearch); @@ -174,13 +168,63 @@ /* called by slicetypeAnalyse() to effect cuTree adjustments to adaptive * quant offsets */ - void cuTree(Lowres **frames, int numframes, bool bintra); - void estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced); - void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance); + void cuTree(Lowres **frames, int numframes, bool bintra); + void 
estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced); + void cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance); /* called by getEstimatedPictureCost() to finalize cuTree costs */ int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b); }; + +class CostEstimateGroup : public BondedTaskGroup +{ +public: + + Lookahead& m_lookahead; + Lowres** m_frames; + bool m_batchMode; + + CostEstimateGroup(Lookahead& l, Lowres** f) : m_lookahead(l), m_frames(f), m_batchMode(false) {} + + /* Cooperative cost estimate using multiple slices of downscaled frame */ + struct Coop + { + int p0, b, p1; + bool bDoSearch[2]; + } m_coop; + + enum { MAX_COOP_SLICES = 32 }; + struct Slice + { + int costEst; + int costEstAq; + int intraMbs; + } m_slice[MAX_COOP_SLICES]; + + int64_t singleCost(int p0, int p1, int b, bool intraPenalty = false); + + /* Batch cost estimates, using one worker thread per estimateFrameCost() call */ + enum { MAX_BATCH_SIZE = 512 }; + struct Estimate + { + int p0, b, p1; + } m_estimates[MAX_BATCH_SIZE]; + + void add(int p0, int p1, int b); + void finishBatch(); + +protected: + + static const int s_merange = 16; + + void processTasks(int workerThreadID); + + int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty); + void estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice); + + CostEstimateGroup& operator=(const CostEstimateGroup&); +}; + } #endif // ifndef X265_SLICETYPE_H diff -Nru x265-1.5/source/encoder/weightPrediction.cpp x265-1.6/source/encoder/weightPrediction.cpp --- x265-1.5/source/encoder/weightPrediction.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/encoder/weightPrediction.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -27,8 +27,8 @@ #include "frame.h" #include "picyuv.h" #include "lowres.h" +#include "slice.h" #include "mv.h" -#include "slicetype.h" #include "bitstream.h" 
using namespace x265; @@ -58,6 +58,7 @@ void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs) { intptr_t stride = ref.lumaStride; + const int mvshift = 1 << 2; const int cuSize = 8; MV mvmin, mvmax; @@ -66,15 +67,15 @@ for (int y = 0; y < ref.lines; y += cuSize) { intptr_t pixoff = y * stride; - mvmin.y = (int16_t)((-y - 8) << 2); - mvmax.y = (int16_t)((ref.lines - y - 1 + 8) << 2); + mvmin.y = (int16_t)((-y - 8) * mvshift); + mvmax.y = (int16_t)((ref.lines - y - 1 + 8) * mvshift); for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++) { ALIGN_VAR_16(pixel, buf8x8[8 * 8]); intptr_t bstride = 8; - mvmin.x = (int16_t)((-x - 8) << 2); - mvmax.x = (int16_t)((ref.width - x - 1 + 8) << 2); + mvmin.x = (int16_t)((-x - 8) * mvshift); + mvmax.x = (int16_t)((ref.width - x - 1 + 8) * mvshift); /* clip MV to available pixels */ MV mv = mvs[cu]; @@ -100,6 +101,7 @@ int csp = cache.csp; int bw = 16 >> cache.hshift; int bh = 16 >> cache.vshift; + const int mvshift = 1 << 2; MV mvmin, mvmax; for (int y = 0; y < height; y += bh) @@ -109,8 +111,8 @@ * into the lowres structures */ int cu = y * cache.lowresWidthInCU; intptr_t pixoff = y * stride; - mvmin.y = (int16_t)((-y - 8) << 2); - mvmax.y = (int16_t)((height - y - 1 + 8) << 2); + mvmin.y = (int16_t)((-y - 8) * mvshift); + mvmax.y = (int16_t)((height - y - 1 + 8) * mvshift); for (int x = 0; x < width; x += bw, cu++, pixoff += bw) { @@ -122,8 +124,8 @@ mv.y >>= cache.vshift; /* clip MV to available pixels */ - mvmin.x = (int16_t)((-x - 8) << 2); - mvmax.x = (int16_t)((width - x - 1 + 8) << 2); + mvmin.x = (int16_t)((-x - 8) * mvshift); + mvmax.x = (int16_t)((width - x - 1 + 8) * mvshift); mv = mv.clipped(mvmin, mvmax); intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2); diff -Nru x265-1.5/source/input/y4m.cpp x265-1.6/source/input/y4m.cpp --- x265-1.5/source/input/y4m.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/input/y4m.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -177,147 +177,118 @@ int csp = 
0; int d = 0; - while (!ifs->eof()) + while (ifs->good()) { // Skip Y4MPEG string int c = ifs->get(); - while (!ifs->eof() && (c != ' ') && (c != '\n')) - { + while (ifs->good() && (c != ' ') && (c != '\n')) c = ifs->get(); - } - while (c == ' ' && !ifs->eof()) + while (c == ' ' && ifs->good()) { // read parameter identifier switch (ifs->get()) { case 'W': width = 0; - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == ' ' || c == '\n') - { break; - } else - { width = width * 10 + (c - '0'); - } } - break; case 'H': height = 0; - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == ' ' || c == '\n') - { break; - } else - { height = height * 10 + (c - '0'); - } } - break; case 'F': rateNum = 0; rateDenom = 0; - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == '.') { rateDenom = 1; - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == ' ' || c == '\n') - { break; - } else { rateNum = rateNum * 10 + (c - '0'); rateDenom = rateDenom * 10; } } - break; } else if (c == ':') { - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == ' ' || c == '\n') - { break; - } else rateDenom = rateDenom * 10 + (c - '0'); } - break; } else - { rateNum = rateNum * 10 + (c - '0'); - } } - break; case 'A': sarWidth = 0; sarHeight = 0; - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == ':') { - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c == ' ' || c == '\n') - { break; - } else sarHeight = sarHeight * 10 + (c - '0'); } - break; } else - { sarWidth = sarWidth * 10 + (c - '0'); - } } - break; case 'C': csp = 0; d = 0; - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); if (c <= '9' && c >= '0') - { csp = csp * 10 + (c - '0'); - } else if (c == 'p') { // example: C420p16 - while (!ifs->eof()) + while (ifs->good()) { c = ifs->get(); @@ -338,22 +309,19 @@ break; default: - while (!ifs->eof()) + while (ifs->good()) { // consume this unsupported configuration 
word c = ifs->get(); if (c == ' ' || c == '\n') break; } - break; } } if (c == '\n') - { break; - } } if (width < MIN_FRAME_WIDTH || width > MAX_FRAME_WIDTH || diff -Nru x265-1.5/source/output/y4m.cpp x265-1.6/source/output/y4m.cpp --- x265-1.5/source/output/y4m.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/output/y4m.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -46,9 +46,7 @@ } for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) - { frameSize += (uint32_t)((width >> x265_cli_csps[colorSpace].width[i]) * (height >> x265_cli_csps[colorSpace].height[i])); - } } Y4MOutput::~Y4MOutput() @@ -66,14 +64,10 @@ #if HIGH_BIT_DEPTH if (pic.bitDepth > 8 && pic.poc == 0) - { x265_log(NULL, X265_LOG_WARNING, "y4m: down-shifting reconstructed pixels to 8 bits\n"); - } #else if (pic.bitDepth > 8 && pic.poc == 0) - { x265_log(NULL, X265_LOG_WARNING, "y4m: forcing reconstructed pixels to 8 bits\n"); - } #endif X265_CHECK(pic.colorSpace == colorSpace, "invalid color space\n"); @@ -89,9 +83,7 @@ for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) { for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++) - { buf[w] = (char)(src[w] >> shift); - } ofs.write(buf, width >> x265_cli_csps[colorSpace].width[i]); src += pic.stride[i] / sizeof(*src); diff -Nru x265-1.5/source/output/yuv.cpp x265-1.6/source/output/yuv.cpp --- x265-1.5/source/output/yuv.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/output/yuv.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -39,9 +39,7 @@ buf = new char[width]; for (int i = 0; i < x265_cli_csps[colorSpace].planes; i++) - { frameSize += (uint32_t)((width >> x265_cli_csps[colorSpace].width[i]) * (height >> x265_cli_csps[colorSpace].height[i])); - } } YUVOutput::~YUVOutput() @@ -69,9 +67,7 @@ for (int h = 0; h < height >> x265_cli_csps[colorSpace].height[i]; h++) { for (int w = 0; w < width >> x265_cli_csps[colorSpace].width[i]; w++) - { buf[w] = (char)(src[w] >> shift); - } ofs.write(buf, width >> 
x265_cli_csps[colorSpace].width[i]); src += pic.stride[i] / sizeof(*src); diff -Nru x265-1.5/source/profile/cpuEvents.h x265-1.6/source/profile/cpuEvents.h --- x265-1.5/source/profile/cpuEvents.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/profile/cpuEvents.h 2015-04-02 16:46:36.000000000 +0000 @@ -5,6 +5,7 @@ CPU_EVENT(filterCTURow) CPU_EVENT(slicetypeDecideEV) CPU_EVENT(prelookahead) -CPU_EVENT(costEstimateRow) +CPU_EVENT(estCostSingle) +CPU_EVENT(estCostCoop) CPU_EVENT(pmode) CPU_EVENT(pme) diff -Nru x265-1.5/source/test/CMakeLists.txt x265-1.6/source/test/CMakeLists.txt --- x265-1.5/source/test/CMakeLists.txt 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/CMakeLists.txt 2015-04-02 16:46:36.000000000 +0000 @@ -23,3 +23,6 @@ ipfilterharness.cpp ipfilterharness.h intrapredharness.cpp intrapredharness.h) target_link_libraries(TestBench x265-static ${PLATFORM_LIBS}) +if(LINKER_OPTIONS) + set_target_properties(TestBench PROPERTIES LINK_FLAGS ${LINKER_OPTIONS}) +endif() diff -Nru x265-1.5/source/test/ipfilterharness.cpp x265-1.6/source/test/ipfilterharness.cpp --- x265-1.5/source/test/ipfilterharness.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/ipfilterharness.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -61,7 +61,7 @@ } } -bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma, int csp) +bool IPFilterHarness::check_IPFilter_primitive(filter_p2s_wxh_t ref, filter_p2s_wxh_t opt, int isChroma, int csp) { intptr_t rand_srcStride; int min_size = isChroma ? 
2 : 4; @@ -512,6 +512,46 @@ return true; } +bool IPFilterHarness::check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt) +{ + for (int i = 0; i < ITERS; i++) + { + intptr_t rand_srcStride = rand() % 100; + int index = i % TEST_CASES; + + ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s); + + checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel))) + return false; + + reportfail(); + } + + return true; +} + +bool IPFilterHarness::check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt) +{ + for (int i = 0; i < ITERS; i++) + { + intptr_t rand_srcStride = rand() % 100; + int index = i % TEST_CASES; + + ref(pixel_test_buff[index] + i, rand_srcStride, IPF_C_output_s); + + checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s); + + if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel))) + return false; + + reportfail(); + } + + return true; +} + bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt) { if (opt.luma_p2s) @@ -582,6 +622,14 @@ return false; } } + if (opt.pu[value].filter_p2s) + { + if (!check_IPFilterLumaP2S_primitive(ref.pu[value].filter_p2s, opt.pu[value].filter_p2s)) + { + printf("filter_p2s[%s]", lumaPartStr[value]); + return false; + } + } } for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++) @@ -644,6 +692,14 @@ return false; } } + if (opt.chroma[csp].pu[value].chroma_p2s) + { + if (!check_IPFilterChromaP2S_primitive(ref.chroma[csp].pu[value].chroma_p2s, opt.chroma[csp].pu[value].chroma_p2s)) + { + printf("chroma_p2s[%s]", chromaPartStr[csp][value]); + return false; + } + } } } @@ -720,6 +776,13 @@ REPORT_SPEEDUP(opt.pu[value].luma_hvpp, ref.pu[value].luma_hvpp, pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, srcStride, 1, 3); } + + if (opt.pu[value].filter_p2s) + { + printf("filter_p2s [%s]\t", lumaPartStr[value]); 
+ REPORT_SPEEDUP(opt.pu[value].filter_p2s, ref.pu[value].filter_p2s, + pixel_buff, srcStride, IPF_vec_output_s); + } } for (int csp = X265_CSP_I420; csp < X265_CSP_COUNT; csp++) @@ -773,6 +836,14 @@ short_buff + maxVerticalfilterHalfDistance * srcStride, srcStride, IPF_vec_output_s, dstStride, 1); } + + if (opt.chroma[csp].pu[value].chroma_p2s) + { + printf("chroma_p2s[%s]\t", chromaPartStr[csp][value]); + REPORT_SPEEDUP(opt.chroma[csp].pu[value].chroma_p2s, ref.chroma[csp].pu[value].chroma_p2s, + pixel_buff, srcStride, + IPF_vec_output_s); + } } } } diff -Nru x265-1.5/source/test/ipfilterharness.h x265-1.6/source/test/ipfilterharness.h --- x265-1.5/source/test/ipfilterharness.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/ipfilterharness.h 2015-04-02 16:46:36.000000000 +0000 @@ -50,7 +50,7 @@ pixel pixel_test_buff[TEST_CASES][TEST_BUF_SIZE]; int16_t short_test_buff[TEST_CASES][TEST_BUF_SIZE]; - bool check_IPFilter_primitive(filter_p2s_t ref, filter_p2s_t opt, int isChroma, int csp); + bool check_IPFilter_primitive(filter_p2s_wxh_t ref, filter_p2s_wxh_t opt, int isChroma, int csp); bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt); bool check_IPFilterChroma_ps_primitive(filter_ps_t ref, filter_ps_t opt); bool check_IPFilterChroma_hps_primitive(filter_hps_t ref, filter_hps_t opt); @@ -62,6 +62,8 @@ bool check_IPFilterLuma_sp_primitive(filter_sp_t ref, filter_sp_t opt); bool check_IPFilterLuma_ss_primitive(filter_ss_t ref, filter_ss_t opt); bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt); + bool check_IPFilterLumaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt); + bool check_IPFilterChromaP2S_primitive(filter_p2s_t ref, filter_p2s_t opt); public: diff -Nru x265-1.5/source/test/mbdstharness.cpp x265-1.6/source/test/mbdstharness.cpp --- x265-1.5/source/test/mbdstharness.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/mbdstharness.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -209,7 +209,7 @@ 
for (int i = 0; i < ITERS; i++) { - int width = (rand() % 4 + 1) * 4; + int width = 1 << (rand() % 4 + 2); int height = width; uint32_t optReturnValue = 0; @@ -278,42 +278,19 @@ return true; } - bool MBDstHarness::check_count_nonzero_primitive(count_nonzero_t ref, count_nonzero_t opt) { - ALIGN_VAR_32(int16_t, qcoeff[32 * 32]); - - for (int i = 0; i < 4; i++) + int j = 0; + for (int i = 0; i < ITERS; i++) { - int log2TrSize = i + 2; - int num = 1 << (log2TrSize * 2); - int mask = num - 1; - - for (int n = 0; n <= num; n++) - { - memset(qcoeff, 0, num * sizeof(int16_t)); - - for (int j = 0; j < n; j++) - { - int k = rand() & mask; - while (qcoeff[k]) - { - k = (k + 11) & mask; - } - - qcoeff[k] = (int16_t)rand() - RAND_MAX / 2; - } - - int refval = ref(qcoeff, num); - int optval = (int)checked(opt, qcoeff, num); - - if (refval != optval) - return false; - - reportfail(); - } + int index = i % TEST_CASES; + int opt_cnt = (int)checked(opt, short_test_buff[index] + j); + int ref_cnt = ref(short_test_buff[index] + j); + if (ref_cnt != opt_cnt) + return false; + reportfail(); + j += INCR; } - return true; } @@ -437,16 +414,17 @@ return false; } } - - if (opt.count_nonzero) + for (int i = 0; i < NUM_TR_SIZE; i++) { - if (!check_count_nonzero_primitive(ref.count_nonzero, opt.count_nonzero)) + if (opt.cu[i].count_nonzero) { - printf("count_nonzero: Failed!\n"); - return false; + if (!check_count_nonzero_primitive(ref.cu[i].count_nonzero, opt.cu[i].count_nonzero)) + { + printf("count_nonzero[%dx%d] Failed!\n", 4 << i, 4 << i); + return false; + } } } - if (opt.dequant_scaling) { if (!check_dequant_primitive(ref.dequant_scaling, opt.dequant_scaling)) @@ -523,16 +501,14 @@ printf("nquant\t\t"); REPORT_SPEEDUP(opt.nquant, ref.nquant, short_test_buff[0], int_test_buff[1], mshortbuf2, 23, 23785, 32 * 32); } - - if (opt.count_nonzero) + for (int value = 0; value < NUM_TR_SIZE; value++) { - for (int i = 4; i <= 32; i <<= 1) + if (opt.cu[value].count_nonzero) { - 
printf("count_nonzero[%dx%d]", i, i); - REPORT_SPEEDUP(opt.count_nonzero, ref.count_nonzero, mbuf1, i * i) + printf("count_nonzero[%dx%d]", 4 << value, 4 << value); + REPORT_SPEEDUP(opt.cu[value].count_nonzero, ref.cu[value].count_nonzero, mbuf1); } } - if (opt.denoiseDct) { printf("denoiseDct\t"); diff -Nru x265-1.5/source/test/pixelharness.cpp x265-1.6/source/test/pixelharness.cpp --- x265-1.5/source/test/pixelharness.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/pixelharness.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -1149,6 +1149,71 @@ return true; } +bool PixelHarness::check_findPosLast(findPosLast_t ref, findPosLast_t opt) +{ + ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 2]); + uint8_t ref_coeffNum[MLS_GRP_NUM], opt_coeffNum[MLS_GRP_NUM]; // value range[0, 16] + uint16_t ref_coeffSign[MLS_GRP_NUM], opt_coeffSign[MLS_GRP_NUM]; // bit mask map for non-zero coeff sign + uint16_t ref_coeffFlag[MLS_GRP_NUM], opt_coeffFlag[MLS_GRP_NUM]; // bit mask map for non-zero coeff + + int totalCoeffs = 0; + for (int i = 0; i < 32 * 32; i++) + { + ref_src[i] = rand() & SHORT_MAX; + totalCoeffs += (ref_src[i] != 0); + } + + // extra test area all of 0x1234 + for (int i = 0; i < ITERS * 2; i++) + { + ref_src[32 * 32 + i] = 0x1234; + } + + + memset(ref_coeffNum, 0xCD, sizeof(ref_coeffNum)); + memset(ref_coeffSign, 0xCD, sizeof(ref_coeffSign)); + memset(ref_coeffFlag, 0xCD, sizeof(ref_coeffFlag)); + + memset(opt_coeffNum, 0xCD, sizeof(opt_coeffNum)); + memset(opt_coeffSign, 0xCD, sizeof(opt_coeffSign)); + memset(opt_coeffFlag, 0xCD, sizeof(opt_coeffFlag)); + + for (int i = 0; i < ITERS; i++) + { + int rand_scan_type = rand() % NUM_SCAN_TYPE; + int rand_scan_size = rand() % NUM_SCAN_SIZE; + int rand_numCoeff = 0; + + for (int j = 0; j < 1 << (2 * (rand_scan_size + 2)); j++) + rand_numCoeff += (ref_src[i + j] != 0); + + const uint16_t* const scanTbl = g_scanOrder[rand_scan_type][rand_scan_size]; + + int ref_scanPos = ref(scanTbl, ref_src + i, ref_coeffSign, 
ref_coeffFlag, ref_coeffNum, rand_numCoeff); + int opt_scanPos = (int)checked(opt, scanTbl, ref_src + i, opt_coeffSign, opt_coeffFlag, opt_coeffNum, rand_numCoeff); + + if (ref_scanPos != opt_scanPos) + return false; + + for (int j = 0; rand_numCoeff; j++) + { + if (ref_coeffSign[j] != opt_coeffSign[j]) + return false; + + if (ref_coeffFlag[j] != opt_coeffFlag[j]) + return false; + + if (ref_coeffNum[j] != opt_coeffNum[j]) + return false; + + rand_numCoeff -= ref_coeffNum[j]; + } + + reportfail(); + } + + return true; +} bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt) { @@ -1299,6 +1364,14 @@ return false; } } + if (opt.chroma[i].pu[part].satd) + { + if (!check_pixelcmp(ref.chroma[i].pu[part].satd, opt.chroma[i].pu[part].satd)) + { + printf("chroma_satd[%s][%s] failed!\n", x265_source_csp_names[i], chromaPartStr[i][part]); + return false; + } + } if (part < NUM_CU_SIZES) { if (opt.chroma[i].cu[part].sub_ps) @@ -1467,7 +1540,7 @@ { if (!check_cpy2Dto1D_shl_t(ref.cu[i].cpy2Dto1D_shl, opt.cu[i].cpy2Dto1D_shl)) { - printf("cpy2Dto1D_shl failed!\n"); + printf("cpy2Dto1D_shl[%dx%d] failed!\n", 4 << i, 4 << i); return false; } } @@ -1645,6 +1718,15 @@ } } + if (opt.findPosLast) + { + if (!check_findPosLast(ref.findPosLast, opt.findPosLast)) + { + printf("findPosLast failed!\n"); + return false; + } + } + return true; } @@ -1688,7 +1770,7 @@ if (opt.pu[part].copy_pp) { HEADER("copy_pp[%s]", lumaPartStr[part]); - REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 128); + REPORT_SPEEDUP(opt.pu[part].copy_pp, ref.pu[part].copy_pp, pbuf1, 64, pbuf2, 64); } if (opt.pu[part].addAvg) @@ -1723,7 +1805,7 @@ if (opt.cu[part].copy_ss) { HEADER("copy_ss[%s]", lumaPartStr[part]); - REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 64, sbuf2, 128); + REPORT_SPEEDUP(opt.cu[part].copy_ss, ref.cu[part].copy_ss, sbuf1, 128, sbuf2, 128); } if (opt.cu[part].copy_sp) { @@ -1733,7 +1815,7 @@ if 
(opt.cu[part].copy_ps) { HEADER("copy_ps[%s]", lumaPartStr[part]); - REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 64, pbuf1, 128); + REPORT_SPEEDUP(opt.cu[part].copy_ps, ref.cu[part].copy_ps, sbuf1, 128, pbuf1, 64); } } @@ -1749,6 +1831,11 @@ HEADER("[%s] addAvg[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); REPORT_SPEEDUP(opt.chroma[i].pu[part].addAvg, ref.chroma[i].pu[part].addAvg, sbuf1, sbuf2, pbuf1, STRIDE, STRIDE, STRIDE); } + if (opt.chroma[i].pu[part].satd) + { + HEADER("[%s] satd[%s]", x265_source_csp_names[i], chromaPartStr[i][part]); + REPORT_SPEEDUP(opt.chroma[i].pu[part].satd, ref.chroma[i].pu[part].satd, pbuf1, STRIDE, fref, STRIDE); + } if (part < NUM_CU_SIZES) { if (opt.chroma[i].cu[part].copy_ss) @@ -1990,4 +2077,13 @@ HEADER0("propagateCost"); REPORT_SPEEDUP(opt.propagateCost, ref.propagateCost, ibuf1, ushort_test_buff[0], int_test_buff[0], ushort_test_buff[0], int_test_buff[0], double_test_buff[0], 80); } + + if (opt.findPosLast) + { + HEADER0("findPosLast"); + coeff_t coefBuf[32 * 32]; + memset(coefBuf, 0, sizeof(coefBuf)); + memset(coefBuf + 32 * 31, 1, 32 * sizeof(coeff_t)); + REPORT_SPEEDUP(opt.findPosLast, ref.findPosLast, g_scanOrder[SCAN_DIAG][NUM_SCAN_SIZE - 1], coefBuf, (uint16_t*)sbuf1, (uint16_t*)sbuf2, (uint8_t*)psbuf1, 32); + } } diff -Nru x265-1.5/source/test/pixelharness.h x265-1.6/source/test/pixelharness.h --- x265-1.5/source/test/pixelharness.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/pixelharness.h 2015-04-02 16:46:36.000000000 +0000 @@ -104,6 +104,7 @@ bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt); bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt); bool check_calSign(sign_t ref, sign_t opt); + bool check_findPosLast(findPosLast_t ref, findPosLast_t opt); public: diff -Nru x265-1.5/source/test/rate-control-tests.txt x265-1.6/source/test/rate-control-tests.txt --- x265-1.5/source/test/rate-control-tests.txt 1970-01-01 00:00:00.000000000 +0000 +++ 
x265-1.6/source/test/rate-control-tests.txt 2015-04-02 16:46:36.000000000 +0000 @@ -0,0 +1,34 @@ +# List of command lines to be run by rate control regression tests, see https://bitbucket.org/sborho/test-harness + +# This test is listed first since it currently reproduces bugs +big_buck_bunny_360p24.y4m,--preset medium --bitrate 1000 --pass 1 -F4,--preset medium --bitrate 1000 --pass 2 -F4 + +# VBV tests, non-deterministic so testing for correctness and bitrate +# fluctuations - up to 1% bitrate fluctuation is allowed between runs +RaceHorses_416x240_30_10bit.yuv,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700 +RaceHorses_416x240_30_10bit.yuv,--preset superfast --bitrate 600 --vbv-bufsize 600 --vbv-maxrate 600 +RaceHorses_416x240_30_10bit.yuv,--preset veryslow --bitrate 1100 --vbv-bufsize 1100 --vbv-maxrate 1200 +112_1920x1080_25.yuv,--preset medium --bitrate 1000 --vbv-maxrate 1500 --vbv-bufsize 1500 --aud +112_1920x1080_25.yuv,--preset medium --bitrate 10000 --vbv-maxrate 10000 --vbv-bufsize 15000 --hrd +112_1920x1080_25.yuv,--preset medium --bitrate 4000 --vbv-maxrate 12000 --vbv-bufsize 12000 --repeat-headers +112_1920x1080_25.yuv,--preset superfast --bitrate 1000 --vbv-maxrate 1000 --vbv-bufsize 1500 --hrd --strict-cbr +112_1920x1080_25.yuv,--preset superfast --bitrate 30000 --vbv-maxrate 30000 --vbv-bufsize 30000 --repeat-headers +112_1920x1080_25.yuv,--preset superfast --bitrate 4000 --vbv-maxrate 6000 --vbv-bufsize 6000 --aud +112_1920x1080_25.yuv,--preset veryslow --bitrate 1000 --vbv-maxrate 3000 --vbv-bufsize 3000 --repeat-headers +big_buck_bunny_360p24.y4m,--preset medium --bitrate 1000 --vbv-bufsize 3000 --vbv-maxrate 3000 --repeat-headers +big_buck_bunny_360p24.y4m,--preset medium --bitrate 3000 --vbv-bufsize 3000 --vbv-maxrate 3000 --hrd +big_buck_bunny_360p24.y4m,--preset medium --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 600 --aud +big_buck_bunny_360p24.y4m,--preset medium --crf 1 --vbv-bufsize 3000 --vbv-maxrate 3000 --hrd 
+big_buck_bunny_360p24.y4m,--preset superfast --bitrate 1000 --vbv-bufsize 1000 --vbv-maxrate 1000 --aud --strict-cbr +big_buck_bunny_360p24.y4m,--preset superfast --bitrate 3000 --vbv-bufsize 9000 --vbv-maxrate 9000 --repeat-headers +big_buck_bunny_360p24.y4m,--preset superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd +big_buck_bunny_360p24.y4m,--preset superfast --crf 6 --vbv-bufsize 1000 --vbv-maxrate 1000 --aud + +# multi-pass rate control tests +big_buck_bunny_360p24.y4m,--preset slow --crf 40 --pass 1,--preset slow --bitrate 200 --pass 2 +big_buck_bunny_360p24.y4m,--preset medium --bitrate 700 --pass 1 -F4 --slow-firstpass,--preset medium --bitrate 700 --vbv-bufsize 900 --vbv-maxrate 700 --pass 2 -F4 +112_1920x1080_25.yuv,--preset slow --bitrate 1000 --pass 1 -F4,--preset slow --bitrate 1000 --pass 2 -F4 +112_1920x1080_25.yuv,--preset superfast --crf 12 --pass 1,--preset superfast --bitrate 4000 --pass 2 -F4 +RaceHorses_416x240_30_10bit.yuv,--preset veryslow --crf 40 --pass 1, --preset veryslow --bitrate 200 --pass 2 -F4 +RaceHorses_416x240_30_10bit.yuv,--preset superfast --bitrate 600 --pass 1 -F4 --slow-firstpass,--preset superfast --bitrate 600 --pass 2 -F4 +RaceHorses_416x240_30_10bit.yuv,--preset medium --crf 26 --pass 1,--preset medium --bitrate 500 --pass 3 -F4,--preset medium --bitrate 500 --pass 2 -F4 diff -Nru x265-1.5/source/test/regression-tests.txt x265-1.6/source/test/regression-tests.txt --- x265-1.5/source/test/regression-tests.txt 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/source/test/regression-tests.txt 2015-04-02 16:46:36.000000000 +0000 @@ -0,0 +1,127 @@ +# List of command lines to be run by regression tests, see https://bitbucket.org/sborho/test-harness + +# the vast majority of the commands are tested for results matching the +# most recent commit which was known to change outputs. The output +# bitstream must be bit-exact or the test fails. 
If no golden outputs +# are available the bitstream is validated (decoded) and then saved as a +# new golden output + +# Note: --nr-intra, --nr-inter, and --bitrate (ABR) give different +# outputs for different frame encoder counts. In order for outputs to be +# consistent across many machines, you must force a certain -FN so it is +# not auto-detected. + +BasketballDrive_1920x1080_50.y4m,--preset faster --aq-strength 2 --merange 190 +BasketballDrive_1920x1080_50.y4m,--preset medium --ctu 16 --max-tu-size 8 --subme 7 +BasketballDrive_1920x1080_50.y4m,--preset medium --keyint -1 --nr-inter 100 -F4 --no-sao +BasketballDrive_1920x1080_50.y4m,--preset slow --nr-intra 100 -F4 --aq-strength 3 +BasketballDrive_1920x1080_50.y4m,--preset slower --lossless --chromaloc 3 --subme 0 +BasketballDrive_1920x1080_50.y4m,--preset superfast --psy-rd 1 --ctu 16 --no-wpp +BasketballDrive_1920x1080_50.y4m,--preset ultrafast --signhide --colormatrix bt709 +BasketballDrive_1920x1080_50.y4m,--preset veryfast --tune zerolatency --no-temporal-mvp +BasketballDrive_1920x1080_50.y4m,--preset veryslow --crf 4 --cu-lossless --pmode +Coastguard-4k.y4m,--preset medium --rdoq-level 1 --tune ssim --no-signhide --me umh +Coastguard-4k.y4m,--preset slow --tune psnr --cbqpoffs -1 --crqpoffs 1 +Coastguard-4k.y4m,--preset superfast --tune grain --overscan=crop +CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --aq-mode 0 --sar 2 --range full +CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --max-tu-size 4 --min-cu-size 32 +CrowdRun_1920x1080_50_10bit_422.yuv,--preset medium --no-wpp --no-cutree --no-strong-intra-smoothing +CrowdRun_1920x1080_50_10bit_422.yuv,--preset slow --no-wpp --tune ssim --transfer smpte240m +CrowdRun_1920x1080_50_10bit_422.yuv,--preset slower --tune ssim --tune fastdecode +CrowdRun_1920x1080_50_10bit_422.yuv,--preset superfast --weightp --no-wpp --sao +CrowdRun_1920x1080_50_10bit_422.yuv,--preset ultrafast --weightp --tune zerolatency 
+CrowdRun_1920x1080_50_10bit_422.yuv,--preset veryfast --temporal-layers --tune grain +CrowdRun_1920x1080_50_10bit_444.yuv,--preset medium --dither --keyint -1 --rdoq-level 1 +CrowdRun_1920x1080_50_10bit_444.yuv,--preset superfast --weightp --dither --no-psy-rd +CrowdRun_1920x1080_50_10bit_444.yuv,--preset ultrafast --weightp --no-wpp --no-open-gop +CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryfast --temporal-layers --repeat-headers +CrowdRun_1920x1080_50_10bit_444.yuv,--preset veryslow --tskip --tskip-fast --no-scenecut +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset medium --tune psnr --bframes 16 +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset slow --temporal-layers --no-psy-rd +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset superfast --weightp +DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset medium --nr-inter 500 -F4 --no-psy-rdoq +DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset slower --no-weightp --rdoq-level 0 +DucksAndLegs_1920x1080_60_10bit_444.yuv,--preset veryfast --weightp --nr-intra 1000 -F4 +FourPeople_1280x720_60.y4m,--preset medium --qp 38 --no-psy-rd +FourPeople_1280x720_60.y4m,--preset superfast --no-wpp --lookahead-slices 2 +Keiba_832x480_30.y4m,--preset medium --pmode --tune grain +Keiba_832x480_30.y4m,--preset slower --fast-intra --nr-inter 500 -F4 +Keiba_832x480_30.y4m,--preset superfast --no-fast-intra --nr-intra 1000 -F4 +Kimono1_1920x1080_24_10bit_444.yuv,--preset medium --min-cu-size 32 +Kimono1_1920x1080_24_10bit_444.yuv,--preset superfast --weightb +KristenAndSara_1280x720_60.y4m,--preset medium --no-cutree --max-tu-size 16 +KristenAndSara_1280x720_60.y4m,--preset slower --pmode --max-tu-size 8 +KristenAndSara_1280x720_60.y4m,--preset superfast --min-cu-size 16 +KristenAndSara_1280x720_60.y4m,--preset ultrafast --strong-intra-smoothing +NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset medium --tune grain +NebutaFestival_2560x1600_60_10bit_crop.yuv,--preset superfast --tune psnr +News-4k.y4m,--preset medium --tune ssim 
--no-sao +News-4k.y4m,--preset superfast --lookahead-slices 6 --aq-mode 0 +OldTownCross_1920x1080_50_10bit_422.yuv,--preset medium --no-weightp +OldTownCross_1920x1080_50_10bit_422.yuv,--preset slower --tune fastdecode +OldTownCross_1920x1080_50_10bit_422.yuv,--preset superfast --weightp +ParkScene_1920x1080_24.y4m,--preset medium --qp 40 --rdpenalty 2 --tu-intra-depth 3 +ParkScene_1920x1080_24.y4m,--preset slower --no-weightp +ParkScene_1920x1080_24_10bit_444.yuv,--preset superfast --weightp --lookahead-slices 4 +RaceHorses_416x240_30.y4m,--preset medium --tskip-fast --tskip +RaceHorses_416x240_30.y4m,--preset slower --keyint -1 --rdoq-level 0 +RaceHorses_416x240_30.y4m,--preset superfast --no-cutree +RaceHorses_416x240_30.y4m,--preset veryslow --tskip-fast --tskip +RaceHorses_416x240_30_10bit.yuv,--preset fast --lookahead-slices 2 --b-intra +RaceHorses_416x240_30_10bit.yuv,--preset faster --rdoq-level 0 --dither +RaceHorses_416x240_30_10bit.yuv,--preset slow --tune grain +RaceHorses_416x240_30_10bit.yuv,--preset ultrafast --tune psnr +RaceHorses_416x240_30_10bit.yuv,--preset veryfast --weightb +RaceHorses_416x240_30_10bit.yuv,--preset placebo +SteamLocomotiveTrain_2560x1600_60_10bit_crop.yuv,--preset medium --dither +big_buck_bunny_360p24.y4m,--preset faster --keyint 240 --min-keyint 60 --rc-lookahead 200 +big_buck_bunny_360p24.y4m,--preset medium --keyint 60 --min-keyint 48 --weightb +big_buck_bunny_360p24.y4m,--preset slow --psy-rdoq 2.0 --rdoq-level 1 --no-b-intra +big_buck_bunny_360p24.y4m,--preset superfast --psy-rdoq 2.0 +big_buck_bunny_360p24.y4m,--preset ultrafast --deblock=2 +big_buck_bunny_360p24.y4m,--preset veryfast --no-deblock +city_4cif_60fps.y4m,--preset medium --crf 4 --cu-lossless --sao-non-deblock +city_4cif_60fps.y4m,--preset superfast --rdpenalty 1 --tu-intra-depth 2 +city_4cif_60fps.y4m,--preset slower --scaling-list default +city_4cif_60fps.y4m,--preset veryslow --rdpenalty 2 --sao-non-deblock --no-b-intra 
+ducks_take_off_420_720p50.y4m,--preset fast --deblock 6 --bframes 16 --rc-lookahead 40 +ducks_take_off_420_720p50.y4m,--preset faster --qp 24 --deblock -6 +ducks_take_off_420_720p50.y4m,--preset medium --tskip --tskip-fast --constrained-intra +ducks_take_off_420_720p50.y4m,--preset slow --scaling-list default --qp 40 +ducks_take_off_420_720p50.y4m,--preset ultrafast --constrained-intra --rd 1 +ducks_take_off_420_720p50.y4m,--preset veryslow --constrained-intra --bframes 2 +ducks_take_off_444_720p50.y4m,--preset medium --qp 38 --no-scenecut +ducks_take_off_444_720p50.y4m,--preset superfast --weightp --rd 0 +ducks_take_off_444_720p50.y4m,--preset slower --psy-rd 1 --psy-rdoq 2.0 --rdoq-level 1 +mobile_calendar_422_ntsc.y4m,--preset medium --bitrate 500 -F4 +mobile_calendar_422_ntsc.y4m,--preset slower --tskip --tskip-fast +mobile_calendar_422_ntsc.y4m,--preset superfast --weightp --rd 0 +mobile_calendar_422_ntsc.y4m,--preset veryslow --tskip +old_town_cross_444_720p50.y4m,--preset faster --rd 1 --tune zero-latency +old_town_cross_444_720p50.y4m,--preset medium --keyint -1 --no-weightp --ref 6 +old_town_cross_444_720p50.y4m,--preset slow --rdoq-level 1 --early-skip --ref 7 --no-b-pyramid +old_town_cross_444_720p50.y4m,--preset slower --crf 4 --cu-lossless +old_town_cross_444_720p50.y4m,--preset superfast --weightp --min-cu 16 +old_town_cross_444_720p50.y4m,--preset ultrafast --weightp --min-cu 32 +old_town_cross_444_720p50.y4m,--preset veryfast --qp 1 --tune ssim +parkrun_ter_720p50.y4m,--preset medium --no-open-gop --sao-non-deblock --crf 4 --cu-lossless +parkrun_ter_720p50.y4m,--preset slower --fast-intra --no-rect --tune grain +silent_cif_420.y4m,--preset medium --me full --rect --amp +silent_cif_420.y4m,--preset superfast --weightp --rect +silent_cif_420.y4m,--preset placebo --ctu 32 --no-sao +vtc1nw_422_ntsc.y4m,--preset medium --scaling-list default --ctu 16 --ref 5 +vtc1nw_422_ntsc.y4m,--preset slower --nr-inter 1000 -F4 --tune fast-decode 
+vtc1nw_422_ntsc.y4m,--preset superfast --weightp --nr-intra 100 -F4 +washdc_422_ntsc.y4m,--preset faster --rdoq-level 1 --max-merge 5 +washdc_422_ntsc.y4m,--preset medium --no-weightp --max-tu-size 4 +washdc_422_ntsc.y4m,--preset slower --psy-rdoq 2.0 --rdoq-level 2 +washdc_422_ntsc.y4m,--preset superfast --psy-rd 1 --tune zerolatency +washdc_422_ntsc.y4m,--preset ultrafast --weightp --tu-intra-depth 4 +washdc_422_ntsc.y4m,--preset veryfast --tu-inter-depth 4 +washdc_422_ntsc.y4m,--preset veryslow --crf 4 --cu-lossless + +# interlace test, even though input YUV is not field seperated +CrowdRun_1920x1080_50_10bit_422.yuv,--preset fast --interlace bff +CrowdRun_1920x1080_50_10bit_422.yuv,--preset faster --interlace tff + +# vim: tw=200 diff -Nru x265-1.5/source/test/smoke-tests.txt x265-1.6/source/test/smoke-tests.txt --- x265-1.5/source/test/smoke-tests.txt 1970-01-01 00:00:00.000000000 +0000 +++ x265-1.6/source/test/smoke-tests.txt 2015-04-02 16:46:36.000000000 +0000 @@ -0,0 +1,17 @@ +# List of command lines to be run by smoke tests, see https://bitbucket.org/sborho/test-harness + +big_buck_bunny_360p24.y4m,--preset=superfast --bitrate 400 --vbv-bufsize 600 --vbv-maxrate 400 --hrd --aud --repeat-headers +big_buck_bunny_360p24.y4m,--preset=medium --bitrate 1000 -F4 --cu-lossless --scaling-list default +big_buck_bunny_360p24.y4m,--preset=slower --no-weightp --cu-stats --pme +washdc_422_ntsc.y4m,--preset=faster --no-strong-intra-smoothing --keyint 1 +washdc_422_ntsc.y4m,--preset=medium --qp 40 --nr-inter 400 -F4 +washdc_422_ntsc.y4m,--preset=veryslow --pmode --tskip --rdoq-level 0 +old_town_cross_444_720p50.y4m,--preset=ultrafast --weightp --keyint -1 +old_town_cross_444_720p50.y4m,--preset=fast --keyint 20 --min-cu-size 16 +old_town_cross_444_720p50.y4m,--preset=slow --sao-non-deblock --pmode +RaceHorses_416x240_30_10bit.yuv,--preset=veryfast --cu-stats --max-tu-size 8 +RaceHorses_416x240_30_10bit.yuv,--preset=slower --bitrate 500 -F4 --rdoq-level 1 
+CrowdRun_1920x1080_50_10bit_444.yuv,--preset=ultrafast --constrained-intra --min-keyint 5 --keyint 10 +CrowdRun_1920x1080_50_10bit_444.yuv,--preset=medium --max-tu-size 16 +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=veryfast --min-cu 16 +DucksAndLegs_1920x1080_60_10bit_422.yuv,--preset=fast --weightb --interlace bff diff -Nru x265-1.5/source/test/testbench.cpp x265-1.6/source/test/testbench.cpp --- x265-1.5/source/test/testbench.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/testbench.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -174,7 +174,10 @@ for (int i = 0; test_arch[i].flag; i++) { if (test_arch[i].flag & cpuid) + { printf("Testing primitives: %s\n", test_arch[i].name); + fflush(stdout); + } else continue; @@ -188,6 +191,7 @@ continue; if (!harness[h]->testCorrectness(cprim, vecprim)) { + fflush(stdout); fprintf(stderr, "\nx265: intrinsic primitive has failed. Go and fix that Right Now!\n"); return -1; } @@ -204,6 +208,7 @@ continue; if (!harness[h]->testCorrectness(cprim, asmprim)) { + fflush(stdout); fprintf(stderr, "\nx265: asm primitive has failed. 
Go and fix that Right Now!\n"); return -1; } @@ -226,6 +231,7 @@ memcpy(&primitives, &optprim, sizeof(EncoderPrimitives)); printf("\nTest performance improvement with full optimizations\n"); + fflush(stdout); for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++) { diff -Nru x265-1.5/source/test/testharness.h x265-1.6/source/test/testharness.h --- x265-1.5/source/test/testharness.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/test/testharness.h 2015-04-02 16:46:36.000000000 +0000 @@ -158,7 +158,7 @@ m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, m_rand, \ m_rand, m_rand, m_rand, m_rand, m_rand), /* max_args+6 */ \ x265_checkasm_call_float((float(*)())func, &m_ok, 0, 0, 0, 0, __VA_ARGS__)) -#define reportfail() if (!m_ok) { fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); } +#define reportfail() if (!m_ok) { fflush(stdout); fprintf(stderr, "stack clobber check failed at %s:%d", __FILE__, __LINE__); abort(); } #elif ARCH_X86 #define checked(func, ...) x265_checkasm_call((intptr_t(*)())func, &m_ok, __VA_ARGS__); #define checked_float(func, ...) 
x265_checkasm_call_float((float(*)())func, &m_ok, __VA_ARGS__); diff -Nru x265-1.5/source/x265cli.h x265-1.6/source/x265cli.h --- x265-1.5/source/x265cli.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/x265cli.h 2015-04-02 16:46:36.000000000 +0000 @@ -37,7 +37,8 @@ { "version", no_argument, NULL, 'V' }, { "asm", required_argument, NULL, 0 }, { "no-asm", no_argument, NULL, 0 }, - { "threads", required_argument, NULL, 0 }, + { "pools", required_argument, NULL, 0 }, + { "numa-pools", required_argument, NULL, 0 }, { "preset", required_argument, NULL, 'p' }, { "tune", required_argument, NULL, 't' }, { "frame-threads", required_argument, NULL, 'F' }, @@ -71,6 +72,8 @@ { "no-wpp", no_argument, NULL, 0 }, { "wpp", no_argument, NULL, 0 }, { "ctu", required_argument, NULL, 's' }, + { "min-cu-size", required_argument, NULL, 0 }, + { "max-tu-size", required_argument, NULL, 0 }, { "tu-intra-depth", required_argument, NULL, 0 }, { "tu-inter-depth", required_argument, NULL, 0 }, { "me", required_argument, NULL, 0 }, @@ -96,6 +99,8 @@ { "no-cu-lossless", no_argument, NULL, 0 }, { "no-constrained-intra", no_argument, NULL, 0 }, { "constrained-intra", no_argument, NULL, 0 }, + { "cip", no_argument, NULL, 0 }, + { "no-cip", no_argument, NULL, 0 }, { "fast-intra", no_argument, NULL, 0 }, { "no-fast-intra", no_argument, NULL, 0 }, { "no-open-gop", no_argument, NULL, 0 }, @@ -105,6 +110,7 @@ { "scenecut", required_argument, NULL, 0 }, { "no-scenecut", no_argument, NULL, 0 }, { "rc-lookahead", required_argument, NULL, 0 }, + { "lookahead-slices", required_argument, NULL, 0 }, { "bframes", required_argument, NULL, 'b' }, { "bframe-bias", required_argument, NULL, 0 }, { "b-adapt", required_argument, NULL, 0 }, @@ -136,6 +142,8 @@ { "cbqpoffs", required_argument, NULL, 0 }, { "crqpoffs", required_argument, NULL, 0 }, { "rd", required_argument, NULL, 0 }, + { "rdoq-level", required_argument, NULL, 0 }, + { "no-rdoq-level", no_argument, NULL, 0 }, { "psy-rd", required_argument, 
NULL, 0 }, { "psy-rdoq", required_argument, NULL, 0 }, { "no-psy-rd", no_argument, NULL, 0 }, @@ -195,6 +203,8 @@ { "analysis-mode", required_argument, NULL, 0 }, { "analysis-file", required_argument, NULL, 0 }, { "strict-cbr", no_argument, NULL, 0 }, + { "temporal-layers", no_argument, NULL, 0 }, + { "no-temporal-layers", no_argument, NULL, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, @@ -246,10 +256,11 @@ H0(" --[no-]psnr Enable reporting PSNR metric scores. Default %s\n", OPT(param->bEnablePsnr)); H0("\nProfile, Level, Tier:\n"); H0(" --profile Enforce an encode profile: main, main10, mainstillpicture\n"); - H0(" --level-idc Force a minumum required decoder level (as '5.0' or '50')\n"); + H0(" --level-idc Force a minimum required decoder level (as '5.0' or '50')\n"); H0(" --[no-]high-tier If a decoder level is specified, this modifier selects High tier of that level\n"); H0("\nThreading, performance:\n"); - H0(" --threads Number of threads for thread pool (0: detect CPU core count, default)\n"); + H0(" --pools Comma separated thread count per thread pool (pool per NUMA node)\n"); + H0(" '-' implies no threads on node, '+' implies one thread per core on node\n"); H0("-F/--frame-threads Number of concurrently encoded frames. 0: auto-determined by core count\n"); H0(" --[no-]wpp Enable Wavefront Parallel Processing. Default %s\n", OPT(param->bEnableWavefront)); H0(" --[no-]pmode Parallel mode analysis. Default %s\n", OPT(param->bDistributeModeAnalysis)); @@ -262,14 +273,16 @@ H0(" psnr, ssim, grain, zerolatency, fastdecode\n"); H0("\nQuad-Tree size and depth:\n"); H0("-s/--ctu <64|32|16> Maximum CU size (WxH). Default %d\n", param->maxCUSize); + H0(" --min-cu-size <64|32|16|8> Minimum CU size (WxH). Default %d\n", param->minCUSize); + H0(" --max-tu-size <32|16|8|4> Maximum TU size (WxH). Default %d\n", param->maxTUSize); H0(" --tu-intra-depth Max TU recursive depth for intra CUs. 
Default %d\n", param->tuQTMaxIntraDepth); H0(" --tu-inter-depth Max TU recursive depth for inter CUs. Default %d\n", param->tuQTMaxInterDepth); H0("\nAnalysis:\n"); - H0(" --rd <0..6> Level of RD in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel); + H0(" --rd <0..6> Level of RDO in mode decision 0:least....6:full RDO. Default %d\n", param->rdLevel); H0(" --[no-]psy-rd <0..2.0> Strength of psycho-visual rate distortion optimization, 0 to disable. Default %.1f\n", param->psyRd); - H0(" --[no-]psy-rdoq <0..50.0> Strength of psycho-visual optimization in quantization, 0 to disable. Default %.1f\n", param->psyRdoq); + H0(" --[no-]rdoq-level <0|1|2> Level of RDO in quantization 0:none, 1:levels, 2:levels & coding groups. Default %d\n", param->rdoqLevel); + H0(" --[no-]psy-rdoq <0..50.0> Strength of psycho-visual optimization in RDO quantization, 0 to disable. Default %.1f\n", param->psyRdoq); H0(" --[no-]early-skip Enable early SKIP detection. Default %s\n", OPT(param->bEnableEarlySkip)); - H1(" --[no-]fast-cbf Enable early outs based on whether residual is coded. Default %s\n", OPT(param->bEnableCbfFastMode)); H1(" --[no-]tskip-fast Enable fast intra transform skipping. Default %s\n", OPT(param->bEnableTSkipFast)); H1(" --nr-intra An integer value in range of 0 to 2000, which denotes strength of noise reduction in intra CUs. Default 0\n"); H1(" --nr-inter An integer value in range of 0 to 2000, which denotes strength of noise reduction in inter CUs. Default 0\n"); @@ -300,6 +313,7 @@ H0(" --no-scenecut Disable adaptive I-frame decision\n"); H0(" --scenecut How aggressively to insert extra I-frames. Default %d\n", param->scenecutThreshold); H0(" --rc-lookahead Number of frames for frame-type lookahead (determines encoder latency) Default %d\n", param->lookaheadDepth); + H1(" --lookahead-slices <0..16> Number of slices to use per lookahead cost estimate. 
Default %d\n", param->lookaheadSlices); H0(" --bframes Maximum number of consecutive b-frames (now it only enables B GOP structure) Default %d\n", param->bframes); H1(" --bframe-bias Bias towards B frame decisions. Default %d\n", param->bFrameBias); H0(" --b-adapt <0..2> 0 - none, 1 - fast, 2 - full (trellis) adaptive B frame scheduling. Default %d\n", param->bFrameAdaptive); @@ -371,10 +385,11 @@ H1(" smpte240m, GBR, YCgCo, bt2020nc, bt2020c. Default undef\n"); H1(" --chromaloc Specify chroma sample location (0 to 5). Default of %d\n", param->vui.chromaSampleLocTypeTopField); H0("\nBitstream options:\n"); + H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders)); H0(" --[no-]info Emit SEI identifying encoder and parameters. Default %s\n", OPT(param->bEmitInfoSEI)); - H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters)); H0(" --[no-]hrd Enable HRD parameters signaling. Default %s\n", OPT(param->bEmitHRDSEI)); - H0(" --[no-]repeat-headers Emit SPS and PPS headers at each keyframe. Default %s\n", OPT(param->bRepeatHeaders)); + H0(" --[no-]temporal-layers Enable a temporal sublayer for unreferenced B frames. Default %s\n", OPT(param->bEnableTemporalSubLayers)); + H0(" --[no-]aud Emit access unit delimiters at the start of each access unit. Default %s\n", OPT(param->bEnableAccessUnitDelimiters)); H1(" --hash Decoded Picture Hash SEI 0: disabled, 1: MD5, 2: CRC, 3: Checksum. 
Default %d\n", param->decodedPictureHashSEI); H1("\nReconstructed video options (debugging):\n"); H1("-r/--recon Reconstructed raw image YUV or Y4M output file name\n"); diff -Nru x265-1.5/source/x265.cpp x265-1.6/source/x265.cpp --- x265-1.5/source/x265.cpp 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/x265.cpp 2015-04-02 16:46:36.000000000 +0000 @@ -147,6 +147,7 @@ if (!bProgress || !frameNum || (prevUpdateTime && time - prevUpdateTime < UPDATE_INTERVAL)) return; + int64_t elapsed = time - startTime; double fps = elapsed > 0 ? frameNum * 1000000. / elapsed : 0; float bitrate = 0.008f * totalbytes * (param->fpsNum / param->fpsDenom) / ((float)frameNum); @@ -158,9 +159,8 @@ eta / 3600, (eta / 60) % 60, eta % 60); } else - { sprintf(buf, "x265 %d frames: %.2f fps, %.2f kb/s", frameNum, fps, bitrate); - } + fprintf(stderr, "%s \r", buf + 5); SetConsoleTitle(buf); fflush(stderr); // needed in windows @@ -530,7 +530,7 @@ while (pic_in && !b_ctrl_c) { pic_orig.poc = inFrameCount; - if (cliopt.qpfile && !param->rc.bStatRead) + if (cliopt.qpfile) { if (!cliopt.parseQPFile(pic_orig)) { diff -Nru x265-1.5/source/x265.def.in x265-1.6/source/x265.def.in --- x265-1.5/source/x265.def.in 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/x265.def.in 2015-04-02 16:46:36.000000000 +0000 @@ -1,6 +1,5 @@ EXPORTS x265_encoder_open_${X265_BUILD} -x265_setup_primitives x265_param_default x265_param_default_preset x265_param_parse @@ -20,3 +19,4 @@ x265_encoder_log x265_encoder_close x265_cleanup +x265_api_get_${X265_BUILD} diff -Nru x265-1.5/source/x265.h x265-1.6/source/x265.h --- x265-1.5/source/x265.h 2015-02-10 21:15:13.000000000 +0000 +++ x265-1.6/source/x265.h 2015-04-02 16:46:36.000000000 +0000 @@ -91,19 +91,31 @@ /* Stores all analysis data for a single frame */ typedef struct x265_analysis_data { + void* interData; + void* intraData; uint32_t frameRecordSize; - int32_t poc; - int32_t sliceType; + uint32_t poc; + uint32_t sliceType; uint32_t numCUsInFrame; 
uint32_t numPartitions; - void* interData; - void* intraData; } x265_analysis_data; /* Used to pass pictures into the encoder, and to get picture data back out of * the encoder. The input and output semantics are different */ typedef struct x265_picture { + /* presentation time stamp: user-specified, returned on output */ + int64_t pts; + + /* display time stamp: ignored on input, copied from reordered pts. Returned + * on output */ + int64_t dts; + + /* force quantizer for != X265_QP_AUTO */ + /* The value provided on input is returned with the same picture (POC) on + * output */ + void* userData; + /* Must be specified on input pictures, the number of planes is determined * by the colorSpace value */ void* planes[3]; @@ -132,18 +144,8 @@ * initialize this value to the internal color space */ int colorSpace; - /* presentation time stamp: user-specified, returned on output */ - int64_t pts; - - /* display time stamp: ignored on input, copied from reordered pts. Returned - * on output */ - int64_t dts; - - /* The value provided on input is returned with the same picture (POC) on - * output */ - void* userData; - - /* force quantizer for != X265_QP_AUTO */ + /* Force the slice base QP for this picture within the encoder. 
Set to 0 + * to allow the encoder to determine base QP */ int forceqp; /* If param.analysisMode is X265_ANALYSIS_OFF this field is ignored on input @@ -159,8 +161,6 @@ * this data structure */ x265_analysis_data analysisData; - /* new data members to this structure must be added to the end so that - * users of x265_picture_alloc/free() can be assured of future safety */ } x265_picture; typedef enum @@ -229,7 +229,11 @@ #define X265_B_ADAPT_FAST 1 #define X265_B_ADAPT_TRELLIS 2 +#define X265_REF_LIMIT_DEPTH 1 +#define X265_REF_LIMIT_CU 2 + #define X265_BFRAME_MAX 16 +#define X265_MAX_FRAME_THREADS 16 #define X265_TYPE_AUTO 0x0000 /* Let x265 choose the right type */ #define X265_TYPE_IDR 0x0001 @@ -237,13 +241,14 @@ #define X265_TYPE_P 0x0003 #define X265_TYPE_BREF 0x0004 /* Non-disposable B-frame */ #define X265_TYPE_B 0x0005 +#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR) +#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF) + #define X265_QP_AUTO 0 #define X265_AQ_NONE 0 #define X265_AQ_VARIANCE 1 #define X265_AQ_AUTO_VARIANCE 2 -#define IS_X265_TYPE_I(x) ((x) == X265_TYPE_I || (x) == X265_TYPE_IDR) -#define IS_X265_TYPE_B(x) ((x) == X265_TYPE_B || (x) == X265_TYPE_BREF) /* NOTE! 
For this release only X265_CSP_I420 and X265_CSP_I444 are supported */ @@ -308,11 +313,9 @@ double elapsedEncodeTime; /* wall time since encoder was opened */ double elapsedVideoTime; /* encoded picture count / frame rate */ double bitrate; /* accBits / elapsed video time */ + uint64_t accBits; /* total bits output thus far */ uint32_t encodedPictureCount; /* number of output pictures thus far */ uint32_t totalWPFrames; /* number of uni-directional weighted frames used */ - uint64_t accBits; /* total bits output thus far */ - - /* new statistic member variables must be added below this line */ } x265_stats; /* String values accepted by x265_param_parse() (and CLI) for various parameters */ @@ -322,7 +325,8 @@ static const char * const x265_fullrange_names[] = { "limited", "full", 0 }; static const char * const x265_colorprim_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "film", "bt2020", 0 }; static const char * const x265_transfer_names[] = { "", "bt709", "undef", "", "bt470m", "bt470bg", "smpte170m", "smpte240m", "linear", "log100", - "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", 0 }; + "log316", "iec61966-2-4", "bt1361e", "iec61966-2-1", "bt2020-10", "bt2020-12", + "smpte-st-2084", "smpte-st-428", 0 }; static const char * const x265_colmatrix_names[] = { "GBR", "bt709", "undef", "", "fcc", "bt470bg", "smpte170m", "smpte240m", "YCgCo", "bt2020nc", "bt2020c", 0 }; static const char * const x265_sar_names[] = { "undef", "1:1", "12:11", "10:11", "16:11", "40:33", "24:11", "20:11", @@ -334,9 +338,9 @@ * If zones overlap, whichever comes later in the list takes precedence. 
*/ typedef struct x265_zone { - int startFrame, endFrame; /* range of frame numbers */ - int bForceQp; /* whether to use qp vs bitrate factor */ - int qp; + int startFrame, endFrame; /* range of frame numbers */ + int bForceQp; /* whether to use qp vs bitrate factor */ + int qp; float bitrateFactor; } x265_zone; @@ -348,36 +352,77 @@ * x265_param as an opaque data structure */ typedef struct x265_param { - /*== Encoder Environment ==*/ - /* x265_param_default() will auto-detect this cpu capability bitmap. it is * recommended to not change this value unless you know the cpu detection is * somehow flawed on your target hardware. The asm function tables are * process global, the first encoder configures them for all encoders */ int cpuid; - /* Enable wavefront parallel processing, greatly increases parallelism for - * less than 1% compression efficiency loss */ - int bEnableWavefront; + /*== Parallelism Features ==*/ - /* Number of threads to allocate for the process global thread pool, if no - * thread pool has yet been created. 0 implies auto-detection. By default - * x265 will try to allocate one worker thread per CPU core */ - int poolNumThreads; - - /* Number of concurrently encoded frames, 0 implies auto-detection. By - * default x265 will use a number of frame threads emperically determined to - * be optimal for your CPU core count, between 2 and 6. Using more than one - * frame thread causes motion search in the down direction to be clamped but - * otherwise encode behavior is unaffected. With CQP rate control the output - * bitstream is deterministic for all values of frameNumThreads greater than - * 1. All other forms of rate-control can be negatively impacted by - * increases to the number of frame threads because the extra concurrency - * adds uncertainty to the bitrate estimations. 
There is no limit to the - * number of frame threads you use for each encoder, but frame parallelism - * is generally limited by the the number of CU rows */ + /* Number of concurrently encoded frames between 1 and X265_MAX_FRAME_THREADS + * or 0 for auto-detection. By default x265 will use a number of frame + * threads empirically determined to be optimal for your CPU core count, + * between 2 and 6. Using more than one frame thread causes motion search + * in the down direction to be clamped but otherwise encode behavior is + * unaffected. With CQP rate control the output bitstream is deterministic + * for all values of frameNumThreads greater than 1. All other forms of + * rate-control can be negatively impacted by increases to the number of + * frame threads because the extra concurrency adds uncertainty to the + * bitrate estimations. Frame parallelism is generally limited by the + * number of CU rows + * + * When thread pools are used, each frame thread is assigned to a single + * pool and the frame thread itself is given the node affinity of its pool. + * But when no thread pools are used no node affinity is assigned. */ int frameNumThreads; + /* Comma separated list of threads per NUMA node. If "none", then no worker + * pools are created and only frame parallelism is possible. If NULL or "" + * (default) x265 will use all available threads on each NUMA node. + * + * '+' is a special value indicating all cores detected on the node + * '*' is a special value indicating all cores detected on the node and all + * remaining nodes. 
+ * '-' is a special value indicating no cores on the node, same as '0' + * + * example strings for a 4-node system: + * "" - default, unspecified, all numa nodes are used for thread pools + * "*" - same as default + * "none" - no thread pools are created, only frame parallelism possible + * "-" - same as "none" + * "10" - allocate one pool, using up to 10 cores on node 0 + * "-,+" - allocate one pool, using all cores on node 1 + * "+,-,+" - allocate two pools, using all cores on nodes 0 and 2 + * "+,-,+,-" - allocate two pools, using all cores on nodes 0 and 2 + * "-,*" - allocate three pools, using all cores on nodes 1, 2 and 3 + * "8,8,8,8" - allocate four pools with up to 8 threads in each pool + * + * The total number of threads will be determined by the number of threads + * assigned to all nodes. The worker threads will each be given affinity for + * their node, they will not be allowed to migrate between nodes, but they + * will be allowed to move between CPU cores within their node. + * + * If the three pool features: bEnableWavefront, bDistributeModeAnalysis and + * bDistributeMotionEstimation are all disabled, then numaPools is ignored + * and no thread pools are created. + * + * If "none" is specified, then all three of the thread pool features are + * implicitly disabled. + * + * Multiple thread pools will be allocated for any NUMA node with more than + * 64 logical CPU cores. But any given thread pool will always use at most + * one NUMA node. + * + * Frame encoders are distributed between the available thread pools, and + * the encoder will never generate more thread pools than frameNumThreads */ + char* numaPools; + + /* Enable wavefront parallel processing, greatly increases parallelism for + * less than 1% compression efficiency loss. Requires a thread pool, enabled + * by default */ + int bEnableWavefront; + /* Use multiple threads to measure CU mode costs. Recommended for many core * CPUs. 
On RD levels less than 5, it may not offload enough work to warrant * the overhead. It is useful with the slow preset since it has the @@ -392,11 +437,9 @@ * win, particularly in video sequences with low motion. Default disabled */ int bDistributeMotionEstimation; - /* The level of logging detail emitted by the encoder. X265_LOG_NONE to - * X265_LOG_FULL, default is X265_LOG_INFO */ - int logLevel; + /*== Logging Features ==*/ - /* Enable analysis and logging distribution of Cus encoded across various + /* Enable analysis and logging distribution of CUs encoded across various * modes during mode decision. Default disabled */ int bLogCuStats; @@ -406,6 +449,10 @@ /* Enable the measurement and reporting of SSIM. Default is disabled */ int bEnableSsim; + /* The level of logging detail emitted by the encoder. X265_LOG_NONE to + * X265_LOG_FULL, default is X265_LOG_INFO */ + int logLevel; + /* filename of CSV log. If logLevel greater than or equal to X265_LOG_FRAME, * the encoder will emit per-slice statistics to this log file in encode * order. Otherwise the encoder will emit per-stream statistics into the log @@ -413,13 +460,6 @@ * encode) */ char* csvfn; - /* Enable the generation of SEI messages for each encoded frame containing - * the hashes of the three reconstructed picture planes. Most decoders will - * validate those hashes against the reconstructed images it generates and - * report any mismatches. This is essentially a debugging feature. Hash - * types are MD5(1), CRC(2), Checksum(3). Default is 0, none */ - int decodedPictureHashSEI; - /*== Internal Picture Specification ==*/ /* Internal encoder bit depth. If x265 was compiled to use 8bit pixels @@ -427,10 +467,8 @@ * Future builds may support 12bit pixels. */ int internalBitDepth; - /* Color space of internal pictures. Only X265_CSP_I420 and X265_CSP_I444 - * are supported. 
Eventually, i422 will also be supported as an internal - * color space and other packed formats will be supported in - * x265_picture.colorSpace */ + /* Color space of internal pictures, must match color space of input + * pictures */ int internalCsp; /* Numerator and denominator of frame rate */ @@ -447,6 +485,22 @@ * minimum requirement. All valid HEVC heights are supported */ int sourceHeight; + /* Interlace type of source pictures. 0 - progressive pictures (default). + * 1 - top field first, 2 - bottom field first. HEVC encodes interlaced + * content as fields, they must be provided to the encoder in the correct + * temporal order */ + int interlaceMode; + + /* Total Number of frames to be encoded, calculated from the user input + * (--frames) and (--seek). In case, the input is read from a pipe, this can + * remain as 0. It is later used in 2 pass RateControl, hence storing the + * value in param */ + int totalFrames; + + /*== Profile / Tier / Level ==*/ + + /* Note: the profile is specified by x265_param_apply_profile() */ + /* Minimum decoder requirement level. Defaults to 0, which implies auto- * detection by the encoder. If specified, the encoder will attempt to bring * the encode specifications within that specified level. If the encoder is @@ -461,11 +515,14 @@ * Main (0) and High (1) tier. Default is Main tier (0) */ int bHighTier; - /* Interlace type of source pictures. 0 - progressive pictures (default). - * 1 - top field first, 2 - bottom field first. HEVC encodes interlaced - * content as fields, they must be provided to the encoder in the correct - * temporal order. EXPERIMENTAL */ - int interlaceMode; + /* The maximum number of L0 references a P or B slice may use. This + * influences the size of the decoded picture buffer. The higher this + * number, the more reference frames there will be available for motion + * search, improving compression efficiency of most video at a cost of + * performance. 
Value must be between 1 and 16, default is 3 */ + int maxNumReferences; + + /*== Bitstream Options ==*/ /* Flag indicating whether VPS, SPS and PPS headers should be output with * each keyframe. Default false */ @@ -476,7 +533,7 @@ int bEnableAccessUnitDelimiters; /* Enables the buffering period SEI and picture timing SEI to signal the HRD - * parameteres. Default is disabled */ + * parameters. Default is disabled */ int bEmitHRDSEI; /* Enables the emission of a user data SEI with the stream headers which @@ -485,37 +542,30 @@ * Default enabled */ int bEmitInfoSEI; - /*== Coding Unit (CU) definitions ==*/ - - /* Maxiumum CU width and height in pixels. The size must be 64, 32, or 16. - * The higher the size, the more efficiently x265 can encode areas of low - * complexity, greatly improving compression efficiency at large - * resolutions. The smaller the size, the more effective wavefront and - * frame parallelism will become because of the increase in rows. default 64 */ - uint32_t maxCUSize; - - /* The additional depth the residual quadtree is allowed to recurse beyond - * the coding quadtree, for inter coded blocks. This must be between 1 and - * 4. The higher the value the more efficiently the residual can be - * compressed by the DCT transforms, at the expense of much more compute */ - uint32_t tuQTMaxInterDepth; + /* Enable the generation of SEI messages for each encoded frame containing + * the hashes of the three reconstructed picture planes. Most decoders will + * validate those hashes against the reconstructed images it generates and + * report any mismatches. This is essentially a debugging feature. Hash + * types are MD5(1), CRC(2), Checksum(3). Default is 0, none */ + int decodedPictureHashSEI; - /* The additional depth the residual quadtree is allowed to recurse beyond - * the coding quadtree, for intra coded blocks. This must be between 1 and - * 4. 
The higher the value the more efficiently the residual can be - * compressed by the DCT transforms, at the expense of much more compute */ - uint32_t tuQTMaxIntraDepth; + /* Enable Temporal Sub Layers while encoding, signals NAL units of coded + * slices with their temporalId. Output bitstreams can be extracted either + * at the base temporal layer (layer 0) with roughly half the frame rate or + * at a higher temporal layer (layer 1) that decodes all the frames in the + * sequence. */ + int bEnableTemporalSubLayers; - /*== GOP Structure and Lokoahead ==*/ + /*== GOP structure and slice type decisions (lookahead) ==*/ - /* Enable open GOP - meaning I slices are not necessariy IDR and thus frames + /* Enable open GOP - meaning I slices are not necessarily IDR and thus frames * encoded after an I slice may reference frames encoded prior to the I * frame which have remained in the decoded picture buffer. Open GOP - * generally has better compression efficiency and negligable encoder + * generally has better compression efficiency and negligible encoder * performance impact, but the use case may preclude it. Default true */ int bOpenGOP; - /* Scenecuts closer together than this are coded as I, not IDR. */ + /* Scene cuts closer together than this are coded as I, not IDR. */ int keyframeMin; /* Maximum keyframe distance or intra period in number of frames. If 0 or 1, @@ -523,35 +573,22 @@ * which effectively makes frame 0 the only I frame. Default is 250 */ int keyframeMax; - /* The maximum number of L0 references a P or B slice may use. This - * influences the size of the decoded picture buffer. The higher this - * number, the more reference frames there will be available for motion - * search, improving compression efficiency of most video at a cost of - * performance. Value must be between 1 and 16, default is 3 */ - int maxNumReferences; - - /* Sets the operating mode of the lookahead. 
With b-adapt 0, the GOP - * structure is fixed based on the values of keyframeMax and bframes. - * With b-adapt 1 a light lookahead is used to chose B frame placement. - * With b-adapt 2 (trellis) a viterbi B path selection is performed */ - int bFrameAdaptive; - - /* Maximum consecutive B frames that can be emitted by the lookehead. When + /* Maximum consecutive B frames that can be emitted by the lookahead. When * b-adapt is 0 and keyframMax is greater than bframes, the lookahead emits * a fixed pattern of `bframes` B frames between each P. With b-adapt 1 the * lookahead ignores the value of bframes for the most part. With b-adapt 2 - * the value of bframes determines the search (POC) distance performeed in - * both directions, quadradically increasing the compute load of the + * the value of bframes determines the search (POC) distance performed in + * both directions, quadratically increasing the compute load of the * lookahead. The higher the value, the more B frames the lookahead may * possibly use consecutively, usually improving compression. Default is 3, * maximum is 16 */ int bframes; - /* Total Number of frames to be encoded, caclulated from the user input - * (--frames) and (--seek). In case, the input is read from a pipe, this can - * remain as 0. It is later used in 2 pass RateControl, hence storing the - * value in param */ - int totalFrames; + /* Sets the operating mode of the lookahead. With b-adapt 0, the GOP + * structure is fixed based on the values of keyframeMax and bframes. + * With b-adapt 1 a light lookahead is used to chose B frame placement. + * With b-adapt 2 (trellis) a viterbi B path selection is performed */ + int bFrameAdaptive; /* When enabled, the encoder will use the B frame in the middle of each * mini-GOP larger than 2 B frames as a motion reference for the surrounding @@ -560,30 +597,128 @@ * frame by rate control. Default is enabled. 
*/ int bBPyramid; + /* A value which is added to the cost estimate of B frames in the lookahead. + * It may be a positive value (making B frames appear more expensive, which + * causes the lookahead to choose more P frames) or negative, which makes the + * lookahead choose more B frames. Default is 0, there are no limits */ + int bFrameBias; + /* The number of frames that must be queued in the lookahead before it may * make slice decisions. Increasing this value directly increases the encode * latency. The longer the queue the more optimally the lookahead may make - * slice decisions, particularly with b-adapt 2. When mb-tree is enabled, + * slice decisions, particularly with b-adapt 2. When cu-tree is enabled, * the length of the queue linearly increases the effectiveness of the - * mb-tree analysis. Default is 40 frames, maximum is 250 */ + * cu-tree analysis. Default is 40 frames, maximum is 250 */ int lookaheadDepth; - /* A value which is added to the cost estimate of B frames in the lookahead. - * It may be a positive value (making B frames appear more expensive, which - * causes the lookahead to chose more P frames) or negative, which makes the - * lookahead chose more B frames. Default is 0, there are no limits */ - int bFrameBias; + /* Use multiple worker threads to measure the estimated cost of each frame + * within the lookahead. When bFrameAdaptive is 2, most frame cost estimates + * will be performed in batch mode, many cost estimates at the same time, + * and lookaheadSlices is ignored for batched estimates. The effect on + * performance can be quite small. The higher this parameter, the less + * accurate the frame costs will be (since context is lost across slice + * boundaries) which will result in less accurate B-frame and scene-cut + * decisions. Default is 0 - disabled. 1 is the same as 0. 
Max 16 */ + int lookaheadSlices; - /* An arbitrary threshold which determines how agressively the lookahead + /* An arbitrary threshold which determines how aggressively the lookahead * should detect scene cuts. The default (40) is recommended. */ int scenecutThreshold; + /*== Coding Unit (CU) definitions ==*/ + + /* Maximum CU width and height in pixels. The size must be 64, 32, or 16. + * The higher the size, the more efficiently x265 can encode areas of low + * complexity, greatly improving compression efficiency at large + * resolutions. The smaller the size, the more effective wavefront and + * frame parallelism will become because of the increase in rows. default 64 + * All encoders within the same process must use the same maxCUSize, until + * all encoders are closed and x265_cleanup() is called to reset the value. */ + uint32_t maxCUSize; + + /* Minimum CU width and height in pixels. The size must be 64, 32, 16, or + * 8. Default 8. All encoders within the same process must use the same + * minCUSize. */ + uint32_t minCUSize; + + /* Enable rectangular motion prediction partitions (vertical and + * horizontal), available at all CU depths from 64x64 to 8x8. Default is + * disabled */ + int bEnableRectInter; + + /* Enable asymmetrical motion predictions. At CU depths 64, 32, and 16, it + * is possible to use 25%/75% split partitions in the up, down, right, left + * directions. For some material this can improve compression efficiency at + * the cost of extra analysis. bEnableRectInter must be enabled for this + * feature to be used. Default disabled */ + int bEnableAMP; + + /*== Residual Quadtree Transform Unit (TU) definitions ==*/ + + /* Maximum TU width and height in pixels. The size must be 32, 16, 8 or 4. 
+ * The larger the size the more efficiently the residual can be compressed + * by the DCT transforms, at the expense of more computation */ + uint32_t maxTUSize; + + /* The additional depth the residual quad-tree is allowed to recurse beyond + * the coding quad-tree, for inter coded blocks. This must be between 1 and + * 4. The higher the value the more efficiently the residual can be + * compressed by the DCT transforms, at the expense of much more compute */ + uint32_t tuQTMaxInterDepth; + + /* The additional depth the residual quad-tree is allowed to recurse beyond + * the coding quad-tree, for intra coded blocks. This must be between 1 and + * 4. The higher the value the more efficiently the residual can be + * compressed by the DCT transforms, at the expense of much more compute */ + uint32_t tuQTMaxIntraDepth; + + /* Set the amount of rate-distortion analysis to use within quant. 0 implies + * no rate-distortion optimization. At level 1 rate-distortion cost is used to + * find optimal rounding values for each level (and allows psy-rdoq to be + * enabled). At level 2 rate-distortion cost is used to make decimate decisions + * on each 4x4 coding group (including the cost of signaling the group within + * the group bitmap). Psy-rdoq is less effective at preserving energy when + * RDOQ is at level 2 */ + int rdoqLevel; + + /* Enable the implicit signaling of the sign bit of the last coefficient of + * each transform unit. This saves one bit per TU at the expense of figuring + * out which coefficient can be toggled with the least distortion. + * Default is enabled */ + int bEnableSignHiding; + + /* Allow intra coded blocks to be encoded directly as residual without the + * DCT transform, when this improves efficiency. Checking whether the block + * will benefit from this option incurs a performance penalty. 
Default is + * disabled */ + int bEnableTransformSkip; + + /* An integer value in range of 0 to 2000, which denotes strength of noise + * reduction in intra CUs. 0 means disabled */ + int noiseReductionIntra; + + /* An integer value in range of 0 to 2000, which denotes strength of noise + * reduction in inter CUs. 0 means disabled */ + int noiseReductionInter; + + /* Quantization scaling lists. HEVC supports 6 quantization scaling lists to + * be defined; one each for Y, Cb, Cr for intra prediction and one each for + * inter prediction. + * + * - NULL and "off" will disable quant scaling (default) + * - "default" will enable the HEVC default scaling lists, which + * do not need to be signaled since they are specified + * - all other strings indicate a filename containing custom scaling lists + * in the HM format. The encode will fail if the file is not parsed + * correctly. Custom lists must be signaled in the SPS. */ + const char *scalingLists; + /*== Intra Coding Tools ==*/ /* Enable constrained intra prediction. This causes intra prediction to * input samples that were inter predicted. For some use cases this is * believed to me more robust to stream errors, but it has a compression - * penalty on P and (particularly) B slices. Defaults to diabled */ + * penalty on P and (particularly) B slices. Defaults to disabled */ int bEnableConstrainedIntra; /* Enable strong intra smoothing for 32x32 blocks where the reference @@ -591,22 +726,35 @@ * depending on your source material. Defaults to disabled */ int bEnableStrongIntraSmoothing; - /* Use a faster search method to find the best intra mode. Default is 0 */ - int bEnableFastIntra; - /*== Inter Coding Tools ==*/ + /* The maximum number of merge candidates that are considered during inter + * analysis. This number (between 1 and 5) is signaled in the stream + * headers and determines the number of bits required to signal a merge so + * it can have significant trade-offs. 
The smaller this number the higher + * the performance but the less compression efficiency. Default is 3 */ + uint32_t maxNumMergeCand; + + /* Limit the motion references used for each search based on the results of + * previous motion searches already performed for the same CU: If 0 all + * references are always searched. If X265_REF_LIMIT_CU all motion searches + * will restrict themselves to the references selected by the 2Nx2N search + * at the same depth. If X265_REF_LIMIT_DEPTH the 2Nx2N motion search will + * only use references that were selected by the best motion searches of the + * 4 split CUs at the next lower CU depth. The two flags may be combined */ + uint32_t limitReferences; + /* ME search method (DIA, HEX, UMH, STAR, FULL). The search patterns * (methods) are sorted in increasing complexity, with diamond being the * simplest and fastest and full being the slowest. DIA, HEX, and UMH were * adapted from x264 directly. STAR is an adaption of the HEVC reference * encoder's three step search, while full is a naive exhaustive search. The * default is the star search, it has a good balance of performance and - * compression efficiecy */ + * compression efficiency */ int searchMethod; /* A value between 0 and X265_MAX_SUBPEL_LEVEL which adjusts the amount of - * effort performed during subpel refine. Default is 5 */ + * effort performed during sub-pel refine. Default is 5 */ int subpelRefine; /* The maximum distance from the motion prediction that the full pel motion @@ -619,14 +767,7 @@ * smaller CU size is used, the search range should be similarly reduced */ int searchRange; - /* The maximum number of merge candidates that are considered during inter - * analysis. This number (between 1 and 5) is signaled in the stream - * headers and determines the number of bits required to signal a merge so - * it can have significant trade-offs. The smaller this number the higher - * the performance but the less compression efficiency. 
Default is 3 */ - uint32_t maxNumMergeCand; - - /* Disable availability of temporal motion vector for AMVP */ + /* Enable availability of temporal motion vector for AMVP, default is enabled */ int bEnableTemporalMvp; /* Enable weighted prediction in P slices. This enables weighting analysis @@ -640,29 +781,69 @@ /* Enable weighted prediction in B slices. Default is disabled */ int bEnableWeightedBiPred; - /*== Analysis tools ==*/ + /*== Loop Filters ==*/ - /* Enable asymmetrical motion predictions. At CU depths 64, 32, and 16, it - * is possible to use 25%/75% split partitions in the up, down, right, left - * directions. For some material this can improve compression efficiency at - * the cost of extra analysis. bEnableRectInter must be enabled for this - * feature to be used. Default enabled */ - int bEnableAMP; + /* Enable the deblocking loop filter, which improves visual quality by + * reducing blocking effects at block edges, particularly at lower bitrates + * or higher QP. When enabled it adds another CU row of reference lag, + * reducing frame parallelism effectiveness. Default is enabled */ + int bEnableLoopFilter; - /* Enable rectangular motion prediction partitions (vertical and - * horizontal), available at all CU depths from 64x64 to 8x8. Default is - * enabled */ - int bEnableRectInter; + /* deblocking filter tC offset [-6, 6] -6 light filter, 6 strong. + * This is the coded div2 value, actual offset is doubled at use */ + int deblockingFilterTCOffset; - /* Enable the use of `coded block flags` (flags set to true when a residual - * has been coded for a given block) to avoid intra analysis in likely skip - * blocks. Only applicable in RD levels 5 and 6. 
Default is disabled */ - int bEnableCbfFastMode; + /* deblocking filter Beta offset [-6, 6] -6 light filter, 6 strong + * This is the coded div2 value, actual offset is doubled at use */ + int deblockingFilterBetaOffset; + + /* Enable the Sample Adaptive Offset loop filter, which reduces distortion + * effects by adjusting reconstructed sample values based on histogram + * analysis to better approximate the original samples. When enabled it adds + * a CU row of reference lag, reducing frame parallelism effectiveness. + * Default is enabled */ + int bEnableSAO; + + /* Note: when deblocking and SAO are both enabled, the loop filter CU lag is + * only one row, as they operate in series on the same row. */ + + /* Select the method in which SAO deals with deblocking boundary pixels. If + * disabled the right and bottom boundary areas are skipped. If enabled, + * non-deblocked pixels are used entirely. Default is disabled */ + int bSaoNonDeblocked; + + /*== Analysis tools ==*/ + + /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines + * the level of rate distortion optimizations to perform during mode + * decisions and quantization. The more RDO the better the compression + * efficiency at a major cost of performance. Default is no RDO (0) */ + int rdLevel; /* Enable early skip decisions to avoid intra and inter analysis in likely * skip blocks. Default is disabled */ int bEnableEarlySkip; + /* Use a faster search method to find the best intra mode. Default is 0 */ + int bEnableFastIntra; + + /* Enable a faster determination of whether skipping the DCT transform will + * be beneficial. Slight performance gain for some compression loss. Default + * is enabled */ + int bEnableTSkipFast; + + /* The CU Lossless flag, when enabled, compares the rate-distortion costs + * for normal and lossless encoding, and chooses the best mode for each CU. 
+ * If lossless mode is chosen, the cu-transquant-bypass flag is set for that + * CU */ + int bCULossless; + + /* Specify whether to attempt to encode intra modes in B frames. By default + * enabled, but only applicable for the presets which use rdLevel 5 or 6 + * (veryslow and placebo). All other presets will not try intra in B frames + * regardless of this setting */ + int bIntraInBFrames; + /* Apply an optional penalty to the estimated cost of 32x32 intra blocks in * non-intra slices. 0 is disabled, 1 enables a small penalty, and 2 enables * a full penalty. This favors inter-coding and its low bitrate over @@ -670,30 +851,12 @@ * Default is 0 */ int rdPenalty; - /* A value between X265_NO_RDO_NO_RDOQ and X265_RDO_LEVEL which determines - * the level of rate distortion optimizations to perform during mode - * decisions and quantization. The more RDO the better the compression - * efficiency at a major cost of performance. Default is no RDO (0) */ - int rdLevel; - /* Psycho-visual rate-distortion strength. Only has an effect in presets * which use RDO. It makes mode decision favor options which preserve the * energy of the source, at the cost of lost compression. The value must - * be between 0 and 2.0, 1.0 is typical. Default 1.0 */ + * be between 0 and 2.0, 1.0 is typical. Default 0.3 */ double psyRd; - /* Quantization scaling lists. HEVC supports 6 quantization scaling lists to - * be defined; one each for Y, Cb, Cr for intra prediction and one each for - * inter prediction. - * - * - NULL and "off" will disable quant scaling (default) - * - "default" will enable the HEVC default scaling lists, which - * do not need to be signaled since they are specified - * - all other strings indicate a filename containing custom scaling lists - * in the HM format. The encode will fail if the file is not parsed - * correctly. Custom lists must be signaled in the SPS. */ - const char *scalingLists; - /* Strength of psycho-visual optimizations in quantization. 
Only has an * effect in presets which use RDOQ (rd-levels 4 and 5). The value must be * between 0 and 50, 1.0 is typical. Default 1.0 */ @@ -708,52 +871,12 @@ /* Filename for analysisMode save/load. Default name is "x265_analysis.dat" */ char* analysisFileName; - /*== Coding tools ==*/ - /* Enable the implicit signaling of the sign bit of the last coefficient of - * each transform unit. This saves one bit per TU at the expense of figuring - * out which coefficient can be toggled with the least distortion. - * Default is enabled */ - int bEnableSignHiding; - - /* Allow intra coded blocks to be encoded directly as residual without the - * DCT transform, when this improves efficiency. Checking whether the block - * will benefit from this option incurs a performance penalty. Default is - * enabled */ - int bEnableTransformSkip; - - /* Enable a faster determination of whether skippig the DCT transform will - * be beneficial. Slight performance gain for some compression loss. Default - * is enabled */ - int bEnableTSkipFast; - - /* Enable the deblocking loop filter, which improves visual quality by - * reducing blocking effects at block edges, particularly at lower bitrates - * or higher QP. When enabled it adds another CU row of reference lag, - * reducing frame parallelism effectiveness. Default is enabled */ - int bEnableLoopFilter; - - /* deblocking filter tC offset [-6, 6] -6 light filter, 6 strong. - * This is the coded div2 value, actual offset is doubled at use */ - int deblockingFilterTCOffset; - - /* deblocking filter Beta offset [-6, 6] -6 light filter, 6 strong - * This is the coded div2 value, actual offset is doubled at use */ - int deblockingFilterBetaOffset; - - /* Enable the Sample Adaptive Offset loop filter, which reduces distortion - * effects by adjusting reconstructed sample values based on histogram - * analysis to better approximate the original samples. When enabled it adds - * a CU row of reference lag, reducing frame parallelism effectiveness. 
- * Default is enabled */ - int bEnableSAO; - - /* Note: when deblocking and SAO are both enabled, the loop filter CU lag is - * only one row, as they operate in series on the same row. */ + /*== Rate Control ==*/ - /* Select the method in which SAO deals with deblocking boundary pixels. If - * disabled the right and bottom boundary areas are skipped. If enabled, - * non-deblocked pixels are used entirely. Default is disabled */ - int bSaoNonDeblocked; + /* The lossless flag enables true lossless coding, bypassing scaling, + * transform, quantization and in-loop filter processes. This is used for + * ultra-high bitrates with zero loss of quality. It implies no rate control */ + int bLossless; /* Generally a small signed integer which offsets the QP used to quantize * the Cb chroma residual (delta from luma QP specified by rate-control). @@ -765,33 +888,6 @@ * Default is 0, which is recommended */ int crQpOffset; - /* Specify whether to attempt to encode intra modes in B frames. By default - * enabled, but only applicable for the presets which use rdLevel 5 or 6 - * (veryslow and placebo). All other presets will not try intra in B frames - * regardless of this setting. */ - int bIntraInBFrames; - - /* An integer value in range of 0 to 2000, which denotes strength of noise - * reduction in intra CUs. 0 means disabled */ - int noiseReductionIntra; - - /* An integer value in range of 0 to 2000, which denotes strength of noise - * reduction in inter CUs. 0 means disabled */ - int noiseReductionInter; - - /* The lossless flag enables true lossless coding, by bypassing scaling, - * transform, quantization and in-loop filter processes. This is used for - * ultra-high bitrates with zero loss of quality. */ - int bLossless; - - /* The CU Lossless flag, when enabled, compares the rate-distortion costs - * for normal and lossless encoding, and chooses the best mode for each CU. - * If lossless mode is chosen, the cu-transquant-bypass flag is set for that - * CU. 
*/ - int bCULossless; - - /*== Rate Control ==*/ - struct { /* Explicit mode of rate-control, necessary for API users. It must @@ -817,13 +913,13 @@ double ipFactor; double pbFactor; - /* Max QP difference between frames. Default: 4 */ - int qpStep; - /* Ratefactor constant: targets a certain constant "quality". * Acceptable values between 0 and 51. Default value: 28 */ double rfConstant; + /* Max QP difference between frames. Default: 4 */ + int qpStep; + /* Enable adaptive quantization. This mode distributes available bits between all * CTUs of a frame, assigning more bits to low complexity areas. Turning * this ON will usually affect PSNR negatively, however SSIM and visual quality @@ -846,7 +942,7 @@ * interpreted as the initial fill in kbits. Default is 0.9 */ double vbvBufferInit; - /* Enable CUTree ratecontrol. This keeps track of the CUs that propagate temporally + /* Enable CUTree rate-control. This keeps track of the CUs that propagate temporally * across frames and assigns more bits to these CUs. Improves encode efficiency. * Default: enabled */ int cuTree; @@ -858,7 +954,7 @@ double rfConstantMin; /* Multi-pass encoding */ - /* Enable writing the stats in a multipass encode to the stat output file */ + /* Enable writing the stats in a multi-pass encode to the stat output file */ int bStatWrite; /* Enable loading data from the stat input file in a multi pass encode */ @@ -877,7 +973,7 @@ /* Enable slow and a more detailed first pass encode in multi pass rate control */ int bEnableSlowFirstPass; - /* ratecontrol overrides */ + /* rate-control overrides */ int zoneCount; x265_zone* zones; @@ -890,7 +986,7 @@ const char* lambdaFileName; /* Enable stricter conditions to check bitrate deviations in CBR mode. 
May compromise - quality to maintain bitrate adherence */ + * quality to maintain bitrate adherence */ int bStrictCbr; } rc; @@ -990,12 +1086,6 @@ } vui; } x265_param; -/*** - * If not called, first encoder allocated will auto-detect the CPU and - * initialize performance primitives, which are process global. - * DEPRECATED: use x265_param.cpuid to specify CPU */ -void x265_setup_primitives(x265_param *param, int cpu); - /* x265_param_alloc: * Allocates an x265_param instance. The returned param structure is not * special in any way, but using this method together with x265_param_free() @@ -1008,9 +1098,8 @@ * allocated by x265_param_alloc() */ void x265_param_free(x265_param *); -/*** - * Initialize an x265_param structure to default values - */ +/* x265_param_default: + * Initialize an x265_param structure to default values */ void x265_param_default(x265_param *param); /* x265_param_parse: @@ -1065,11 +1154,11 @@ * Use x265_picture_free() to release storage for an x265_picture instance * allocated by x265_picture_alloc() */ void x265_picture_free(x265_picture *); -/*** - * Initialize an x265_picture structure to default values. It sets the pixel - * depth and color space to the encoder's internal values and sets the slice - * type to auto - so the lookahead will determine slice type. - */ + +/* x265_picture_init: + * Initialize an x265_picture structure to default values. It sets the pixel + * depth and color space to the encoder's internal values and sets the slice + * type to auto - so the lookahead will determine slice type. 
*/ void x265_picture_init(x265_param *param, x265_picture *pic); /* x265_max_bit_depth: @@ -1139,11 +1228,57 @@ * close an encoder handler */ void x265_encoder_close(x265_encoder *); -/*** - * Release library static allocations - */ +/* x265_cleanup: + * release library static allocations, reset configured CTU size */ void x265_cleanup(void); + +/* === Multi-lib API === + * By using this method to gain access to the libx265 interfaces, you allow shim + * implementations of x265_api_get() to choose between various available libx265 + * libraries based on the encoder parameters. The most likely use case is to + * choose between 8bpp and 16bpp builds of libx265. */ + +typedef struct x265_api +{ + /* libx265 public API functions, documented above with x265_ prefixes */ + x265_param* (*param_alloc)(void); + void (*param_free)(x265_param*); + void (*param_default)(x265_param*); + int (*param_parse)(x265_param*, const char*, const char*); + int (*param_apply_profile)(x265_param*, const char*); + int (*param_default_preset)(x265_param*, const char*, const char *); + x265_picture* (*picture_alloc)(void); + void (*picture_free)(x265_picture*); + void (*picture_init)(x265_param*, x265_picture*); + x265_encoder* (*encoder_open)(x265_param*); + void (*encoder_parameters)(x265_encoder*, x265_param*); + int (*encoder_headers)(x265_encoder*, x265_nal**, uint32_t*); + int (*encoder_encode)(x265_encoder*, x265_nal**, uint32_t*, x265_picture*, x265_picture*); + void (*encoder_get_stats)(x265_encoder*, x265_stats*, uint32_t); + void (*encoder_log)(x265_encoder*, int, char**); + void (*encoder_close)(x265_encoder*); + void (*cleanup)(void); + const char* version_str; + const char* build_info_str; + int max_bit_depth; +} x265_api; + +/* Force a link error in the case of linking against an incompatible API version. + * Glue #defines exist to force correct macro expansion; the final output of the macro + * is x265_api_get_##X265_BUILD (for purposes of dlopen). 
*/ +#define x265_api_glue1(x, y) x ## y +#define x265_api_glue2(x, y) x265_api_glue1(x, y) +#define x265_api_get x265_api_glue2(x265_api_get_, X265_BUILD) + +/* x265_api_get: + * Retrieve the programming interface for a linked x265 library. + * May return NULL if no library is available that supports the + * requested bit depth. If bitDepth is 0 the function is guaranteed + * to return a non-NULL x265_api pointer, from the system default + * libx265 */ +const x265_api* x265_api_get(int bitDepth); + #ifdef __cplusplus } #endif