diff -Nru x264-0.148.2795+gitaaa9aa8/common/aarch64/pixel-a.S x264-0.152.2854+gite9a5903/common/aarch64/pixel-a.S --- x264-0.148.2795+gitaaa9aa8/common/aarch64/pixel-a.S 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/aarch64/pixel-a.S 2017-12-31 12:50:51.000000000 +0000 @@ -569,57 +569,65 @@ .macro pixel_var2_8 h function x264_pixel_var2_8x\h\()_neon, export=1 - ld1 {v16.8b}, [x0], x1 - ld1 {v18.8b}, [x2], x3 - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x2], x3 - mov x5, \h - 4 - usubl v6.8h, v16.8b, v18.8b - usubl v7.8h, v17.8b, v19.8b - ld1 {v16.8b}, [x0], x1 - ld1 {v18.8b}, [x2], x3 - smull v2.4s, v6.4h, v6.4h - smull2 v3.4s, v6.8h, v6.8h - add v0.8h, v6.8h, v7.8h - smlal v2.4s, v7.4h, v7.4h - smlal2 v3.4s, v7.8h, v7.8h + mov x3, #16 + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 + mov x5, \h - 2 + usubl v0.8h, v16.8b, v18.8b + usubl v1.8h, v17.8b, v19.8b + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + smull v2.4s, v0.4h, v0.4h + smull2 v3.4s, v0.8h, v0.8h + smull v4.4s, v1.4h, v1.4h + smull2 v5.4s, v1.8h, v1.8h usubl v6.8h, v16.8b, v18.8b -1: subs x5, x5, #2 - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x2], x3 +1: subs x5, x5, #1 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h usubl v7.8h, v17.8b, v19.8b add v0.8h, v0.8h, v6.8h - ld1 {v16.8b}, [x0], x1 - ld1 {v18.8b}, [x2], x3 - smlal v2.4s, v7.4h, v7.4h - smlal2 v3.4s, v7.8h, v7.8h + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + smlal v4.4s, v7.4h, v7.4h + smlal2 v5.4s, v7.8h, v7.8h usubl v6.8h, v16.8b, v18.8b - add v0.8h, v0.8h, v7.8h + add v1.8h, v1.8h, v7.8h b.gt 1b - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x2], x3 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h usubl v7.8h, v17.8b, v19.8b add v0.8h, v0.8h, v6.8h - smlal v2.4s, v7.4h, v7.4h - add v0.8h, v0.8h, v7.8h - smlal2 v3.4s, v7.8h, v7.8h + smlal v4.4s, v7.4h, v7.4h + add v1.8h, v1.8h, v7.8h + smlal2 v5.4s, v7.8h, v7.8h saddlv s0, v0.8h + saddlv s1, v1.8h add v2.4s, v2.4s, v3.4s + add v4.4s, v4.4s, v5.4s mov w0, v0.s[0] - addv s1, v2.4s - sxtw x0, w0 mov w1, v1.s[0] - mul x0, x0, x0 - str w1, [x4] - sub x0, x1, x0, lsr # 6 + (\h >> 4) + addv s2, v2.4s + addv s4, v4.4s + mul w0, w0, w0 + mul w1, w1, w1 + mov w3, v2.s[0] + mov w4, v4.s[0] + sub w0, w3, w0, lsr # 6 + (\h >> 4) + sub w1, w4, w1, lsr # 6 + (\h >> 4) + str w3, [x2] + add w0, w0, w1 + str w4, [x2, #4] ret endfunc diff -Nru x264-0.148.2795+gitaaa9aa8/common/aarch64/pixel.h x264-0.152.2854+gite9a5903/common/aarch64/pixel.h --- x264-0.148.2795+gitaaa9aa8/common/aarch64/pixel.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/aarch64/pixel.h 2017-12-31 12:50:51.000000000 +0000 @@ -61,8 +61,8 @@ uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); -int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); diff -Nru x264-0.148.2795+gitaaa9aa8/common/arm/pixel-a.S x264-0.152.2854+gite9a5903/common/arm/pixel-a.S --- 
x264-0.148.2795+gitaaa9aa8/common/arm/pixel-a.S 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/arm/pixel-a.S 2017-12-31 12:50:51.000000000 +0000 @@ -719,13 +719,24 @@ bx lr endfunc -.macro DIFF_SUM diff da db lastdiff - vld1.64 {\da}, [r0,:64], r1 - vld1.64 {\db}, [r2,:64], r3 -.ifnb \lastdiff - vadd.s16 q0, q0, \lastdiff +.macro DIFF_SUM diff1 diff2 da1 db1 da2 db2 lastdiff1 lastdiff2 acc1 acc2 + vld1.64 {\da1}, [r0,:64]! + vld1.64 {\db1}, [r1,:64], r3 +.ifnb \lastdiff1 + vadd.s16 \acc1, \acc1, \lastdiff1 + vadd.s16 \acc2, \acc2, \lastdiff2 .endif - vsubl.u8 \diff, \da, \db + vld1.64 {\da2}, [r0,:64]! + vld1.64 {\db2}, [r1,:64], r3 + vsubl.u8 \diff1, \da1, \db1 + vsubl.u8 \diff2, \da2, \db2 +.endm + +.macro SQR_ACC_DOUBLE acc1 acc2 d0 d1 d2 d3 vmlal=vmlal.s16 + \vmlal \acc1, \d0, \d0 + vmlal.s16 \acc1, \d1, \d1 + \vmlal \acc2, \d2, \d2 + vmlal.s16 \acc2, \d3, \d3 .endm .macro SQR_ACC acc d0 d1 vmlal=vmlal.s16 @@ -734,77 +745,89 @@ .endm function x264_pixel_var2_8x8_neon - DIFF_SUM q0, d0, d1 - DIFF_SUM q8, d16, d17 - SQR_ACC q1, d0, d1, vmull.s16 - DIFF_SUM q9, d18, d19, q8 - SQR_ACC q2, d16, d17, vmull.s16 + mov r3, #16 + DIFF_SUM q0, q10, d0, d1, d20, d21 + DIFF_SUM q8, q11, d16, d17, d22, d23 + SQR_ACC_DOUBLE q1, q13, d0, d1, d20, d21, vmull.s16 + DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10 + SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23, vmull.s16 .rept 2 - DIFF_SUM q8, d16, d17, q9 - SQR_ACC q1, d18, d19 - DIFF_SUM q9, d18, d19, q8 - SQR_ACC q2, d16, d17 + DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10 + SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25 + DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10 + SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23 .endr - DIFF_SUM q8, d16, d17, q9 - SQR_ACC q1, d18, d19 + DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10 + SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25 vadd.s16 q0, q0, q8 - SQR_ACC q2, d16, d17 + vadd.s16 q10, q10, q11 + SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23 - ldr ip, [sp] vadd.s16 d0, d0, d1 + vadd.s16 d20, d20, d21 vadd.s32 q1, q1, q2 + vadd.s32 q13, q13, q14 vpaddl.s16 d0, d0 + vpaddl.s16 d20, d20 vadd.s32 d1, d2, d3 - vpadd.s32 d0, d0, d1 + vadd.s32 d26, d26, d27 + vpadd.s32 d0, d0, d20 @ sum + vpadd.s32 d1, d1, d26 @ sqr + vmul.s32 d0, d0, d0 @ sum*sum + vshr.s32 d0, d0, #6 + vsub.s32 d0, d1, d0 + vpadd.s32 d0, d0, d0 vmov r0, r1, d0 - vst1.32 {d0[1]}, [ip,:32] - mul r0, r0, r0 - sub r0, r1, r0, lsr #6 + vst1.32 {d1}, [r2,:64] bx lr endfunc function x264_pixel_var2_8x16_neon - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d17}, [r2,:64], r3 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d19}, [r2,:64], r3 - vsubl.u8 q10, d16, d17 - vsubl.u8 q11, d18, d19 - SQR_ACC q1, d20, d21, vmull.s16 - vld1.64 {d16}, [r0,:64], r1 - vadd.s16 q0, q10, q11 - vld1.64 {d17}, [r2,:64], r3 - SQR_ACC q2, d22, d23, vmull.s16 - mov ip, #14 -1: subs ip, ip, #2 - vld1.64 {d18}, [r0,:64], r1 + mov r3, #16 + vld1.64 {d16}, [r0,:64]! + vld1.64 {d17}, [r1,:64], r3 + vld1.64 {d18}, [r0,:64]! + vld1.64 {d19}, [r1,:64], r3 + vsubl.u8 q0, d16, d17 + vsubl.u8 q3, d18, d19 + SQR_ACC q1, d0, d1, vmull.s16 + vld1.64 {d16}, [r0,:64]! + mov ip, #15 + vld1.64 {d17}, [r1,:64], r3 + SQR_ACC q2, d6, d7, vmull.s16 +1: subs ip, ip, #1 + vld1.64 {d18}, [r0,:64]! vsubl.u8 q10, d16, d17 - vld1.64 {d19}, [r2,:64], r3 + vld1.64 {d19}, [r1,:64], r3 vadd.s16 q0, q0, q10 SQR_ACC q1, d20, d21 vsubl.u8 q11, d18, d19 beq 2f - vld1.64 {d16}, [r0,:64], r1 - vadd.s16 q0, q0, q11 - vld1.64 {d17}, [r2,:64], r3 + vld1.64 {d16}, [r0,:64]! 
+ vadd.s16 q3, q3, q11 + vld1.64 {d17}, [r1,:64], r3 SQR_ACC q2, d22, d23 b 1b 2: - vadd.s16 q0, q0, q11 + vadd.s16 q3, q3, q11 SQR_ACC q2, d22, d23 - ldr ip, [sp] vadd.s16 d0, d0, d1 - vadd.s32 q1, q1, q2 + vadd.s16 d6, d6, d7 vpaddl.s16 d0, d0 - vadd.s32 d1, d2, d3 - vpadd.s32 d0, d0, d1 + vpaddl.s16 d6, d6 + vadd.s32 d2, d2, d3 + vadd.s32 d4, d4, d5 + vpadd.s32 d0, d0, d6 @ sum + vpadd.s32 d2, d2, d4 @ sqr + vmul.s32 d0, d0, d0 @ sum*sum + vshr.s32 d0, d0, #7 + vsub.s32 d0, d2, d0 + vpadd.s32 d0, d0, d0 vmov r0, r1, d0 - vst1.32 {d0[1]}, [ip,:32] - mul r0, r0, r0 - sub r0, r1, r0, lsr #7 + vst1.32 {d2}, [r2,:64] bx lr endfunc diff -Nru x264-0.148.2795+gitaaa9aa8/common/arm/pixel.h x264-0.152.2854+gite9a5903/common/arm/pixel.h --- x264-0.148.2795+gitaaa9aa8/common/arm/pixel.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/arm/pixel.h 2017-12-31 12:50:51.000000000 +0000 @@ -63,8 +63,8 @@ uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); -int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); diff -Nru x264-0.148.2795+gitaaa9aa8/common/bitstream.c x264-0.152.2854+gite9a5903/common/bitstream.c --- x264-0.148.2795+gitaaa9aa8/common/bitstream.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/bitstream.c 2017-12-31 12:50:51.000000000 +0000 @@ -43,16 +43,19 @@ uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end ); void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void 
x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); @@ -116,7 +119,7 @@ pf->nal_escape = x264_nal_escape_c; #if HAVE_MMX -#if ARCH_X86_64 +#if ARCH_X86_64 && !defined( __MACH__ ) pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2; pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2; @@ -126,18 +129,17 @@ pf->nal_escape = x264_nal_escape_mmx2; if( cpu&X264_CPU_SSE2 ) { -#if ARCH_X86_64 - if( cpu&X264_CPU_LZCNT ) - { - pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt; - pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt; - pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt; - } -#endif if( cpu&X264_CPU_SSE2_IS_FAST ) pf->nal_escape = x264_nal_escape_sse2; } -#if ARCH_X86_64 +#if ARCH_X86_64 && !defined( __MACH__ ) + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt; + } + if( cpu&X264_CPU_SSSE3 ) { pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3; @@ -152,8 +154,14 @@ if( cpu&X264_CPU_AVX2 ) { pf->nal_escape = x264_nal_escape_avx2; - if( cpu&X264_CPU_BMI2 ) - pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2; + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2; + } + + if( cpu&X264_CPU_AVX512 ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512; } #endif #endif diff -Nru x264-0.148.2795+gitaaa9aa8/common/cabac.h x264-0.152.2854+gite9a5903/common/cabac.h --- x264-0.148.2795+gitaaa9aa8/common/cabac.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/cabac.h 2017-12-31 12:50:51.000000000 +0000 @@ -42,7 +42,7 @@ uint8_t *p_end; /* aligned for memcpy_aligned starting here */ - ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision() + ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision() /* context */ uint8_t state[1024]; diff -Nru x264-0.148.2795+gitaaa9aa8/common/common.c x264-0.152.2854+gite9a5903/common/common.c --- x264-0.148.2795+gitaaa9aa8/common/common.c 
2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/common.c 2017-12-31 12:50:51.000000000 +0000 @@ -669,7 +669,7 @@ { if( !strcmp(value, "1b") ) p->i_level_idc = 9; - else if( atof(value) < 6 ) + else if( atof(value) < 7 ) p->i_level_idc = (int)(10*atof(value)+.5); else p->i_level_idc = atoi(value); @@ -1143,6 +1143,8 @@ [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, + [X264_CSP_YUYV] = { 1, { 256*2 }, { 256*1 }, }, + [X264_CSP_UYVY] = { 1, { 256*2 }, { 256*1 }, }, [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, }, diff -Nru x264-0.148.2795+gitaaa9aa8/common/common.h x264-0.152.2854+gite9a5903/common/common.h --- x264-0.148.2795+gitaaa9aa8/common/common.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/common.h 2017-12-31 12:50:51.000000000 +0000 @@ -635,11 +635,11 @@ /* Current MB DCT coeffs */ struct { - ALIGNED_32( dctcoef luma16x16_dc[3][16] ); + ALIGNED_64( dctcoef luma16x16_dc[3][16] ); ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? - ALIGNED_32( dctcoef luma8x8[12][64] ); - ALIGNED_32( dctcoef luma4x4[16*3][16] ); + ALIGNED_64( dctcoef luma8x8[12][64] ); + ALIGNED_64( dctcoef luma4x4[16*3][16] ); } dct; /* MB table and cache for current frame/mb */ @@ -729,7 +729,7 @@ int8_t *type; /* mb type */ uint8_t *partition; /* mb partition */ int8_t *qp; /* mb qp */ - int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/ + int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */ int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */ /* actually has only 7 entries; set to 8 for write-combining optimizations */ uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */ @@ -740,8 +740,7 @@ int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */ int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */ int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */ - uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of - * NOTE: this will fail on resolutions above 2^16 MBs... 
*/ + uint32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */ uint8_t *field; /* buffer for weighted versions of the reference frames */ @@ -778,26 +777,27 @@ /* space for p_fenc and p_fdec */ #define FENC_STRIDE 16 #define FDEC_STRIDE 32 - ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] ); - ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] ); + ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] ); + ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] ); /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ ALIGNED_16( pixel i4x4_fdec_buf[16*16] ); ALIGNED_16( pixel i8x8_fdec_buf[16*16] ); - ALIGNED_16( dctcoef i8x8_dct_buf[3][64] ); - ALIGNED_16( dctcoef i4x4_dct_buf[15][16] ); + ALIGNED_64( dctcoef i8x8_dct_buf[3][64] ); + ALIGNED_64( dctcoef i4x4_dct_buf[15][16] ); uint32_t i4x4_nnz_buf[4]; uint32_t i8x8_nnz_buf[4]; - int i4x4_cbp; - int i8x8_cbp; /* Psy trellis DCT data */ ALIGNED_16( dctcoef fenc_dct8[4][64] ); ALIGNED_16( dctcoef fenc_dct4[16][16] ); /* Psy RD SATD/SA8D scores cache */ - ALIGNED_32( uint64_t fenc_hadamard_cache[9] ); - ALIGNED_32( uint32_t fenc_satd_cache[32] ); + ALIGNED_64( uint32_t fenc_satd_cache[32] ); + ALIGNED_16( uint64_t fenc_hadamard_cache[9] ); + + int i4x4_cbp; + int i8x8_cbp; /* pointer over mb of the frame to be compressed */ pixel *p_fenc[3]; /* y,u,v */ @@ -822,10 +822,10 @@ struct { /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */ - ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] ); + ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] ); - /* i_non_zero_count if available else 0x80 */ - ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] ); + /* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */ + ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] ); /* -1 if unused, -2 if unavailable */ ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); diff -Nru x264-0.148.2795+gitaaa9aa8/common/cpu.c x264-0.152.2854+gite9a5903/common/cpu.c --- x264-0.148.2795+gitaaa9aa8/common/cpu.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/cpu.c 2017-12-31 12:50:51.000000000 +0000 @@ -47,8 +47,7 @@ { #if HAVE_MMX // {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore -// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it -#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV +#define MMX2 X264_CPU_MMX|X264_CPU_MMX2 {"MMX2", MMX2}, {"MMXEXT", MMX2}, {"SSE", MMX2|X264_CPU_SSE}, @@ -56,6 +55,7 @@ {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, {"SSE2", SSE2}, {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, + {"LZCNT", SSE2|X264_CPU_LZCNT}, {"SSE3", SSE2|X264_CPU_SSE3}, {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, @@ -66,16 +66,17 @@ {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, {"FMA3", AVX|X264_CPU_FMA3}, - {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, + {"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1}, + {"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2}, +#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2 + {"AVX2", AVX2}, + {"AVX512", AVX2|X264_CPU_AVX512}, +#undef AVX2 #undef AVX #undef SSE2 #undef MMX2 {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, - {"LZCNT", X264_CPU_LZCNT}, - {"BMI1", X264_CPU_BMI1}, - {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, - {"SlowCTZ", X264_CPU_SLOW_CTZ}, {"SlowAtom", X264_CPU_SLOW_ATOM}, {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, {"SlowPalignr", 
X264_CPU_SLOW_PALIGNR}, @@ -118,7 +119,7 @@ #if HAVE_MMX int x264_cpu_cpuid_test( void ); void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); -void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx ); +uint64_t x264_cpu_xgetbv( int xcr ); uint32_t x264_cpu_detect( void ) { @@ -126,15 +127,13 @@ uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = {0}; uint32_t max_extended_cap, max_basic_cap; - int cache; #if !ARCH_X86_64 if( !x264_cpu_cpuid_test() ) return 0; #endif - x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 ); - max_basic_cap = eax; + x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 ); if( max_basic_cap == 0 ) return 0; @@ -145,50 +144,46 @@ return cpu; if( edx&0x02000000 ) cpu |= X264_CPU_MMX2|X264_CPU_SSE; - if( edx&0x00008000 ) - cpu |= X264_CPU_CMOV; - else - return cpu; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) cpu |= X264_CPU_SSE3; if( ecx&0x00000200 ) - cpu |= X264_CPU_SSSE3; + cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST; if( ecx&0x00080000 ) cpu |= X264_CPU_SSE4; if( ecx&0x00100000 ) cpu |= X264_CPU_SSE42; - /* Check OXSAVE and AVX bits */ - if( (ecx&0x18000000) == 0x18000000 ) + + if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */ { - /* Check for OS support */ - x264_cpu_xgetbv( 0, &eax, &edx ); - if( (eax&0x6) == 0x6 ) + uint64_t xcr0 = x264_cpu_xgetbv( 0 ); + if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */ { - cpu |= X264_CPU_AVX; + if( ecx&0x10000000 ) + cpu |= X264_CPU_AVX; if( ecx&0x00001000 ) cpu |= X264_CPU_FMA3; - } - } - if( max_basic_cap >= 7 ) - { - x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx ); - /* AVX2 requires OS support, but BMI1/2 don't. */ - if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) ) - cpu |= X264_CPU_AVX2; - if( ebx&0x00000008 ) - { - cpu |= X264_CPU_BMI1; - if( ebx&0x00000100 ) - cpu |= X264_CPU_BMI2; + if( max_basic_cap >= 7 ) + { + x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx ); + if( ebx&0x00000008 ) + cpu |= X264_CPU_BMI1; + if( ebx&0x00000100 ) + cpu |= X264_CPU_BMI2; + if( ebx&0x00000020 ) + cpu |= X264_CPU_AVX2; + + if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */ + { + if( (ebx&0xD0030000) == 0xD0030000 ) + cpu |= X264_CPU_AVX512; + } + } } } - if( cpu & X264_CPU_SSSE3 ) - cpu |= X264_CPU_SSE2_IS_FAST; - x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; @@ -228,8 +223,6 @@ { if( edx&0x00400000 ) cpu |= X264_CPU_MMX2; - if( !(cpu&X264_CPU_LZCNT) ) - cpu |= X264_CPU_SLOW_CTZ; if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) ) cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } @@ -254,7 +247,6 @@ else if( model == 28 ) { cpu |= X264_CPU_SLOW_ATOM; - cpu |= X264_CPU_SLOW_CTZ; cpu |= X264_CPU_SLOW_PSHUFB; } /* Conroe has a slow shuffle unit. Check the model number to make sure not @@ -268,7 +260,7 @@ { /* cacheline size is specified in 3 places, any of which may be missing */ x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); - cache = (ebx&0xff00)>>5; // cflush size + int cache = (ebx&0xff00)>>5; // cflush size if( !cache && max_extended_cap >= 0x80000006 ) { x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx ); diff -Nru x264-0.148.2795+gitaaa9aa8/common/cpu.h x264-0.152.2854+gite9a5903/common/cpu.h --- x264-0.148.2795+gitaaa9aa8/common/cpu.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/cpu.h 2017-12-31 12:50:50.000000000 +0000 @@ -56,7 +56,7 @@ * alignment between functions (osdep.h handles manual alignment of arrays * if it doesn't). 
*/ -#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX +#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4)) intptr_t x264_stack_align( void (*func)(), ... ); #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__) #else @@ -65,7 +65,7 @@ typedef struct { - const char name[16]; + const char *name; uint32_t flags; } x264_cpu_name_t; extern const x264_cpu_name_t x264_cpu_names[]; diff -Nru x264-0.148.2795+gitaaa9aa8/common/dct.c x264-0.152.2854+gite9a5903/common/dct.c --- x264-0.148.2795+gitaaa9aa8/common/dct.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/dct.c 2017-12-31 12:50:51.000000000 +0000 @@ -711,6 +711,16 @@ dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; #endif } + + if( cpu&X264_CPU_AVX512 ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_avx512; + dctf->sub8x8_dct = x264_sub8x8_dct_avx512; + dctf->sub16x16_dct = x264_sub16x16_dct_avx512; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512; + dctf->add8x8_idct = x264_add8x8_idct_avx512; + } #endif //HAVE_MMX #if HAVE_ALTIVEC @@ -986,6 +996,13 @@ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; } #endif // ARCH_X86_64 + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; + } #endif // HAVE_MMX #else #if HAVE_MMX @@ -1026,6 +1043,13 @@ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop; } + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; + } #endif // HAVE_MMX #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) @@ -1068,6 +1092,11 @@ pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; } + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; + } #else if( cpu&X264_CPU_MMX ) { @@ -1091,6 +1120,11 @@ pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; } + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; + } #endif // HIGH_BIT_DEPTH #endif #if !HIGH_BIT_DEPTH diff -Nru x264-0.148.2795+gitaaa9aa8/common/deblock.c x264-0.152.2854+gite9a5903/common/deblock.c --- x264-0.148.2795+gitaaa9aa8/common/deblock.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/deblock.c 2017-12-31 12:50:51.000000000 +0000 @@ -676,21 +676,21 @@ void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); -void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_sse2 ( uint8_t 
nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); +void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); @@ -803,7 +803,6 @@ #if !HIGH_BIT_DEPTH pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; #endif - pf->deblock_strength = x264_deblock_strength_mmx2; if( cpu&X264_CPU_SSE2 ) { pf->deblock_strength = x264_deblock_strength_sse2; @@ -852,6 +851,10 @@ { pf->deblock_strength = x264_deblock_strength_avx2; } + if( cpu&X264_CPU_AVX512 ) + { + pf->deblock_strength = x264_deblock_strength_avx512; + } } #endif diff -Nru x264-0.148.2795+gitaaa9aa8/common/frame.c x264-0.152.2854+gite9a5903/common/frame.c --- x264-0.148.2795+gitaaa9aa8/common/frame.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/frame.c 2017-12-31 12:50:51.000000000 +0000 @@ -54,6 +54,8 @@ case X264_CSP_NV16: case X264_CSP_I422: case X264_CSP_YV16: + case X264_CSP_YUYV: + case X264_CSP_UYVY: case X264_CSP_V210: return X264_CSP_NV16; case X264_CSP_I444: @@ -76,7 +78,7 @@ int i_padv = PADV << PARAM_INTERLACED; int align = 16; #if ARCH_X86 || ARCH_X86_64 - if( h->param.cpu&X264_CPU_CACHELINE_64 ) + if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 ) align = 64; else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32; @@ -221,11 +223,13 @@ PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); } - PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) ); + PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) ); for( int j = 0; j <= h->param.i_bframe+1; j++ ) for( int i = 0; i <= h->param.i_bframe+1; i++ ) - PREALLOC( frame->lowres_costs[j][i], 
(i_mb_count+3) * sizeof(uint16_t) ); + PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) ); + /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */ + prealloc_size += NATIVE_ALIGN; } if( h->param.rc.i_aq_mode ) { @@ -408,7 +412,13 @@ uint8_t *pix[3]; int stride[3]; - if( i_csp == X264_CSP_V210 ) + if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY ) + { + int p = i_csp == X264_CSP_UYVY; + h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1], + (pixel*)src->img.plane[0], src->img.i_stride[0], h->param.i_width, h->param.i_height ); + } + else if( i_csp == X264_CSP_V210 ) { stride[0] = src->img.i_stride[0]; pix[0] = src->img.plane[0]; diff -Nru x264-0.148.2795+gitaaa9aa8/common/macroblock.c x264-0.152.2854+gite9a5903/common/macroblock.c --- x264-0.148.2795+gitaaa9aa8/common/macroblock.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/macroblock.c 2017-12-31 12:50:51.000000000 +0000 @@ -260,7 +260,7 @@ PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) ); PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) ); PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) ); - PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) ); + PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint32_t) ); /* 0 -> 3 top(4), 4 -> 6 : left(3) */ PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) ); @@ -326,7 +326,7 @@ PREALLOC_END( h->mb.base ); - memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) ); + memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint32_t) ); for( int i = 0; i < 2; i++ ) { @@ -388,7 +388,7 @@ ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa ); } - int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t); + int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t); scratch_size = X264_MAX( scratch_size, buf_mbtree ); if( scratch_size ) CHECKED_MALLOC( h->scratch_buffer, scratch_size ); @@ -532,16 +532,16 @@ h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE; h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE; - h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE; + h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE; if( CHROMA444 ) { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE; - h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE; + h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE; } else { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8; - h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16; + h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16; } } @@ -1738,7 +1738,7 @@ h->mb.i_last_dqp = 0; h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2; h->mb.i_cbp_luma = 0xf; - h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700; + h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700; h->mb.b_transform_8x8 = 0; for( int i = 0; i < 48; i++ ) h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 
1 : 16; diff -Nru x264-0.148.2795+gitaaa9aa8/common/mc.c x264-0.152.2854+gite9a5903/common/mc.c --- x264-0.148.2795+gitaaa9aa8/common/mc.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/mc.c 2017-12-31 12:50:51.000000000 +0000 @@ -325,15 +325,14 @@ } } -static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu, - pixel *dstv, intptr_t i_dstv, - pixel *src, intptr_t i_src, int w, int h ) +void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, + pixel *src, intptr_t i_src, int w, int h ) { - for( int y=0; yplane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c; diff -Nru x264-0.148.2795+gitaaa9aa8/common/mc.h x264-0.152.2854+gite9a5903/common/mc.h --- x264-0.148.2795+gitaaa9aa8/common/mc.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/mc.h 2017-12-31 12:50:51.000000000 +0000 @@ -160,6 +160,39 @@ x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ } +void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, + pixel *src, intptr_t i_src, int w, int h ); + +/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYUV + * input with the additional constraint that we cannot overread src. */ +#define PLANE_COPY_YUYV(align, cpu)\ +static void x264_plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\ + pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ + dsta += i_dsta * h;\ + dstb += i_dstb * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\ + src+i_src, i_src, w, h );\ + }\ + x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\ + }\ + else\ + x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ +} + void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -260,6 +293,8 @@ /* may write up to 15 pixels off the end of each plane */ void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); + void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, + pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty, diff -Nru x264-0.148.2795+gitaaa9aa8/common/osdep.h x264-0.152.2854+gite9a5903/common/osdep.h --- x264-0.148.2795+gitaaa9aa8/common/osdep.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/osdep.h 2017-12-31 12:50:51.000000000 +0000 @@ -139,17 +139,23 @@ #define EXPAND(x) x #if ARCH_X86 || ARCH_X86_64 -#define 
NATIVE_ALIGN 32 +#define NATIVE_ALIGN 64 #define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) +#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 ) #if STACK_ALIGNMENT >= 32 #define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ ) #else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) #endif +#if STACK_ALIGNMENT >= 64 +#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ ) +#else #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) +#endif #else #define NATIVE_ALIGN 16 #define ALIGNED_32 ALIGNED_16 +#define ALIGNED_64 ALIGNED_16 #define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16 #define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16 #endif diff -Nru x264-0.148.2795+gitaaa9aa8/common/pixel.c x264-0.152.2854+gite9a5903/common/pixel.c --- x264-0.148.2795+gitaaa9aa8/common/pixel.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/pixel.c 2017-12-31 12:50:51.000000000 +0000 @@ -201,28 +201,32 @@ /**************************************************************************** * pixel_var2_wxh ****************************************************************************/ -#define PIXEL_VAR2_C( name, w, h, shift ) \ -static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \ +#define PIXEL_VAR2_C( name, h, shift ) \ +static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \ { \ - int var = 0, sum = 0, sqr = 0; \ + int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \ for( int y = 0; y < h; y++ ) \ { \ - for( int x = 0; x < w; x++ ) \ + for( int x = 0; x < 8; x++ ) \ { \ - int diff = pix1[x] - pix2[x]; \ - sum += diff; \ - sqr += diff * diff; \ + int diff_u = fenc[x] - fdec[x]; \ + int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \ + sum_u += diff_u; \ + sum_v += diff_v; \ + sqr_u += diff_u * diff_u; \ + sqr_v += diff_v * diff_v; \ } \ - pix1 += i_stride1; \ - pix2 += i_stride2; \ + fenc += FENC_STRIDE; \ + fdec += FDEC_STRIDE; \ } \ - var = sqr - ((int64_t)sum * sum >> shift); \ - *ssd = sqr; \ - return var; \ + ssd[0] = sqr_u; \ + ssd[1] = sqr_v; \ + return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \ + sqr_v - ((int64_t)sum_v * sum_v >> shift); \ } -PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 ) -PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8, 6 ) +PIXEL_VAR2_C( x264_pixel_var2_8x16, 16, 7 ) +PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 6 ) #if BIT_DEPTH > 8 typedef uint32_t sum_t; @@ -885,13 +889,6 @@ INIT8( ssd, _mmx2 ); INIT_ADS( _mmx2 ); - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2; -#if ARCH_X86 - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2; -#endif - pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2; @@ -962,7 +959,9 @@ INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _ssse3 ); +#endif INIT6( satd, _ssse3 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; @@ -1003,7 +1002,9 @@ if( cpu&X264_CPU_AVX ) { INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _avx ); +#endif INIT6( satd, _avx ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) @@ -1028,8 +1029,6 @@ INIT5( sad_x3, _xop ); INIT5( sad_x4, _xop ); 
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; #if ARCH_X86_64 @@ -1044,10 +1043,19 @@ INIT2( sad_x3, _avx2 ); INIT2( sad_x4, _avx2 ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; pixf->vsad = x264_pixel_vsad_avx2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; } + if( cpu&X264_CPU_AVX512 ) + { + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -1067,16 +1075,11 @@ INIT7( satd_x4, _mmx2 ); INIT4( hadamard_ac, _mmx2 ); INIT_ADS( _mmx2 ); - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; - pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2; #if ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2; - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2; pixf->vsad = x264_pixel_vsad_mmx2; if( cpu&X264_CPU_CACHELINE_32 ) @@ -1197,7 +1200,9 @@ pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3; #endif } +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _ssse3 ); +#endif if( cpu&X264_CPU_SLOW_ATOM ) { pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; @@ -1280,7 +1285,9 @@ INIT8( satd, _avx ); INIT7( satd_x3, _avx ); INIT7( satd_x4, _avx ); +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _avx ); +#endif INIT4( hadamard_ac, _avx ); if( !(cpu&X264_CPU_STACK_MOD4) ) { @@ -1321,11 +1328,6 @@ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; - pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop; #if ARCH_X86_64 pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; #endif @@ -1338,7 +1340,9 @@ INIT2( sad_x4, _avx2 ); INIT4( satd, _avx2 ); INIT2( hadamard_ac, _avx2 ); +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _avx2 ); +#endif pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; @@ -1351,6 +1355,21 @@ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2; #endif } + + if( cpu&X264_CPU_AVX512 ) + { + INIT8( sad, _avx512 ); + INIT8_NAME( sad_aligned, sad, _avx512 ); + INIT7( sad_x3, _avx512 ); + INIT7( sad_x4, _avx512 ); + INIT8( satd, _avx512 ); + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; + } #endif //HAVE_MMX #if HAVE_ARMV6 @@ -1480,10 +1499,10 @@ 
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; - pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; - pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; + //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; + //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; + pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_msa; + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_msa; } #endif // HAVE_MSA diff -Nru x264-0.148.2795+gitaaa9aa8/common/pixel.h x264-0.152.2854+gite9a5903/common/pixel.h --- x264-0.148.2795+gitaaa9aa8/common/pixel.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/pixel.h 2017-12-31 12:50:51.000000000 +0000 @@ -93,8 +93,7 @@ uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); - int (*var2[4])( pixel *pix1, intptr_t stride1, - pixel *pix2, intptr_t stride2, int *ssd ); + int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] ); uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride ); void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1, diff -Nru x264-0.148.2795+gitaaa9aa8/common/ppc/dct.c x264-0.152.2854+gite9a5903/common/ppc/dct.c --- x264-0.148.2795+gitaaa9aa8/common/ppc/dct.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/ppc/dct.c 2017-12-31 12:50:51.000000000 +0000 @@ -293,12 +293,8 @@ vec_vsx_st( dcvsum8, 0, dest ); \ } -static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 ) +static void idct8_dc_altivec( uint8_t *dst, vec_s16_t dcv ) { - dc1 = (dc1 + 32) >> 6; - dc2 = (dc2 + 32) >> 6; - vec_s16_t dcv = { dc1, dc1, dc1, dc1, dc2, dc2, dc2, dc2 }; - LOAD_ZERO; ALTIVEC_STORE8_DC_SUM_CLIP( &dst[0*FDEC_STRIDE], dcv ); ALTIVEC_STORE8_DC_SUM_CLIP( &dst[1*FDEC_STRIDE], dcv ); @@ -308,8 +304,18 @@ void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] ) { - idct8_dc_altivec( &p_dst[0], dct[0], dct[1] ); - idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2], dct[3] ); + vec_s16_t dcv; + vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) ); + vec_u16_t v6 = vec_splat_u16( 6 ); + vec_s16_t dctv = vec_vsx_ld( 0, dct ); + + dctv = vec_sra( vec_add( dctv, v32 ), v6 ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv ); + idct8_dc_altivec( &p_dst[0], dcv ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv ); + idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv ); } #define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \ diff -Nru x264-0.148.2795+gitaaa9aa8/common/quant.c x264-0.152.2854+gite9a5903/common/quant.c --- x264-0.148.2795+gitaaa9aa8/common/quant.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/quant.c 2017-12-31 12:50:51.000000000 +0000 @@ -460,9 +460,6 @@ { #if ARCH_X86 pf->denoise_dct = x264_denoise_dct_mmx; - pf->decimate_score15 = x264_decimate_score15_mmx2; - pf->decimate_score16 = x264_decimate_score16_mmx2; - pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; @@ -473,8 +470,6 @@ #endif pf->coeff_last4 = 
x264_coeff_last4_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } if( cpu&X264_CPU_SSE2 ) { @@ -499,17 +494,18 @@ pf->coeff_level_run8 = x264_coeff_level_run8_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt; - pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; - pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; - } + } + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last4 = x264_coeff_last4_lzcnt; + pf->coeff_last8 = x264_coeff_last8_lzcnt; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) { @@ -557,8 +553,20 @@ pf->dequant_8x8 = x264_dequant_8x8_avx2; pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; pf->denoise_dct = x264_denoise_dct_avx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; + } + if( cpu&X264_CPU_AVX512 ) + { + pf->dequant_4x4 = x264_dequant_4x4_avx512; + pf->dequant_8x8 = x264_dequant_8x8_avx512; + pf->decimate_score15 = x264_decimate_score15_avx512; + pf->decimate_score16 = x264_decimate_score16_avx512; + pf->decimate_score64 = x264_decimate_score64_avx512; + pf->coeff_last4 = x264_coeff_last4_avx512; + pf->coeff_last8 = x264_coeff_last8_avx512; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH @@ -586,9 +594,6 @@ pf->quant_4x4 = x264_quant_4x4_mmx2; pf->quant_8x8 = x264_quant_8x8_mmx2; pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; - pf->decimate_score15 = x264_decimate_score15_mmx2; - pf->decimate_score16 = x264_decimate_score16_mmx2; - pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2; @@ -599,13 +604,6 @@ pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; pf->coeff_level_run8 = x264_coeff_level_run8_mmx2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt; - pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; - pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt; - } } if( cpu&X264_CPU_SSE2 ) @@ -634,14 +632,19 @@ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last[ 
DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; - } + } + + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last4 = x264_coeff_last4_lzcnt; + pf->coeff_last8 = x264_coeff_last8_lzcnt; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) @@ -657,17 +660,19 @@ pf->decimate_score16 = x264_decimate_score16_ssse3; pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); +#if ARCH_X86 || !defined( __MACH__ ) pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; - pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; } +#endif } if( cpu&X264_CPU_SSE4 ) @@ -717,12 +722,28 @@ } pf->decimate_score64 = x264_decimate_score64_avx2; pf->denoise_dct = x264_denoise_dct_avx2; - if( cpu&X264_CPU_LZCNT ) + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; +#if ARCH_X86 || !defined( __MACH__ ) + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2; +#endif + } + if( cpu&X264_CPU_AVX512 ) + { + if( h->param.i_cqm_preset == X264_CQM_FLAT ) + pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512; + else { - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt; + pf->dequant_4x4 = x264_dequant_4x4_avx512; + pf->dequant_8x8 = x264_dequant_8x8_avx512; } + pf->decimate_score15 = x264_decimate_score15_avx512; + pf->decimate_score16 = x264_decimate_score16_avx512; + pf->decimate_score64 = x264_decimate_score64_avx512; + pf->coeff_last8 = x264_coeff_last8_avx512; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/cabac-a.asm x264-0.152.2854+gite9a5903/common/x86/cabac-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/cabac-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/cabac-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -53,21 +53,32 @@ %endmacro cextern coeff_last4_mmx2 -cextern coeff_last4_mmx2_lzcnt +cextern coeff_last4_lzcnt +%if HIGH_BIT_DEPTH +cextern coeff_last4_avx512 +%endif cextern coeff_last15_sse2 -cextern coeff_last15_sse2_lzcnt +cextern coeff_last15_lzcnt +cextern 
coeff_last15_avx512 cextern coeff_last16_sse2 -cextern coeff_last16_sse2_lzcnt +cextern coeff_last16_lzcnt +cextern coeff_last16_avx512 cextern coeff_last64_sse2 -cextern coeff_last64_sse2_lzcnt -cextern coeff_last64_avx2_lzcnt +cextern coeff_last64_lzcnt +cextern coeff_last64_avx2 +cextern coeff_last64_avx512 %ifdef PIC SECTION .data %endif -coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_lzcnt: COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%if HIGH_BIT_DEPTH +coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%else +coeff_last_avx512: COEFF_LAST_TABLE lzcnt, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%endif %endif SECTION .text @@ -100,7 +111,7 @@ .start: pointer 1 .p: pointer 1 .end: pointer 1 - align 16, resb 1 + align 64, resb 1 .bits_encoded: resd 1 .state: resb 1024 endstruc @@ -352,25 +363,33 @@ %endmacro %macro ABS_DCTCOEFS 2 -%assign i 0 -%rep %2/16 %if HIGH_BIT_DEPTH - ABSD m0, [%1+ 0+i*64], m4 - ABSD m1, [%1+16+i*64], m5 - ABSD m2, [%1+32+i*64], m4 - ABSD m3, [%1+48+i*64], m5 - mova [rsp+ 0+i*64], m0 - mova [rsp+16+i*64], m1 - mova [rsp+32+i*64], m2 - mova [rsp+48+i*64], m3 -%else - ABSW m0, [%1+ 0+i*32], m2 - ABSW m1, [%1+16+i*32], m3 - mova [rsp+ 0+i*32], m0 - mova [rsp+16+i*32], m1 + %define %%abs ABSD +%else + %define %%abs ABSW %endif +%if mmsize == %2*SIZEOF_DCTCOEF + %%abs m0, [%1], m1 + mova [rsp], m0 +%elif mmsize == %2*SIZEOF_DCTCOEF/2 + %%abs m0, [%1+0*mmsize], m2 + %%abs m1, [%1+1*mmsize], m3 + mova [rsp+0*mmsize], m0 + mova [rsp+1*mmsize], m1 +%else +%assign i 0 +%rep %2*SIZEOF_DCTCOEF/(4*mmsize) + %%abs m0, [%1+(4*i+0)*mmsize], m4 + %%abs m1, [%1+(4*i+1)*mmsize], m5 + %%abs m2, [%1+(4*i+2)*mmsize], m4 + %%abs m3, [%1+(4*i+3)*mmsize], m5 + mova [rsp+(4*i+0)*mmsize], m0 + mova [rsp+(4*i+1)*mmsize], m1 + mova [rsp+(4*i+2)*mmsize], m2 + mova [rsp+(4*i+3)*mmsize], m3 %assign i i+1 %endrep +%endif %endmacro %macro SIG_OFFSET 1 @@ -403,16 +422,14 @@ %endif %ifdef PIC - cglobal func, 4,13 + cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF lea r12, [$$] %define GLOBAL +r12-$$ %else - cglobal func, 4,12 + cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF %define GLOBAL %endif -%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15) - SUB rsp, pad shl r1d, 4 ; MB_INTERLACED*16 %if %1 lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8 @@ -429,15 +446,13 @@ ABS_DCTCOEFS r0, 64 %else mov r4, r0 ; r4 = dct - mov r6, ~SIZEOF_DCTCOEF - and r6, r4 ; handle AC coefficient case - ABS_DCTCOEFS r6, 16 - sub r4, r6 ; calculate our new dct pointer + and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case + ABS_DCTCOEFS r4, 16 + xor r4, r0 ; calculate our new dct pointer add r4, rsp ; restore AC coefficient offset %endif - mov r1, [%2+gprsize*r2 GLOBAL] ; for improved OOE performance, run coeff_last on the original coefficients. 
- call r1 ; coeff_last[ctx_block_cat]( dct ) + call [%2+gprsize*r2 GLOBAL] ; coeff_last[ctx_block_cat]( dct ) ; we know on 64-bit that the SSE2 versions of this function only ; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we ; don't need r2 in 8x8 mode. @@ -521,7 +536,6 @@ jge .coeff_loop .end: mov [r3+cb.bits_encoded-cb.state], r0d - ADD rsp, pad RET %endmacro @@ -529,15 +543,23 @@ INIT_XMM sse2 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 -INIT_XMM sse2,lzcnt -CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt -CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +INIT_XMM lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_lzcnt INIT_XMM ssse3 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 INIT_XMM ssse3,lzcnt -CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt -CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_lzcnt +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +%else +INIT_YMM avx512 +%endif +CABAC_RESIDUAL_RD 0, coeff_last_avx512 +INIT_ZMM avx512 +CABAC_RESIDUAL_RD 1, coeff_last_avx512 %endif ;----------------------------------------------------------------------------- @@ -615,7 +637,7 @@ %endmacro %macro CABAC_RESIDUAL 1 -cglobal cabac_block_residual_internal, 4,15 +cglobal cabac_block_residual_internal, 4,15,0,-4*64 %ifdef PIC ; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register. lea r7, [$$] @@ -625,8 +647,6 @@ %define lastm r7d %define GLOBAL %endif -%assign pad gprsize+4*2+4*64-(stack_offset&15) - SUB rsp, pad shl r1d, 4 %define sigoffq r8 @@ -653,8 +673,7 @@ mov dct, r0 mov leveloffm, leveloffd - mov r1, [%1+gprsize*r2 GLOBAL] - call r1 + call [%1+gprsize*r2 GLOBAL] mov lastm, eax ; put cabac in r0; needed for cabac_encode_decision mov r0, r3 @@ -718,14 +737,14 @@ push r7 push r8 %else - sub rsp, 32 ; shadow space + sub rsp, 40 ; shadow space and alignment %endif call cabac_encode_ue_bypass %if UNIX64 pop r8 pop r7 %else - add rsp, 32 + add rsp, 40 %endif pop r0 .level_gt1_end: @@ -742,15 +761,16 @@ %endif dec coeffidxd jge .level_loop - ADD rsp, pad RET %endmacro %if ARCH_X86_64 INIT_XMM sse2 CABAC_RESIDUAL coeff_last_sse2 -INIT_XMM sse2,lzcnt -CABAC_RESIDUAL coeff_last_sse2_lzcnt -INIT_XMM avx2,bmi2 -CABAC_RESIDUAL coeff_last_avx2_lzcnt +INIT_XMM lzcnt +CABAC_RESIDUAL coeff_last_lzcnt +INIT_XMM avx2 +CABAC_RESIDUAL coeff_last_avx2 +INIT_XMM avx512 +CABAC_RESIDUAL coeff_last_avx512 %endif diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/cpu-a.asm x264-0.152.2854+gite9a5903/common/x86/cpu-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/cpu-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/cpu-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -53,18 +53,16 @@ RET ;----------------------------------------------------------------------------- -; void cpu_xgetbv( int op, int *eax, int *edx ) +; uint64_t cpu_xgetbv( int xcr ) ;----------------------------------------------------------------------------- -cglobal cpu_xgetbv, 3,7 - push r2 - push r1 - mov ecx, r0d +cglobal cpu_xgetbv + movifnidn ecx, r0m xgetbv - pop r4 - mov [r4], eax - pop r4 - mov [r4], edx - RET +%if ARCH_X86_64 + shl rdx, 32 + or rax, rdx +%endif + ret %if ARCH_X86_64 @@ -77,7 +75,7 @@ %if WIN64 sub rsp, 32 ; shadow space %endif - and rsp, ~31 + and rsp, ~(STACK_ALIGNMENT-1) mov rax, r0 mov r0, r1 mov r1, r2 @@ -118,7 +116,7 @@ push ebp mov ebp, esp sub esp, 12 - and esp, ~31 + and esp, ~(STACK_ALIGNMENT-1) mov ecx, [ebp+8] 
mov edx, [ebp+12] mov [esp], edx diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/dct-a.asm x264-0.152.2854+gite9a5903/common/x86/dct-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/dct-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/dct-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -30,7 +30,41 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 +; AVX-512 permutation indices are bit-packed to save cache +%if HIGH_BIT_DEPTH +scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame + dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1 + dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2 + dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3 + ; bits 19-23: 8x8_frame4 +scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1 + dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2 + dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3 + dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4 +cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1 + dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2 + dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3 + dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4 +%else +dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec + dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec + dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2 + dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather +scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame + dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1 + dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2 + dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30 +scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1 + dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2 + dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a + dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2 +cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1 + dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2 + dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd + dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd +%endif + pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -580,6 +614,217 @@ DCT4_1D 0, 1, 2, 3, 4 STORE16_DCT_AVX2 0, 1, 2, 3, 4 ret + +%macro DCT4x4_AVX512 0 + psubw m0, m2 ; 0 1 + psubw m1, m3 ; 3 2 + SUMSUB_BA w, 1, 0, 2 + SBUTTERFLY wd, 1, 0, 2 + paddw m2, m1, m0 + psubw m3, m1, m0 + paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1 + psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3 + shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2 + punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1 + SUMSUB_BA w, 1, 2, 3 + shufps 
m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 b1-b2 d1-d2 + shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3 + paddw m2, m1, m3 + psubw m0, m1, m3 + paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1 + psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3' +%endmacro + +INIT_XMM avx512 +cglobal sub4x4_dct + mov eax, 0xf0aa + kmovw k1, eax + PROLOGUE 3,3 + movd m0, [r1+0*FENC_STRIDE] + movd m2, [r2+0*FDEC_STRIDE] + vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE] + vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE] + movd m1, [r1+3*FENC_STRIDE] + movd m3, [r2+3*FDEC_STRIDE] + vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE] + vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE] + kshiftrw k2, k1, 8 + pxor m4, m4 + punpcklbw m0, m4 + punpcklbw m2, m4 + punpcklbw m1, m4 + punpcklbw m3, m4 + DCT4x4_AVX512 + mova [r0], m2 + mova [r0+16], m0 + RET + +INIT_ZMM avx512 +cglobal dct4x4x4_internal + punpcklbw m0, m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m1, m4 + punpckhbw m3, m4 + DCT4x4_AVX512 + mova m1, m2 + vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0 + vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1 + ret + +%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2 + movu %1, [r1+%3*FENC_STRIDE] + vpermt2d %1, %2, [r1+%4*FENC_STRIDE] +%endmacro + +%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2 + movu %1, [r2+(%4 )*FDEC_STRIDE] + vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE] + movu %3, [r2+(%5 )*FDEC_STRIDE] + vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE] + vpermt2d %1, %2, %3 +%endmacro + +cglobal sub8x8_dct, 3,3 + mova m0, [dct_avx512] + DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3 + mov r1d, 0xaaaaaaaa + kmovd k1, r1d + psrld m0, 5 + DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4 + mov r1d, 0xf0f0f0f0 + kmovd k2, r1d + pxor xm4, xm4 + knotw k3, k2 + call dct4x4x4_internal_avx512 + mova [r0], m0 + mova [r0+64], m1 + RET + +%macro SUB4x16_DCT_AVX512 2 ; dst, src + vpermd m1, m5, [r1+1*%2*64] + mova m3, [r2+2*%2*64] + vpermt2d m3, m6, [r2+2*%2*64+64] + call dct4x4x4_internal_avx512 + mova [r0+%1*64 ], m0 + mova [r0+%1*64+128], m1 +%endmacro + +cglobal sub16x16_dct + psrld m5, [dct_avx512], 10 + mov eax, 0xaaaaaaaa + kmovd k1, eax + mov eax, 0xf0f0f0f0 + kmovd k2, eax + PROLOGUE 3,3 + pxor xm4, xm4 + knotw k3, k2 + psrld m6, m5, 4 + SUB4x16_DCT_AVX512 0, 0 + SUB4x16_DCT_AVX512 1, 1 + SUB4x16_DCT_AVX512 4, 2 + SUB4x16_DCT_AVX512 5, 3 + RET + +cglobal sub8x8_dct_dc, 3,3 + mova m3, [dct_avx512] + DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3 + mov r1d, 0xaa + kmovb k1, r1d + psrld m3, 5 + DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4 + pxor xm3, xm3 + psadbw m0, m3 + psadbw m1, m3 + psubw m0, m1 + vpmovqw xmm0, m0 + vprold xmm1, xmm0, 16 + paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3 + punpckhqdq xmm2, xmm0, xmm0 + psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3 + paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3 + punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3 + punpcklqdq xmm1, xmm0, xmm0 + psubw xmm0 {k1}, xm3, xmm0 + paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3 + movhps [r0], xmm0 + RET + +cglobal sub8x16_dct_dc, 3,3 + mova m5, [dct_avx512] + DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5 + DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7 + mov r1d, 0xaa + kmovb k1, r1d + psrld m5, 5 + DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8 + DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12 + pxor xm4, xm4 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m4 + psadbw m3, m4 + psubw m0, m2 + psubw m1, m3 + SBUTTERFLY qdq, 0, 1, 2 + paddw m0, m1 + vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7 + psrlq xmm2, xmm0, 32 + psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7 + paddw xmm0, xmm2 ; 0+4 
2+6 1+5 3+7 + punpckhdq xmm2, xmm0, xmm1 + punpckldq xmm0, xmm1 + psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7 + paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7 + punpcklwd xmm0, xmm1 + psrlq xmm2, xmm0, 32 + psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7 + paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7 + shufps xmm0, xmm1, q0220 + mova [r0], xmm0 + RET + +%macro SARSUMSUB 3 ; a, b, tmp + mova m%3, m%1 + vpsraw m%1 {k1}, 1 + psubw m%1, m%2 ; 0-2 1>>1-3 + vpsraw m%2 {k1}, 1 + paddw m%2, m%3 ; 0+2 1+3>>1 +%endmacro + +cglobal add8x8_idct, 2,2 + mova m1, [r1] + mova m2, [r1+64] + mova m3, [dct_avx512] + vbroadcasti32x4 m4, [pw_32] + mov r1d, 0xf0f0f0f0 + kxnorb k2, k2, k2 + kmovd k1, r1d + kmovb k3, k2 + vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d + vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f + psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE + vpgatherqq m6 {k2}, [r0+m5] + SARSUMSUB 0, 1, 2 + SBUTTERFLY wd, 1, 0, 2 + psrlq m7, m3, 28 + SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3 + vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1 + SBUTTERFLY dq, 0, 1, 2 + psrlq m3, 24 + SARSUMSUB 0, 1, 2 + vpermi2q m3, m1, m0 + vpermt2q m1, m7, m0 + paddw m3, m4 ; += 32 + SUMSUB_BA w, 1, 3, 0 + psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3' + psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3' + pxor xm0, xm0 + SBUTTERFLY bw, 6, 0, 2 + paddsw m1, m6 + paddsw m3, m0 + packuswb m1, m3 + vpscatterqq [r0+m5] {k3}, m1 + RET %endif ; HIGH_BIT_DEPTH INIT_MMX @@ -1883,3 +2128,161 @@ mov [r2+8], r0w RET %endif ; !HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal zigzag_scan_4x4_frame, 2,2 + mova m0, [scan_frame_avx512] + vpermd m0, m0, [r1] + mova [r0], m0 + RET + +cglobal zigzag_scan_4x4_field, 2,2 + mova m0, [r1] + pshufd xmm1, [r1+8], q3102 + mova [r0], m0 + movu [r0+8], xmm1 + RET + +cglobal zigzag_scan_8x8_frame, 2,2 + psrld m0, [scan_frame_avx512], 4 + mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + mov r1d, 0x01fe7f80 + kmovd k1, r1d + kshiftrd k2, k1, 16 + vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40 + psrld m6, m0, 5 + vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __ + vmovdqa64 m0 {k1}, m5 + mova [r0+0*64], m0 + mova m5, m1 + vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __ + psrld m0, m6, 5 + vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35 + vmovdqa32 m6 {k2}, m1 + mova [r0+1*64], m6 + vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30 + psrld m1, m0, 5 + vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __ + vmovdqa32 m5 {k1}, m0 + mova [r0+2*64], m5 + vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63 + vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __ + vmovdqa64 m2 {k2}, m3 + mova [r0+3*64], m2 + RET + +cglobal zigzag_scan_8x8_field, 2,2 + mova m0, [scan_field_avx512] + mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + mov r1d, 0x3f + kmovb k1, r1d + psrld m5, m0, 5 + vpermi2d m0, m1, m2 + vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15 + vpermt2d m1, m5, m2 + psrld m5, 5 + vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31 + vpermt2d m2, m5, m3 + psrld m5, 5 + vpermt2d m3, m5, m4 + mova [r0+0*64], m0 + mova [r0+1*64], m1 + mova [r0+2*64], m2 + mova [r0+3*64], m3 + RET + +cglobal zigzag_interleave_8x8_cavlc, 3,3 + mova m0, [cavlc_shuf_avx512] + 
mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + kxnorb k1, k1, k1 + por m7, m1, m2 + psrld m5, m0, 5 + vpermi2d m0, m1, m2 ; a0 a1 b0 b1 + vpternlogd m7, m3, m4, 0xfe ; m1|m2|m3|m4 + psrld m6, m5, 5 + vpermi2d m5, m3, m4 ; b2 b3 a2 a3 + vptestmd k0, m7, m7 + vpermt2d m1, m6, m2 ; c0 c1 d0 d1 + psrld m6, 5 + vpermt2d m3, m6, m4 ; d2 d3 c2 c3 + vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3 + vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3 + vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3 + vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3 + mova [r0+0*64], m5 + mova [r0+1*64], m2 + mova [r0+2*64], m3 + mova [r0+3*64], m4 + kmovw r1d, k0 + test r1d, 0x1111 + setnz [r2] + test r1d, 0x2222 + setnz [r2+1] + test r1d, 0x4444 + setnz [r2+8] + test r1d, 0x8888 + setnz [r2+9] + RET + +%else ; !HIGH_BIT_DEPTH +INIT_YMM avx512 +cglobal zigzag_scan_4x4_frame, 2,2 + mova m0, [scan_frame_avx512] + vpermw m0, m0, [r1] + mova [r0], m0 + RET + +cglobal zigzag_scan_4x4_field, 2,2 + mova m0, [r1] + pshuflw xmm1, [r1+4], q3102 + mova [r0], m0 + movq [r0+4], xmm1 + RET + +INIT_ZMM avx512 +cglobal zigzag_scan_8x8_frame, 2,2 + psrlw m0, [scan_frame_avx512], 4 +scan8_avx512: + mova m1, [r1] + mova m2, [r1+64] + psrlw m3, m0, 6 + vpermi2w m0, m1, m2 + vpermt2w m1, m3, m2 + mova [r0], m0 + mova [r0+64], m1 + RET + +cglobal zigzag_scan_8x8_field, 2,2 + mova m0, [scan_field_avx512] + jmp scan8_avx512 + +cglobal zigzag_interleave_8x8_cavlc, 3,3 + mova m0, [cavlc_shuf_avx512] + mova m1, [r1] + mova m2, [r1+64] + psrlw m3, m0, 6 + vpermi2w m0, m1, m2 + vpermt2w m1, m3, m2 + kxnorb k2, k2, k2 + vptestmd k0, m0, m0 + vptestmd k1, m1, m1 + mova [r0], m0 + mova [r0+64], m1 + ktestw k2, k0 + setnz [r2] + setnc [r2+1] + ktestw k2, k1 + setnz [r2+8] + setnc [r2+9] + RET +%endif ; !HIGH_BIT_DEPTH diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/dct.h x264-0.152.2854+gite9a5903/common/x86/dct.h --- x264-0.148.2795+gitaaa9aa8/common/x86/dct.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/dct.h 2017-12-31 12:50:51.000000000 +0000 @@ -34,6 +34,7 @@ void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); @@ -41,12 +42,16 @@ void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); -void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); -void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 
); +void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 ); void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] ); void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] ); @@ -59,6 +64,7 @@ void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); +void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] ); void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); @@ -101,22 +107,26 @@ void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] ); void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] ); -void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); -void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] ); -void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); -void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); -void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); -void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); -void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] ); +void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); +void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] ); +void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); +void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_field_sse ( 
int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); +void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); +void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); +void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] ); int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); @@ -125,9 +135,10 @@ int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); -void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); -void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz ); -void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); -void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/deblock-a.asm x264-0.152.2854+gite9a5903/common/x86/deblock-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/deblock-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/deblock-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -28,10 +28,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 -load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15 -insert_top_shuf: dd 0,1,4,5,7,2,3,6 +load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5 + dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9 + dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5 + dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9 +load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c + dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text @@ -2276,13 +2280,10 @@ RET %endif ; !HIGH_BIT_DEPTH - - ;----------------------------------------------------------------------------- ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2], ; uint8_t bs[2][4][4], int mvy_limit, int bframe ) ;----------------------------------------------------------------------------- - %define scan8start (4+1*8) %define nnz r0+scan8start %define ref r1+scan8start @@ -2290,145 +2291,54 @@ %define bs0 r3 %define bs1 r3+32 -%macro LOAD_BYTES_MMX 1 - movd m2, [%1+8*0-1] - movd m0, [%1+8*0] - movd m3, [%1+8*2-1] - movd m1, [%1+8*2] - 
punpckldq m2, [%1+8*1-1] - punpckldq m0, [%1+8*1] - punpckldq m3, [%1+8*3-1] - punpckldq m1, [%1+8*3] -%endmacro - -%macro DEBLOCK_STRENGTH_REFS_MMX 0 - LOAD_BYTES_MMX ref - pxor m2, m0 - pxor m3, m1 - por m2, [bs0+0] - por m3, [bs0+8] - movq [bs0+0], m2 - movq [bs0+8], m3 - - movd m2, [ref-8*1] - movd m3, [ref+8*1] - punpckldq m2, m0 ; row -1, row 0 - punpckldq m3, m1 ; row 1, row 2 - pxor m0, m2 - pxor m1, m3 - por m0, [bs1+0] - por m1, [bs1+8] - movq [bs1+0], m0 - movq [bs1+8], m1 -%endmacro - -%macro DEBLOCK_STRENGTH_MVS_MMX 2 - mova m0, [mv-%2] - mova m1, [mv-%2+8] - psubw m0, [mv] - psubw m1, [mv+8] - packsswb m0, m1 - ABSB m0, m1 - psubusb m0, m7 - packsswb m0, m0 - por m0, [%1] - movd [%1], m0 -%endmacro - -%macro DEBLOCK_STRENGTH_NNZ_MMX 1 - por m2, m0 - por m3, m1 - mova m4, [%1] - mova m5, [%1+8] - pminub m2, m6 - pminub m3, m6 - pminub m4, m6 ; mv ? 1 : 0 - pminub m5, m6 - paddb m2, m2 ; nnz ? 2 : 0 - paddb m3, m3 - pmaxub m2, m4 - pmaxub m3, m5 -%endmacro - -%macro LOAD_BYTES_XMM 1 - movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation +%macro LOAD_BYTES_XMM 2 ; src, aligned +%if %2 + mova m2, [%1-4] + mova m1, [%1+12] +%else + movu m2, [%1-4] movu m1, [%1+12] - pslldq m0, m2, 1 +%endif + psllq m0, m2, 8 shufps m2, m1, q3131 ; cur nnz, all rows - pslldq m1, 1 + psllq m1, 8 shufps m0, m1, q3131 ; left neighbors +%if cpuflag(avx) || (%2 && cpuflag(ssse3)) + palignr m1, m2, [%1-20], 12 +%else pslldq m1, m2, 4 - movd m3, [%1-8] ; could be palignr if nnz was aligned + movd m3, [%1-8] por m1, m3 ; top neighbors +%endif %endmacro -INIT_MMX mmx2 -cglobal deblock_strength, 6,6 - ; Prepare mv comparison register - shl r4d, 8 - add r4d, 3 - (1<<8) - movd m7, r4d - SPLATW m7, m7 - mova m6, [pb_1] - pxor m0, m0 - mova [bs0+0], m0 - mova [bs0+8], m0 - mova [bs1+0], m0 - mova [bs1+8], m0 - -.lists: - DEBLOCK_STRENGTH_REFS_MMX - mov r4d, 4 -.mvs: - DEBLOCK_STRENGTH_MVS_MMX bs0, 4 - DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8 - add r2, 4*8 - add r3, 4 - dec r4d - jg .mvs - add r1, 40 - add r2, 4*8 - sub r3, 16 - dec r5d - jge .lists - - ; Check nnz - LOAD_BYTES_MMX nnz - DEBLOCK_STRENGTH_NNZ_MMX bs0 - ; Transpose column output - SBUTTERFLY bw, 2, 3, 4 - SBUTTERFLY bw, 2, 3, 4 - mova [bs0+0], m2 - mova [bs0+8], m3 - movd m2, [nnz-8*1] - movd m3, [nnz+8*1] - punpckldq m2, m0 ; row -1, row 0 - punpckldq m3, m1 ; row 1, row 2 - DEBLOCK_STRENGTH_NNZ_MMX bs1 - mova [bs1+0], m2 - mova [bs1+8], m3 - RET +%if UNIX64 + DECLARE_REG_TMP 5 +%else + DECLARE_REG_TMP 4 +%endif %macro DEBLOCK_STRENGTH_XMM 0 -cglobal deblock_strength, 6,6,7 +cglobal deblock_strength, 5,5,7 ; Prepare mv comparison register shl r4d, 8 add r4d, 3 - (1<<8) movd m6, r4d + movifnidn t0d, r5m SPLATW m6, m6 pxor m4, m4 ; bs0 pxor m5, m5 ; bs1 .lists: ; Check refs - LOAD_BYTES_XMM ref + LOAD_BYTES_XMM ref, 0 pxor m0, m2 pxor m1, m2 por m4, m0 por m5, m1 ; Check mvs -%if cpuflag(ssse3) +%if cpuflag(ssse3) && notcpuflag(avx) mova m0, [mv+4*8*0] mova m1, [mv+4*8*1] palignr m3, m0, [mv+4*8*0-16], 12 @@ -2481,11 +2391,11 @@ por m5, m0 add r1, 40 add r2, 4*8*5 - dec r5d + dec t0d jge .lists ; Check nnz - LOAD_BYTES_XMM nnz + LOAD_BYTES_XMM nnz, 1 por m0, m2 por m1, m2 mova m6, [pb_1] @@ -2518,68 +2428,121 @@ DEBLOCK_STRENGTH_XMM %macro LOAD_BYTES_YMM 1 - movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX - pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX - mova m2, [insert_top_shuf] - vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 - vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS - vpbroadcastd 
m2, [%1-8] ; ABCD .... - vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS + movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX + pshufb m0, m6 ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX + vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 + vpbroadcastd m2, [%1-8] ; ABCD .... + vpblendd m0, m0, m2, 0x80 + vpermd m0, m7, m0 ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS %endmacro INIT_YMM avx2 -cglobal deblock_strength, 6,6,7 +cglobal deblock_strength, 5,5,8 + mova m6, [load_bytes_ymm_shuf] ; Prepare mv comparison register - shl r4d, 8 - add r4d, 3 - (1<<8) - movd xm6, r4d - vpbroadcastw m6, xm6 - pxor m5, m5 ; bs0,bs1 + shl r4d, 8 + add r4d, 3 - (1<<8) + movd xm5, r4d + movifnidn t0d, r5m + vpbroadcastw m5, xm5 + psrld m7, m6, 4 + pxor m4, m4 ; bs0,bs1 .lists: ; Check refs LOAD_BYTES_YMM ref - pxor m0, m1 - por m5, m0 + pxor m0, m1 + por m4, m0 ; Check mvs - movu xm0, [mv-4+4*8*0] - vinserti128 m0, m0, [mv+4*8*-1], 1 - vbroadcasti128 m2, [mv+4*8* 0] - vinserti128 m1, m2, [mv-4+4*8*1], 0 - vbroadcasti128 m3, [mv+4*8* 1] - psubw m0, m2 - psubw m1, m3 - - vinserti128 m2, m3, [mv-4+4*8*2], 0 - vbroadcasti128 m4, [mv+4*8* 2] - vinserti128 m3, m4, [mv-4+4*8*3], 0 - psubw m2, m4 - vbroadcasti128 m4, [mv+4*8* 3] - psubw m3, m4 - packsswb m0, m1 - packsswb m2, m3 - pabsb m0, m0 - pabsb m2, m2 - psubusb m0, m6 - psubusb m2, m6 - packsswb m0, m2 - por m5, m0 - - add r1, 40 - add r2, 4*8*5 - dec r5d + movu xm0, [mv+0*4*8-4] + vinserti128 m0, m0, [mv-1*4*8 ], 1 + vbroadcasti128 m2, [mv+0*4*8 ] + vinserti128 m1, m2, [mv+1*4*8-4], 0 + psubw m0, m2 + vbroadcasti128 m2, [mv+1*4*8 ] + psubw m1, m2 + packsswb m0, m1 + vinserti128 m1, m2, [mv+2*4*8-4], 0 + vbroadcasti128 m3, [mv+2*4*8 ] + vinserti128 m2, m3, [mv+3*4*8-4], 0 + psubw m1, m3 + vbroadcasti128 m3, [mv+3*4*8 ] + psubw m2, m3 + packsswb m1, m2 + pabsb m0, m0 + pabsb m1, m1 + psubusb m0, m5 + psubusb m1, m5 + packsswb m0, m1 + por m4, m0 + add r1, 40 + add r2, 4*8*5 + dec t0d jge .lists ; Check nnz LOAD_BYTES_YMM nnz - por m0, m1 - mova m6, [pb_1] - pminub m0, m6 - pminub m5, m6 ; mv ? 1 : 0 - paddb m0, m0 ; nnz ? 2 : 0 - pmaxub m5, m0 - vextracti128 [bs1], m5, 1 - pshufb xm5, [transpose_shuf] - mova [bs0], xm5 + mova m2, [pb_1] + por m0, m1 + pminub m0, m2 + pminub m4, m2 ; mv ? 1 : 0 + paddb m0, m0 ; nnz ? 2 : 0 + pmaxub m0, m4 + vextracti128 [bs1], m0, 1 + pshufb xm0, [transpose_shuf] + mova [bs0], xm0 + RET + +%macro LOAD_BYTES_ZMM 1 + vpermd m1, m6, [%1-12] + pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX +%endmacro ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX + +INIT_ZMM avx512 +cglobal deblock_strength, 5,5 + mova m6, [load_bytes_zmm_shuf] + shl r4d, 8 + add r4d, 3 - (1<<8) + vpbroadcastw m5, r4d + mov r4d, 0x34cc34cc ; {1,-1} * 11001100b + kmovb k1, r4d + vpbroadcastd m4, r4d + movifnidn t0d, r5m + psrld m7, m6, 4 + pxor xm3, xm3 + +.lists: + vbroadcasti64x2 m2, [mv+32] + vinserti64x2 m0, m2, [mv-32], 2 + vbroadcasti64x2 m1, [mv+ 0] + vinserti64x2 m0, m0, [mv- 4], 0 + vbroadcasti64x2 m1 {k1}, [mv+64] + vinserti64x2 m0, m0, [mv+60], 1 + psubw m0, m1 + vinserti64x2 m1, m1, [mv+28], 0 + vbroadcasti64x2 m2 {k1}, [mv+96] + vinserti64x2 m1, m1, [mv+92], 1 + psubw m1, m2 + packsswb m0, m1 + pabsb m0, m0 + psubusb m0, m5 + + LOAD_BYTES_ZMM ref + pmaddubsw m1, m4 ; E-F F-G G-H H-I ... + vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1 + add r1, 40 + add r2, 4*8*5 + dec t0d + jge .lists + + LOAD_BYTES_ZMM nnz + mova ym2, [pb_1] + vptestmw k1, m1, m1 + vptestmw k2, m3, m3 + vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 
2 : 0 + vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0 + vextracti128 [bs1], ym0, 1 + pshufb xm0, [transpose_shuf] + mova [bs0], xm0 RET diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/mc-a2.asm x264-0.152.2854+gite9a5903/common/x86/mc-a2.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/mc-a2.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/mc-a2.asm 2017-12-31 12:50:51.000000000 +0000 @@ -30,18 +30,15 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 - -pw_1024: times 16 dw 1024 -filt_mul20: times 32 db 20 -filt_mul15: times 16 db 1, -5 -filt_mul51: times 16 db -5, 1 -hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +SECTION_RODATA 64 %if HIGH_BIT_DEPTH -v210_mask: times 4 dq 0xc00ffc003ff003ff -v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 -v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 +v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma + db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20, + db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62 +v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00 +v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15 +v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14 ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800 @@ -58,6 +55,13 @@ deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif ; !HIGH_BIT_DEPTH +pw_1024: times 16 dw 1024 +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 + +mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6 db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14 mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14 @@ -1044,8 +1048,8 @@ %endif ; HIGH_BIT_DEPTH %endmacro -%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned - mova m0, [%3] +%macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned + mov%6 m0, [%3] %if mmsize == 32 pshufb m0, %5 vpermq m0, m0, q3120 @@ -1056,7 +1060,7 @@ vextracti128 [%2], m0, 1 %endif %elif HIGH_BIT_DEPTH - mova m1, [%3+mmsize] + mov%6 m1, [%3+mmsize] psrld m2, m0, 16 psrld m3, m1, 16 pand m0, %5 @@ -1181,8 +1185,8 @@ %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, -; pixel *dstv, intptr_t i_dstv, +; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta, +; pixel *dstb, intptr_t i_dstb, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 @@ -1400,43 +1404,64 @@ %define org_w r6m %define h dword r7m %endif - FIX_STRIDES r1, r3, r6d - shl r5, 2 - add r0, r6 - add r2, r6 - neg r6 - mov src, r4 - mov org_w, r6 - mova m2, [v210_mask] - mova m3, [v210_luma_shuf] - mova m4, [v210_chroma_shuf] - mova m5, [v210_mult] ; also functions as vpermd index for avx2 - pshufd m6, m5, q1102 - + FIX_STRIDES r1, r3, r6d + shl r5, 2 + add r0, r6 + add r2, r6 + neg r6 + mov src, r4 + mov org_w, r6 +%if cpuflag(avx512) + vpbroadcastd m2, [v210_mask] + vpbroadcastd m3, [v210_shuf_avx512] + psrlw 
m3, 6 ; dw 0, 4 + mova m4, [v210_shuf_avx512] ; luma + psrlw m5, m4, 8 ; chroma +%else +%if mmsize == 32 + vbroadcasti128 m2, [v210_mask] + vbroadcasti128 m3, [v210_luma_shuf] + vbroadcasti128 m4, [v210_chroma_shuf] +%else + mova m2, [v210_mask] + mova m3, [v210_luma_shuf] + mova m4, [v210_chroma_shuf] +%endif + mova m5, [v210_mult] ; also functions as vpermd index for avx2 + pshufd m6, m5, q1102 +%endif ALIGN 16 .loop: - movu m1, [r4] - pandn m0, m2, m1 - pand m1, m2 - pshufb m0, m3 - pshufb m1, m4 - pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ - pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ + movu m1, [r4] + pandn m0, m2, m1 + pand m1, m2 +%if cpuflag(avx512) + psrld m0, 10 + vpsrlvw m1, m3 + mova m6, m0 + vpermt2w m0, m4, m1 + vpermt2w m1, m5, m6 +%else + pshufb m0, m3 + pshufb m1, m4 + pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ + pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ %if mmsize == 32 - vpermd m0, m5, m0 - vpermd m1, m5, m1 + vpermd m0, m5, m0 + vpermd m1, m5, m1 %endif - movu [r0+r6], m0 - movu [r2+r6], m1 - add r4, mmsize - add r6, 3*mmsize/4 - jl .loop - add r0, r1 - add r2, r3 - add src, r5 - mov r4, src - mov r6, org_w - dec h +%endif + movu [r0+r6], m0 + movu [r2+r6], m1 + add r4, mmsize + add r6, mmsize*3/4 + jl .loop + add r0, r1 + add r2, r3 + add src, r5 + mov r4, src + mov r6, org_w + dec h jg .loop RET %endmacro ; PLANE_DEINTERLEAVE_V210 @@ -1461,6 +1486,8 @@ INIT_YMM avx2 LOAD_DEINTERLEAVE_CHROMA PLANE_DEINTERLEAVE_V210 +INIT_ZMM avx512 +PLANE_DEINTERLEAVE_V210 %else INIT_XMM sse2 PLANE_DEINTERLEAVE_RGB @@ -1473,82 +1500,85 @@ PLANE_DEINTERLEAVE_RGB %endif -; These functions are not general-use; not only do the SSE ones require aligned input, -; but they also will fail if given a non-mod16 size. -; memzero SSE will fail for non-mod128. +; These functions are not general-use; not only do they require aligned input, but memcpy +; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128. 
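A minimal C sketch of the usage contract stated in the comment above (editor's illustration, not part of the patch; the checked_* wrappers are hypothetical, the x264_* prototypes are the ones declared in mc-c.c further down):

#include <assert.h>
#include <stddef.h>

void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
void  x264_memzero_aligned_avx512( void *dst, size_t n );

/* Buffers must be SIMD-aligned for the selected implementation; the size
 * constraints asserted below are the ones spelled out in the comment above. */
static void *checked_memcpy_aligned( void *dst, const void *src, size_t n )
{
    assert( n % 16 == 0 );   /* memcpy_aligned: size must be a multiple of 16 */
    return x264_memcpy_aligned_avx512( dst, src, n );
}

static void checked_memzero_aligned( void *dst, size_t n )
{
    assert( n % 128 == 0 );  /* memzero_aligned: size must be a multiple of 128 */
    x264_memzero_aligned_avx512( dst, n );
}

Note that the separate AVX-512 memcpy below handles its tail with a masked vmovdqa32 store instead of the stepwise tail copies used by the SSE/AVX macro.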
;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- %macro MEMCPY 0 cglobal memcpy_aligned, 3,3 -%if mmsize == 16 +%if mmsize == 32 test r2d, 16 - jz .copy2 - mova m0, [r1+r2-16] - mova [r0+r2-16], m0 + jz .copy32 + mova xm0, [r1+r2-16] + mova [r0+r2-16], xm0 sub r2d, 16 -.copy2: + jle .ret +.copy32: %endif - test r2d, 2*mmsize - jz .copy4start + test r2d, mmsize + jz .loop + mova m0, [r1+r2-mmsize] + mova [r0+r2-mmsize], m0 + sub r2d, mmsize + jle .ret +.loop: mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 sub r2d, 2*mmsize -.copy4start: - test r2d, r2d - jz .ret -.copy4: - mova m0, [r1+r2-1*mmsize] - mova m1, [r1+r2-2*mmsize] - mova m2, [r1+r2-3*mmsize] - mova m3, [r1+r2-4*mmsize] - mova [r0+r2-1*mmsize], m0 - mova [r0+r2-2*mmsize], m1 - mova [r0+r2-3*mmsize], m2 - mova [r0+r2-4*mmsize], m3 - sub r2d, 4*mmsize - jg .copy4 + jg .loop .ret: - REP_RET + RET %endmacro -INIT_MMX mmx -MEMCPY -INIT_XMM sse -MEMCPY - ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- -%macro MEMZERO 1 +%macro MEMZERO 0 cglobal memzero_aligned, 2,2 - add r0, r1 - neg r1 -%if mmsize == 8 - pxor m0, m0 -%else xorps m0, m0 -%endif .loop: -%assign i 0 -%rep %1 - mova [r0 + r1 + i], m0 -%assign i i+mmsize +%assign %%i mmsize +%rep 128 / mmsize + movaps [r0 + r1 - %%i], m0 +%assign %%i %%i+mmsize %endrep - add r1, mmsize*%1 - jl .loop + sub r1d, 128 + jg .loop RET %endmacro -INIT_MMX mmx -MEMZERO 8 INIT_XMM sse -MEMZERO 8 +MEMCPY +MEMZERO INIT_YMM avx -MEMZERO 4 +MEMCPY +MEMZERO +INIT_ZMM avx512 +MEMZERO + +cglobal memcpy_aligned, 3,4 + dec r2d ; offset of the last byte + rorx r3d, r2d, 2 + and r2d, ~63 + and r3d, 15 ; n = number of dwords minus one to copy in the tail + mova m0, [r1+r2] + not r3d ; bits 0-4: (n^15)+16, bits 16-31: 0xffff + shrx r3d, r3d, r3d ; 0xffff >> (n^15) + kmovw k1, r3d ; (1 << (n+1)) - 1 + vmovdqa32 [r0+r2] {k1}, m0 + sub r2d, 64 + jl .ret +.loop: + mova m0, [r1+r2] + mova [r0+r2], m0 + sub r2d, 64 + jge .loop +.ret: + RET %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- @@ -2147,13 +2177,13 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) vbroadcastss m5, [r5] mov r5d, r6m - lea r0, [r0+r5*2] + lea r2, [r2+r5*2] add r5d, r5d - add r1, r5 - add r2, r5 - add r3, r5 add r4, r5 neg r5 + sub r1, r5 + sub r3, r5 + sub r0, r5 mova xm4, [pw_3fff] %if notcpuflag(avx2) pxor xm7, xm7 @@ -2165,9 +2195,8 @@ pmovzxwd m2, [r1+r5] ; prop pand xm3, xm4, [r3+r5] ; inter pmovzxwd m3, xm3 - pminsd m3, m0 pmaddwd m1, m0 - psubd m3, m0, m3 + psubusw m3, m0, m3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 @@ -2184,7 +2213,7 @@ movu xm1, [r4+r5] movu xm2, [r1+r5] pand xm3, xm4, [r3+r5] - pminsw xm3, xm0 + psubusw xm3, xm0, xm3 INT16_UNPACK 0 INT16_UNPACK 1 INT16_UNPACK 2 @@ -2194,7 +2223,6 @@ cvtdq2ps m2, m2 cvtdq2ps m3, m3 mulps m1, m0 - subps m3, m0, m3 mulps m1, m5 ; intra*invq*fps_factor>>8 addps m1, m2 ; prop + (intra*invq*fps_factor>>8) rcpps m2, m0 ; 1 / intra 1st approximation @@ -2205,7 +2233,7 @@ subps m2, m0 ; 2nd approximation for 1/intra mulps m1, m2 ; / intra %endif - vcvtps2dq m1, m1 + cvtps2dq m1, m1 vextractf128 xm2, m1, 1 packssdw xm1, xm2 mova [r0+r5], xm1 @@ 
-2219,6 +2247,39 @@ INIT_YMM avx2 MBTREE_AVX +INIT_ZMM avx512 +cglobal mbtree_propagate_cost, 6,6 + vbroadcastss m5, [r5] + mov r5d, 0x3fff3fff + vpbroadcastd ym4, r5d + mov r5d, r6m + lea r2, [r2+r5*2] + add r5d, r5d + add r1, r5 + neg r5 + sub r4, r5 + sub r3, r5 + sub r0, r5 +.loop: + pmovzxwd m0, [r2+r5] ; intra + pmovzxwd m1, [r1+r5] ; prop + pmovzxwd m2, [r4+r5] ; invq + pand ym3, ym4, [r3+r5] ; inter + pmovzxwd m3, ym3 + psubusw m3, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + vdivps m1, m0, {rn-sae} + fmaddps m1, m2, m5, m1 + mulps m1, m3 + cvtps2dq m1, m1 + vpmovsdw [r0+r5], m1 + add r5, 32 + jl .loop + RET + %macro MBTREE_PROPAGATE_LIST 0 ;----------------------------------------------------------------------------- ; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, @@ -2372,6 +2433,112 @@ jl .loop RET +%if ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, +; uint16_t *lowres_costs, int bipred_weight, int mb_y, +; int width, int height, int stride, int list_mask ); +;----------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal mbtree_propagate_list_internal, 5,7,21 + mova xm16, [pw_0xc000] + vpbroadcastw xm17, r5m ; bipred_weight << 9 + vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT) + vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf] + vbroadcasti32x8 m6, [pd_0123] + vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y + vbroadcasti128 m7, [pd_8] + vbroadcasti128 m8, [pw_31] + vbroadcasti128 m9, [pw_32] + psllw m10, m9, 4 + pcmpeqw ym19, ym19 ; pw_m1 + vpbroadcastw ym20, r7m ; width + psrld m11, m7, 3 ; pd_1 + psrld m12, m8, 16 ; pd_31 + vpbroadcastd m13, r8m ; height + vpbroadcastd m14, r9m ; stride + pslld m15, m14, 16 + por m15, m11 ; {1, stride, 1, stride} ... + lea r4, [r4+2*r0] ; lowres_costs + lea r3, [r3+2*r0] ; propagate_amount + lea r2, [r2+4*r0] ; mvs + neg r0 + mov r6d, 0x5555ffff + kmovd k4, r6d + kshiftrd k5, k4, 16 ; 0x5555 + kshiftlw k6, k4, 8 ; 0xff00 +.loop: + vbroadcasti128 ym1, [r4+2*r0] + mova xm4, [r3+2*r0] + vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3) + vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 + vptestmw k1, ym1, ym18 + vpermw m4, m5, m4 + + vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy} + psraw m0, m3, 5 + paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y} + paddd m6, m7 ; i_mb_x += 8 + pand m3, m8 ; {x, y} + vprold m1, m3, 20 ; {y, x} << 4 + psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y} + psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4 + pmullw m3, m1 + paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000) + pmulhrsw m2, m3, m4 ; idx01weight idx23weightp + + pslld ym1, ym0, 16 + psubw ym1, ym19 + vmovdqu16 ym1 {k5}, ym0 + vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width + kunpckwd k2, k2, k2 + psrad m1, m0, 16 + paddd m1 {k6}, m11 + vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height + + pmaddwd m0, m15 + paddd m0 {k6}, m14 ; idx0 | idx2 + vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight + vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes + + ; We're handling dwords, but the offsets are in words so there may be partial overlaps. + ; We can work around this by handling dword-aligned and -unaligned offsets separately. 
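The vpconflictd/vplzcntd sequence that follows exists because several lanes of one vector may target the same ref_costs offset; a scalar model of the semantics the scatter has to preserve (editor's sketch in C, not part of the patch; scatter_add_ref is a hypothetical name):

#include <stdint.h>

/* Each lane i contributes weight[i] to ref_costs[offset[i]]; when two lanes
 * share an offset, both contributions must be accumulated, saturating at
 * 0x7fff to mirror the paddsw above (negative inputs are not expected here). */
static void scatter_add_ref( uint16_t *ref_costs, const int32_t *offset,
                             const int16_t *weight, int n )
{
    for( int i = 0; i < n; i++ )
    {
        int32_t v = ref_costs[offset[i]] + weight[i];
        ref_costs[offset[i]] = v > 0x7fff ? 0x7fff : v;
    }
}

A plain gather-add-scatter would keep only one of the colliding contributions, which is exactly what the conflict loop below avoids.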
+ vptestmd k0, m0, m11 + kandnw k2, k0, k1 ; dword-aligned offsets + kmovw k3, k2 + vpgatherdd m3 {k2}, [r1+2*m0] + + ; If there are conflicts in the offsets we have to handle them before storing the results. + ; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel + ; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets. + vpconflictd m4, m0 + vpbroadcastmw2d m1, k1 + vptestmd k2, m1, m4 + ktestw k2, k2 + jz .no_conflicts + pand m1, m4 ; mask away unused offsets to avoid false positives + vplzcntd m1, m1 + pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb +.conflict_loop: + vpermd m4 {k2}{z}, m1, m2 + vpermd m1 {k2}, m1, m1 ; shift the index one step forward + paddsw m2, m4 ; add the weights of conflicting offsets + vpcmpd k2, m1, m12, 2 + ktestw k2, k2 + jnz .conflict_loop +.no_conflicts: + paddsw m3, m2 + vpscatterdd [r1+2*m0] {k3}, m3 + kandw k1, k0, k1 ; dword-unaligned offsets + kmovw k2, k1 + vpgatherdd m1 {k1}, [r1+2*m0] + paddsw m1, m2 ; all conflicts have already been resolved + vpscatterdd [r1+2*m0] {k2}, m1 + add r0, 8 + jl .loop + RET +%endif + %macro MBTREE_FIX8 0 ;----------------------------------------------------------------------------- ; void mbtree_fix8_pack( uint16_t *dst, float *src, int count ) diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/mc-a.asm x264-0.152.2854+gite9a5903/common/x86/mc-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/mc-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/mc-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -83,11 +83,11 @@ %endmacro %endif -%macro AVG_END 0 - lea t4, [t4+t5*2*SIZEOF_PIXEL] +%macro AVG_END 0-1 2 ; rows lea t2, [t2+t3*2*SIZEOF_PIXEL] + lea t4, [t4+t5*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] - sub eax, 2 + sub eax, %1 jg .height_loop RET %endmacro @@ -147,17 +147,24 @@ %endmacro %macro BIWEIGHT_START_SSSE3 0 - movzx t6d, byte r6m ; FIXME x86_64 - mov t7d, 64 - sub t7d, t6d - shl t7d, 8 - add t6d, t7d - mova m4, [pw_512] - movd xm3, t6d + movzx t6d, byte r6m ; FIXME x86_64 +%if mmsize > 16 + vbroadcasti128 m4, [pw_512] +%else + mova m4, [pw_512] +%endif + lea t7d, [t6+(64<<8)] + shl t6d, 8 + sub t7d, t6d +%if cpuflag(avx512) + vpbroadcastw m3, t7d +%else + movd xm3, t7d %if cpuflag(avx2) - vpbroadcastw m3, xm3 + vpbroadcastw m3, xm3 %else - SPLATW m3, m3 ; weight_dst,src + SPLATW m3, m3 ; weight_dst,src +%endif %endif %endmacro @@ -268,6 +275,66 @@ mova [t0], xm0 vextracti128 [t0+t1], m0, 1 AVG_END + +INIT_YMM avx512 +cglobal pixel_avg_weight_w8 + BIWEIGHT_START + kxnorb k1, k1, k1 + kaddb k1, k1, k1 + AVG_START 5 +.height_loop: + movq xm0, [t2] + movq xm2, [t4] + movq xm1, [t2+t3] + movq xm5, [t4+t5] + lea t2, [t2+t3*2] + lea t4, [t4+t5*2] + vpbroadcastq m0 {k1}, [t2] + vpbroadcastq m2 {k1}, [t4] + vpbroadcastq m1 {k1}, [t2+t3] + vpbroadcastq m5 {k1}, [t4+t5] + punpcklbw m0, m2 + punpcklbw m1, m5 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + vextracti128 xmm1, m0, 1 + movq [t0], xm0 + movhps [t0+t1], xm0 + lea t0, [t0+t1*2] + movq [t0], xmm1 + movhps [t0+t1], xmm1 + AVG_END 4 + +INIT_ZMM avx512 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 ym0, [t2+t3], 1 + vinserti128 ym1, [t4+t5], 1 + lea t2, [t2+t3*2] + lea t4, [t4+t5*2] + vinserti32x4 m0, [t2], 2 + vinserti32x4 m1, [t4], 2 + vinserti32x4 m0, [t2+t3], 3 + vinserti32x4 m1, [t4+t5], 3 + SBUTTERFLY bw, 0, 1, 2 + 
pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], ym0, 1 + lea t0, [t0+t1*2] + vextracti32x4 [t0], m0, 2 + vextracti32x4 [t0+t1], m0, 3 + AVG_END 4 %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -738,6 +805,12 @@ AVG_FUNC 16, movdqu, movdqa AVGH 16, 16 AVGH 16, 8 +INIT_XMM avx512 +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 %endif ;HIGH_BIT_DEPTH diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/mc-c.c x264-0.152.2854+gite9a5903/common/x86/mc-c.c --- x264-0.148.2795+gitaaa9aa8/common/x86/mc-c.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/mc-c.c 2017-12-31 12:50:51.000000000 +0000 @@ -32,7 +32,8 @@ void func##_mmx2 args;\ void func##_sse2 args;\ void func##_ssse3 args;\ - void func##_avx2 args; + void func##_avx2 args;\ + void func##_avx512 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) @@ -99,17 +100,17 @@ void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); -void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu, - pixel *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu, - uint8_t *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta, + uint8_t *dstb, intptr_t i_dstb, uint8_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta, + uint16_t *dstb, intptr_t i_dstb, uint16_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_avx2( pixel *dstu, intptr_t i_dstu, - pixel *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, @@ -123,15 +124,18 @@ pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); -void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, - uint32_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, - uint32_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, - uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); 
+void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); @@ -143,11 +147,12 @@ void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); -void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); -void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n ); -void x264_memzero_aligned_mmx( void *dst, size_t n ); -void x264_memzero_aligned_sse( void *dst, size_t n ); -void x264_memzero_aligned_avx( void *dst, size_t n ); +void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_sse ( void *dst, size_t n ); +void x264_memzero_aligned_avx ( void *dst, size_t n ); +void x264_memzero_aligned_avx512( void *dst, size_t n ); void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); @@ -160,14 +165,16 @@ void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); -void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t 
*inv_qscales, float *fps_factor, int len ); void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count ); void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count ); void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count ); @@ -498,6 +505,15 @@ PLANE_COPY_SWAP(16, ssse3) PLANE_COPY_SWAP(32, avx2) +#if HIGH_BIT_DEPTH +PLANE_COPY_YUYV(64, sse2) +PLANE_COPY_YUYV(64, avx) +#else +PLANE_COPY_YUYV(32, sse2) +PLANE_COPY_YUYV(32, ssse3) +#endif +PLANE_COPY_YUYV(64, avx2) + PLANE_INTERLEAVE(mmx2) PLANE_INTERLEAVE(sse2) #if HIGH_BIT_DEPTH @@ -538,6 +554,21 @@ PROPAGATE_LIST(avx) PROPAGATE_LIST(avx2) +#if ARCH_X86_64 +void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, + uint16_t *lowres_costs, int bipred_weight, int mb_y, + int width, int height, int stride, int list_mask ); + +static void x264_mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], + int16_t *propagate_amount, uint16_t *lowres_costs, + int bipred_weight, int mb_y, int len, int list ) +{ + x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9, + mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride, + (1 << LOWRES_COST_SHIFT) << list ); +} +#endif + void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) @@ -547,8 +578,6 @@ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; - pf->memcpy_aligned = x264_memcpy_aligned_mmx; - pf->memzero_aligned = x264_memzero_aligned_mmx; pf->integral_init4v = x264_integral_init4v_mmx; pf->integral_init8v = x264_integral_init8v_mmx; @@ -606,6 +635,7 @@ pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { @@ -661,6 +691,7 @@ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx; pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx; pf->store_interleave_chroma = x264_store_interleave_chroma_avx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx; @@ -677,6 +708,11 @@ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2; } + + if( cpu&X264_CPU_AVX512 ) + { + pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512; + } #else // !HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead @@ -702,6 +738,7 @@ pf->hpel_filter = x264_hpel_filter_sse2_amd; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2; @@ -763,6 +800,7 @@ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; pf->load_deinterleave_chroma_fdec = 
x264_load_deinterleave_chroma_fdec_ssse3; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_ssse3; } if( !(cpu&X264_CPU_SLOW_PALIGNR) ) @@ -828,10 +866,20 @@ pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2; } + + if( cpu&X264_CPU_AVX512 ) + { + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512; + } #endif // HIGH_BIT_DEPTH if( !(cpu&X264_CPU_AVX) ) return; + pf->memcpy_aligned = x264_memcpy_aligned_avx; pf->memzero_aligned = x264_memzero_aligned_avx; pf->plane_copy = x264_plane_copy_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; @@ -844,10 +892,20 @@ return; pf->plane_copy_swap = x264_plane_copy_swap_avx2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; pf->get_ref = get_ref_avx2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2; + + if( !(cpu&X264_CPU_AVX512) ) + return; + pf->memcpy_aligned = x264_memcpy_aligned_avx512; + pf->memzero_aligned = x264_memzero_aligned_avx512; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512; +#if ARCH_X86_64 + pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx512; +#endif } diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/pixel-a.asm x264-0.152.2854+gite9a5903/common/x86/pixel-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/pixel-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/pixel-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -32,6 +32,8 @@ %include "x86util.asm" SECTION_RODATA 32 +var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1 + db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1 hmul_16p: times 16 db 1 times 8 db 1, -1 hmul_8p: times 8 db 1 @@ -701,25 +703,32 @@ %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] -%elif mmsize < 32 +%elif mmsize == 16 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH %endmacro -%macro VAR_END 2 -%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256 - HADDUW m5, m2 -%else - HADDW m5, m2 +%macro VAR_END 0 + pmaddwd m5, [pw_1] + SBUTTERFLY dq, 5, 6, 0 + paddd m5, m6 +%if mmsize == 32 + vextracti128 xm6, m5, 1 + paddd xm5, xm6 %endif - HADDD m6, m1 + MOVHL xm6, xm5 + paddd xm5, xm6 %if ARCH_X86_64 - punpckldq m5, m6 - movq rax, m5 + movq rax, xm5 %else - movd eax, m5 - movd edx, m6 + movd eax, xm5 +%if cpuflag(avx) + pextrd edx, xm5, 1 +%else + pshuflw xm5, xm5, q1032 + movd edx, xm5 +%endif %endif RET %endmacro @@ -739,61 +748,25 @@ paddd m6, m4 %endmacro -%macro VAR_2ROW 2 - mov r2d, %2 -.loop: -%if HIGH_BIT_DEPTH - mova m0, [r0] - mova m1, [r0+mmsize] - mova m3, [r0+%1] - mova m4, [r0+%1+mmsize] -%else ; !HIGH_BIT_DEPTH - mova m0, [r0] - mova m3, [r0+%1] - punpckhbw m1, m0, m7 - punpcklbw m0, m7 - punpckhbw m4, m3, m7 - punpcklbw m3, m7 -%endif ; HIGH_BIT_DEPTH -%ifidn %1, r1 - lea r0, [r0+%1*2] -%else - add r0, r1 -%endif - VAR_CORE - dec r2d - jg .loop -%endmacro - 
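The var/var2 hunks that follow drop the MMX paths, add AVX-512 kernels, and change the pixel_var2 interface. As a point of reference, here is a minimal scalar sketch of what these kernels compute; the helper names, the hard-coded 8-bit strides (16 bytes for fenc, 32 for fdec) and the U/V column offsets are assumptions for illustration only, not identifiers from the patch:

#include <stdint.h>

/* Illustrative scalar model (not an x264 function) of pixel_var_WxH:
 * pixel sum in the low 32 bits of the result, sum of squares in the
 * high 32 bits -- the pair that VAR_END packs into rax (or eax:edx). */
static uint64_t var_wxh_ref( const uint8_t *pix, intptr_t stride, int w, int h )
{
    uint32_t sum = 0, sqr = 0;
    for( int y = 0; y < h; y++, pix += stride )
        for( int x = 0; x < w; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
    return sum + ((uint64_t)sqr << 32);
}

/* Illustrative scalar model of the new pixel_var2_8xH interface, assuming
 * the layout implied by the asm: fenc holds the U block in columns 0-7 and
 * the V block in columns 8-15 (stride 16), fdec holds U in columns 0-7 and
 * V in columns 16-23 (stride 32). h is 8 or 16, shift is 6 or 7. */
static int var2_8xh_ref( const uint8_t *fenc, const uint8_t *fdec, int ssd[2], int h, int shift )
{
    int score = 0;
    for( int p = 0; p < 2; p++ )
    {
        int sum = 0, sqr = 0;
        for( int y = 0; y < h; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int d = fenc[16*y + 8*p + x] - fdec[32*y + 16*p + x];
                sum += d;
                sqr += d * d;
            }
        ssd[p] = sqr;                           /* sqr_u, then sqr_v */
        score += sqr - ((sum * sum) >> shift);  /* sqr - (sum*sum >> shift) */
    }
    return score;
}

In other words, pixel_var still packs sum and sum-of-squares into one 64-bit return value, while pixel_var2 now scores both chroma planes of an 8xH block in a single call, stores the per-plane SSDs in ssd[2], and returns var_u + var_v (shift 6 for 8x8, 7 for 8x16) -- which is why the stride arguments disappear from the var2 prototypes in pixel.h later in this diff.
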
;----------------------------------------------------------------------------- ; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -INIT_MMX mmx2 -cglobal pixel_var_16x16, 2,3 - FIX_STRIDES r1 - VAR_START 0 - VAR_2ROW 8*SIZEOF_PIXEL, 16 - VAR_END 16, 16 - -cglobal pixel_var_8x16, 2,3 - FIX_STRIDES r1 - VAR_START 0 - VAR_2ROW r1, 8 - VAR_END 8, 16 - -cglobal pixel_var_8x8, 2,3 - FIX_STRIDES r1 - VAR_START 0 - VAR_2ROW r1, 4 - VAR_END 8, 8 - %if HIGH_BIT_DEPTH %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 VAR_START 0 - VAR_2ROW r1, 8 - VAR_END 16, 16 + mov r2d, 8 +.loop: + mova m0, [r0] + mova m1, [r0+mmsize] + mova m3, [r0+r1] + mova m4, [r0+r1+mmsize] + lea r0, [r0+r1*2] + VAR_CORE + dec r2d + jg .loop + VAR_END cglobal pixel_var_8x8, 2,3,8 lea r2, [r1*3] @@ -809,18 +782,16 @@ mova m3, [r0+r1*4] mova m4, [r0+r2*2] VAR_CORE - VAR_END 8, 8 + VAR_END %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR -INIT_XMM xop -VAR -%endif ; HIGH_BIT_DEPTH -%if HIGH_BIT_DEPTH == 0 +%else ; HIGH_BIT_DEPTH == 0 + %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 VAR_START 1 @@ -833,7 +804,7 @@ VAR_CORE dec r2d jg .loop - VAR_END 16, 16 + VAR_END cglobal pixel_var_8x8, 2,4,8 VAR_START 1 @@ -849,7 +820,7 @@ VAR_CORE dec r2d jg .loop - VAR_END 8, 8 + VAR_END cglobal pixel_var_8x16, 2,4,8 VAR_START 1 @@ -865,15 +836,13 @@ VAR_CORE dec r2d jg .loop - VAR_END 8, 16 + VAR_END %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR -INIT_XMM xop -VAR %endif ; !HIGH_BIT_DEPTH INIT_YMM avx2 @@ -898,209 +867,357 @@ VAR_CORE dec r2d jg .loop - vextracti128 xm0, m5, 1 - vextracti128 xm1, m6, 1 - paddw xm5, xm0 - paddd xm6, xm1 - HADDW xm5, xm2 - HADDD xm6, xm1 -%if ARCH_X86_64 - punpckldq xm5, xm6 - movq rax, xm5 + VAR_END + +%macro VAR_AVX512_CORE 1 ; accum +%if %1 + paddw m0, m2 + pmaddwd m2, m2 + paddw m0, m3 + pmaddwd m3, m3 + paddd m1, m2 + paddd m1, m3 %else - movd eax, xm5 - movd edx, xm6 + paddw m0, m2, m3 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m1, m2, m3 %endif - RET +%endmacro -%macro VAR2_END 3 - HADDW %2, xm1 - movd r1d, %2 - imul r1d, r1d - HADDD %3, xm1 - shr r1d, %1 - movd eax, %3 - movd [r4], %3 - sub eax, r1d ; sqr - (sum * sum >> shift) - RET +%macro VAR_AVX512_CORE_16x16 1 ; accum +%if HIGH_BIT_DEPTH + mova ym2, [r0] + vinserti64x4 m2, [r0+r1], 1 + mova ym3, [r0+2*r1] + vinserti64x4 m3, [r0+r3], 1 +%else + vbroadcasti64x2 ym2, [r0] + vbroadcasti64x2 m2 {k1}, [r0+r1] + vbroadcasti64x2 ym3, [r0+2*r1] + vbroadcasti64x2 m3 {k1}, [r0+r3] + pshufb m2, m4 + pshufb m3, m4 +%endif + VAR_AVX512_CORE %1 %endmacro -;----------------------------------------------------------------------------- -; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * ) -;----------------------------------------------------------------------------- -%macro VAR2_8x8_MMX 2 -cglobal pixel_var2_8x%1, 5,6 - FIX_STRIDES r1, r3 - VAR_START 0 - mov r5d, %1 -.loop: +%macro VAR_AVX512_CORE_8x8 1 ; accum %if HIGH_BIT_DEPTH - mova m0, [r0] - mova m1, [r0+mmsize] - psubw m0, [r2] - psubw m1, [r2+mmsize] -%else ; !HIGH_BIT_DEPTH - movq m0, [r0] - movq m1, m0 - movq m2, [r2] - movq m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - psubw m0, m2 - psubw m1, m3 -%endif ; HIGH_BIT_DEPTH - paddw m5, m0 - paddw m5, m1 - pmaddwd m0, m0 - pmaddwd m1, m1 - paddd m6, m0 - paddd m6, m1 - add r0, r1 - add r2, r3 - dec r5d - jg .loop - VAR2_END %2, m5, m6 + mova xm2, [r0] + mova xm3, [r0+r1] +%else + movq xm2, [r0] + movq xm3, [r0+r1] +%endif 
+ vinserti128 ym2, [r0+2*r1], 1 + vinserti128 ym3, [r0+r2], 1 + lea r0, [r0+4*r1] + vinserti32x4 m2, [r0], 2 + vinserti32x4 m3, [r0+r1], 2 + vinserti32x4 m2, [r0+2*r1], 3 + vinserti32x4 m3, [r0+r2], 3 +%if HIGH_BIT_DEPTH == 0 + punpcklbw m2, m4 + punpcklbw m3, m4 +%endif + VAR_AVX512_CORE %1 %endmacro +INIT_ZMM avx512 +cglobal pixel_var_16x16, 2,4 + FIX_STRIDES r1 + mov r2d, 0xf0 + lea r3, [3*r1] +%if HIGH_BIT_DEPTH == 0 + vbroadcasti64x4 m4, [var_shuf_avx512] + kmovb k1, r2d +%endif + VAR_AVX512_CORE_16x16 0 +.loop: + lea r0, [r0+4*r1] + VAR_AVX512_CORE_16x16 1 + sub r2d, 0x50 + jg .loop %if ARCH_X86_64 == 0 -INIT_MMX mmx2 -VAR2_8x8_MMX 8, 6 -VAR2_8x8_MMX 16, 7 + pop r3d + %assign regs_used 3 +%endif +var_avx512_end: + vbroadcasti32x4 m2, [pw_1] + pmaddwd m0, m2 + SBUTTERFLY dq, 0, 1, 2 + paddd m0, m1 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 +%if ARCH_X86_64 + movq rax, xmm0 +%else + movd eax, xmm0 + pextrd edx, xmm0, 1 +%endif + RET + +%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth +cglobal pixel_var_8x8, 2,3 + lea r2, [3*r1] + pxor xm4, xm4 + VAR_AVX512_CORE_8x8 0 + jmp var_avx512_end %endif +cglobal pixel_var_8x16, 2,3 + FIX_STRIDES r1 + lea r2, [3*r1] +%if HIGH_BIT_DEPTH == 0 + pxor xm4, xm4 +%endif + VAR_AVX512_CORE_8x8 0 + lea r0, [r0+4*r1] + VAR_AVX512_CORE_8x8 1 + jmp var_avx512_end + +;----------------------------------------------------------------------------- +; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] ) +;----------------------------------------------------------------------------- + +%if ARCH_X86_64 + DECLARE_REG_TMP 6 +%else + DECLARE_REG_TMP 2 +%endif + +%macro VAR2_END 3 ; src, tmp, shift + movifnidn r2, r2mp + pshufd %2, %1, q3331 + pmuludq %1, %1 + movq [r2], %2 ; sqr_u sqr_v + psrld %1, %3 + psubd %2, %1 ; sqr - (sum * sum >> shift) + MOVHL %1, %2 + paddd %1, %2 + movd eax, %1 + RET +%endmacro + %macro VAR2_8x8_SSE2 2 -cglobal pixel_var2_8x%1, 5,6,8 - VAR_START 1 - mov r5d, %1/2 +%if HIGH_BIT_DEPTH +cglobal pixel_var2_8x%1, 2,3,6 + pxor m4, m4 + pxor m5, m5 +%define %%sum2 m4 +%define %%sqr2 m5 +%else +cglobal pixel_var2_8x%1, 2,3,7 + mova m6, [pw_00ff] +%define %%sum2 m0 +%define %%sqr2 m1 +%endif + pxor m0, m0 ; sum + pxor m1, m1 ; sqr + mov t0d, (%1-1)*FENC_STRIDEB .loop: %if HIGH_BIT_DEPTH - mova m0, [r0] - mova m1, [r0+r1*2] - mova m2, [r2] - mova m3, [r2+r3*2] -%else ; !HIGH_BIT_DEPTH - movq m1, [r0] - movhps m1, [r0+r1] - movq m3, [r2] - movhps m3, [r2+r3] - DEINTB 0, 1, 2, 3, 7 -%endif ; HIGH_BIT_DEPTH - psubw m0, m2 - psubw m1, m3 - paddw m5, m0 - paddw m5, m1 - pmaddwd m0, m0 - pmaddwd m1, m1 - paddd m6, m0 - paddd m6, m1 - lea r0, [r0+r1*2*SIZEOF_PIXEL] - lea r2, [r2+r3*2*SIZEOF_PIXEL] - dec r5d - jg .loop - VAR2_END %2, m5, m6 + mova m2, [r0+1*t0] + psubw m2, [r1+2*t0] + mova m3, [r0+1*t0+16] + psubw m3, [r1+2*t0+32] +%else + mova m3, [r0+1*t0] + movq m5, [r1+2*t0] + punpcklqdq m5, [r1+2*t0+16] + DEINTB 2, 3, 4, 5, 6 + psubw m2, m4 + psubw m3, m5 +%endif + paddw m0, m2 + pmaddwd m2, m2 + paddw %%sum2, m3 + pmaddwd m3, m3 + paddd m1, m2 + paddd %%sqr2, m3 + sub t0d, FENC_STRIDEB + jge .loop +%if HIGH_BIT_DEPTH + SBUTTERFLY dq, 0, 4, 2 + paddw m0, m4 ; sum_u sum_v + pmaddwd m0, [pw_1] + SBUTTERFLY dq, 1, 5, 2 + paddd m1, m5 ; sqr_u sqr_v + SBUTTERFLY dq, 0, 1, 2 + paddd m0, m1 +%else + pmaddwd m0, [pw_1] + shufps m2, m0, m1, q2020 + shufps m0, m1, q3131 + paddd m0, m2 + pshufd m0, m0, q3120 ; sum_u sqr_u sum_v 
sqr_v +%endif + VAR2_END m0, m1, %2 %endmacro INIT_XMM sse2 VAR2_8x8_SSE2 8, 6 VAR2_8x8_SSE2 16, 7 +%macro VAR2_CORE 3 ; src1, src2, accum +%if %3 + paddw m0, %1 + pmaddwd %1, %1 + paddw m0, %2 + pmaddwd %2, %2 + paddd m1, %1 + paddd m1, %2 +%else + paddw m0, %1, %2 + pmaddwd %1, %1 + pmaddwd %2, %2 + paddd m1, %1, %2 +%endif +%endmacro + %if HIGH_BIT_DEPTH == 0 -%macro VAR2_8x8_SSSE3 2 -cglobal pixel_var2_8x%1, 5,6,8 - pxor m5, m5 ; sum - pxor m6, m6 ; sum squared - mova m7, [hsub_mul] - mov r5d, %1/4 +INIT_XMM ssse3 +cglobal pixel_var2_internal + pxor m0, m0 ; sum + pxor m1, m1 ; sqr .loop: - movq m0, [r0] - movq m2, [r2] - movq m1, [r0+r1] - movq m3, [r2+r3] - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, [r0] - movq m3, [r2] - punpcklbw m2, m3 - movq m3, [r0+r1] - movq m4, [r2+r3] - punpcklbw m3, m4 - pmaddubsw m0, m7 - pmaddubsw m1, m7 - pmaddubsw m2, m7 - pmaddubsw m3, m7 - paddw m5, m0 - paddw m5, m1 - paddw m5, m2 - paddw m5, m3 - pmaddwd m0, m0 - pmaddwd m1, m1 - pmaddwd m2, m2 - pmaddwd m3, m3 - paddd m6, m0 - paddd m6, m1 - paddd m6, m2 - paddd m6, m3 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - dec r5d + movq m2, [r0+1*t0] + punpcklbw m2, [r1+2*t0] + movq m3, [r0+1*t0-1*FENC_STRIDE] + punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE] + movq m4, [r0+1*t0-2*FENC_STRIDE] + punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE] + movq m5, [r0+1*t0-3*FENC_STRIDE] + punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE] + pmaddubsw m2, m7 + pmaddubsw m3, m7 + pmaddubsw m4, m7 + pmaddubsw m5, m7 + VAR2_CORE m2, m3, 1 + VAR2_CORE m4, m5, 1 + sub t0d, 4*FENC_STRIDE jg .loop - VAR2_END %2, m5, m6 + pmaddwd m0, [pw_1] + ret + +%macro VAR2_8x8_SSSE3 2 +cglobal pixel_var2_8x%1, 2,3,8 + mova m7, [hsub_mul] + mov t0d, (%1-1)*FENC_STRIDE + call pixel_var2_internal_ssse3 ; u + add r0, 8 + add r1, 16 + SBUTTERFLY qdq, 0, 1, 6 + paddd m1, m0 + mov t0d, (%1-1)*FENC_STRIDE + call pixel_var2_internal_ssse3 ; v + SBUTTERFLY qdq, 0, 6, 2 + paddd m0, m6 + phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v + VAR2_END m1, m0, %2 %endmacro -INIT_XMM ssse3 -VAR2_8x8_SSSE3 8, 6 -VAR2_8x8_SSSE3 16, 7 -INIT_XMM xop VAR2_8x8_SSSE3 8, 6 VAR2_8x8_SSSE3 16, 7 +%endif ; !HIGH_BIT_DEPTH + +%macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset +%if HIGH_BIT_DEPTH +%if mmsize == 64 + mova m2, [r1+2*%1+%2*FDEC_STRIDEB] + vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020 + mova m3, [r1+2*%1+%3*FDEC_STRIDEB] + vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020 +%else + mova xm2, [r1+2*%1+%2*FDEC_STRIDEB] + vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1 + mova xm3, [r1+2*%1+%3*FDEC_STRIDEB] + vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1 +%endif + psubw m2, [r0+1*%1+%2*FENC_STRIDEB] + psubw m3, [r0+1*%1+%3*FENC_STRIDEB] +%else + pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE] + mova m4, [r1+2*%1+%2*FDEC_STRIDE] + pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE] + mova m5, [r1+2*%1+%3*FDEC_STRIDE] + punpcklbw m4, m6 + punpcklbw m5, m6 + psubw m2, m4 + psubw m3, m5 +%endif +%endmacro %macro VAR2_8x8_AVX2 2 -cglobal pixel_var2_8x%1, 5,6,6 - pxor m3, m3 ; sum - pxor m4, m4 ; sum squared - mova m5, [hsub_mul] - mov r5d, %1/4 +%if HIGH_BIT_DEPTH +cglobal pixel_var2_8x%1, 2,3,4 +%else +cglobal pixel_var2_8x%1, 2,3,7 + pxor m6, m6 +%endif + mov t0d, (%1-3)*FENC_STRIDEB + VAR2_AVX2_LOAD t0, 2, 1 + VAR2_CORE m2, m3, 0 .loop: - movq xm0, [r0] - movq xm1, [r2] - vinserti128 m0, m0, [r0+r1], 1 - vinserti128 m1, m1, [r2+r3], 1 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - punpcklbw m0, m1 - movq xm1, [r0] - movq xm2, [r2] - vinserti128 m1, m1, [r0+r1], 1 - 
vinserti128 m2, m2, [r2+r3], 1 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - punpcklbw m1, m2 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - paddw m3, m0 - paddw m3, m1 - pmaddwd m0, m0 - pmaddwd m1, m1 - paddd m4, m0 - paddd m4, m1 - dec r5d + VAR2_AVX2_LOAD t0, 0, -1 + VAR2_CORE m2, m3, 1 + sub t0d, 2*FENC_STRIDEB jg .loop - vextracti128 xm0, m3, 1 - vextracti128 xm1, m4, 1 - paddw xm3, xm0 - paddd xm4, xm1 - VAR2_END %2, xm3, xm4 + + pmaddwd m0, [pw_1] + SBUTTERFLY qdq, 0, 1, 2 + paddd m0, m1 + vextracti128 xm1, m0, 1 + phaddd xm0, xm1 + VAR2_END xm0, xm1, %2 %endmacro INIT_YMM avx2 VAR2_8x8_AVX2 8, 6 VAR2_8x8_AVX2 16, 7 -%endif ; !HIGH_BIT_DEPTH +%macro VAR2_AVX512_END 1 ; shift + vbroadcasti32x4 m2, [pw_1] + pmaddwd m0, m2 + SBUTTERFLY qdq, 0, 1, 2 + paddd m0, m1 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 + psrlq ym1, ym0, 32 + paddd ym0, ym1 + vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v + VAR2_END xmm0, xmm1, %1 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_var2_8x8, 2,3 +%if HIGH_BIT_DEPTH == 0 + pxor xm6, xm6 +%endif + VAR2_AVX2_LOAD 0, 0, 2 + VAR2_CORE m2, m3, 0 + VAR2_AVX2_LOAD 0, 4, 6 + VAR2_CORE m2, m3, 1 + VAR2_AVX512_END 6 + +cglobal pixel_var2_8x16, 2,3 +%if HIGH_BIT_DEPTH == 0 + pxor xm6, xm6 +%endif + mov t0d, 10*FENC_STRIDEB + VAR2_AVX2_LOAD 0, 14, 12 + VAR2_CORE m2, m3, 0 +.loop: + VAR2_AVX2_LOAD t0, 0, -2 + VAR2_CORE m2, m3, 1 + sub t0d, 4*FENC_STRIDEB + jg .loop + VAR2_AVX512_END 7 ;============================================================================= ; SATD @@ -4583,6 +4700,244 @@ mov rsp, r6 mov eax, r2d RET + +%macro SATD_AVX512_LOAD4 2 ; size, opmask + vpbroadcast%1 m0, [r0] + vpbroadcast%1 m0 {%2}, [r0+2*r1] + vpbroadcast%1 m2, [r2] + vpbroadcast%1 m2 {%2}, [r2+2*r3] + add r0, r1 + add r2, r3 + vpbroadcast%1 m1, [r0] + vpbroadcast%1 m1 {%2}, [r0+2*r1] + vpbroadcast%1 m3, [r2] + vpbroadcast%1 m3 {%2}, [r2+2*r3] +%endmacro + +%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3 + vpbroadcast%1 %{2}0, [r0] + vpbroadcast%1 %{2}0 {%3}, [r0+2*r1] + vpbroadcast%1 %{2}2, [r2] + vpbroadcast%1 %{2}2 {%3}, [r2+2*r3] + vpbroadcast%1 m0 {%4}, [r0+4*r1] + vpbroadcast%1 m2 {%4}, [r2+4*r3] + vpbroadcast%1 m0 {%5}, [r0+2*r4] + vpbroadcast%1 m2 {%5}, [r2+2*r5] + vpbroadcast%1 %{2}1, [r0+r1] + vpbroadcast%1 %{2}1 {%3}, [r0+r4] + vpbroadcast%1 %{2}3, [r2+r3] + vpbroadcast%1 %{2}3 {%3}, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + vpbroadcast%1 m1 {%4}, [r0+r1] + vpbroadcast%1 m3 {%4}, [r2+r3] + vpbroadcast%1 m1 {%5}, [r0+r4] + vpbroadcast%1 m3 {%5}, [r2+r5] +%endmacro + +%macro SATD_AVX512_PACKED 0 + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 + SUMSUB_BA w, 0, 1, 2 + SBUTTERFLY qdq, 0, 1, 2 + SUMSUB_BA w, 0, 1, 2 + HMAXABSW2 0, 1, 2, 3 +%endmacro + +%macro SATD_AVX512_END 0-1 0 ; sa8d + paddw m0 {k1}{z}, m1 ; zero-extend to dwords +%if ARCH_X86_64 +%if mmsize == 64 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 +%endif +%if mmsize >= 32 + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 +%endif + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 + movq rax, xmm0 + rorx rdx, rax, 32 +%if %1 + lea eax, [rax+rdx+1] + shr eax, 1 +%else + add eax, edx +%endif +%else + HADDD m0, m1 + movd eax, xm0 +%if %1 + inc eax + shr eax, 1 +%endif +%endif + RET +%endmacro + +%macro HMAXABSW2 4 ; a, b, tmp1, tmp2 + pabsw m%1, m%1 + pabsw m%2, m%2 + psrldq m%3, m%1, 2 + psrld m%4, m%2, 16 + pmaxsw m%1, m%3 + pmaxsw m%2, m%4 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_satd_16x8_internal + vbroadcasti64x4 m6, [hmul_16p] + kxnorb k2, k2, k2 + mov r4d, 0x55555555 + knotw k2, k2 + kmovd k1, r4d + lea 
r4, [3*r1] + lea r5, [3*r3] +satd_16x8_avx512: + vbroadcasti128 ym0, [r0] + vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4 + vbroadcasti128 ym4, [r2] + vbroadcasti32x4 m4 {k2}, [r2+4*r3] + vbroadcasti128 ym2, [r0+2*r1] + vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6 + vbroadcasti128 ym5, [r2+2*r3] + vbroadcasti32x4 m5 {k2}, [r2+2*r5] + DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6 + vbroadcasti128 ym1, [r0+r1] + vbroadcasti128 ym4, [r2+r3] + vbroadcasti128 ym3, [r0+r4] + vbroadcasti128 ym5, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5 + vbroadcasti32x4 m4 {k2}, [r2+r3] + vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7 + vbroadcasti32x4 m5 {k2}, [r2+r5] + DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6 + HADAMARD4_V 0, 1, 2, 3, 4 + HMAXABSW2 0, 2, 4, 5 + HMAXABSW2 1, 3, 4, 5 + paddw m4, m0, m2 ; m1 + paddw m2, m1, m3 ; m0 + ret + +cglobal pixel_satd_8x8_internal + vbroadcasti64x4 m4, [hmul_16p] + mov r4d, 0x55555555 + kmovd k1, r4d ; 01010101 + kshiftlb k2, k1, 5 ; 10100000 + kshiftlb k3, k1, 4 ; 01010000 + lea r4, [3*r1] + lea r5, [3*r3] +satd_8x8_avx512: + SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 + SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5 + ret + +cglobal pixel_satd_16x8, 4,6 + call pixel_satd_16x8_internal_avx512 + jmp satd_zmm_avx512_end + +cglobal pixel_satd_16x16, 4,6 + call pixel_satd_16x8_internal_avx512 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m7, m0, m1 + call satd_16x8_avx512 + paddw m1, m7 + jmp satd_zmm_avx512_end + +cglobal pixel_satd_8x8, 4,6 + call pixel_satd_8x8_internal_avx512 +satd_zmm_avx512_end: + SATD_AVX512_END + +cglobal pixel_satd_8x16, 4,6 + call pixel_satd_8x8_internal_avx512 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m5, m0, m1 + call satd_8x8_avx512 + paddw m1, m5 + jmp satd_zmm_avx512_end + +INIT_YMM avx512 +cglobal pixel_satd_4x8_internal + vbroadcasti128 m4, [hmul_4p] + mov r4d, 0x55550c + kmovd k2, r4d ; 00001100 + kshiftlb k3, k2, 2 ; 00110000 + kshiftlb k4, k2, 4 ; 11000000 + kshiftrd k1, k2, 8 ; 01010101 + lea r4, [3*r1] + lea r5, [3*r3] +satd_4x8_avx512: + SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6 +satd_ymm_avx512: ; 1 1 3 3 5 5 7 7 + SATD_AVX512_PACKED + ret + +cglobal pixel_satd_8x4, 4,5 + mova m4, [hmul_16p] + mov r4d, 0x5555 + kmovw k1, r4d + SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0 + call satd_ymm_avx512 ; 3 1 3 1 + jmp satd_ymm_avx512_end2 + +cglobal pixel_satd_4x8, 4,6 + call pixel_satd_4x8_internal_avx512 +satd_ymm_avx512_end: +%if ARCH_X86_64 == 0 + pop r5d + %assign regs_used 5 +%endif +satd_ymm_avx512_end2: + SATD_AVX512_END + +cglobal pixel_satd_4x16, 4,6 + call pixel_satd_4x8_internal_avx512 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m5, m0, m1 + call satd_4x8_avx512 + paddw m1, m5 + jmp satd_ymm_avx512_end + +INIT_XMM avx512 +cglobal pixel_satd_4x4, 4,5 + mova m4, [hmul_4p] + mov r4d, 0x550c + kmovw k2, r4d + kshiftrw k1, k2, 8 + SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2 + SATD_AVX512_PACKED ; 1 1 3 3 + SWAP 0, 1 + SATD_AVX512_END + +INIT_ZMM avx512 +cglobal pixel_sa8d_8x8, 4,6 + vbroadcasti64x4 m4, [hmul_16p] + mov r4d, 0x55555555 + kmovd k1, r4d ; 01010101 + kshiftlb k2, k1, 5 ; 10100000 + kshiftlb k3, k1, 4 ; 01010000 + lea r4, [3*r1] + lea r5, [3*r3] + SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5 + SUMSUB_BA w, 0, 1, 2 + SBUTTERFLY qdq, 0, 1, 2 + SUMSUB_BA w, 0, 1, 2 + shufps m2, m0, m1, q2020 + shufps m1, m0, m1, q3131 + SUMSUB_BA w, 2, 1, 0 + vshufi32x4 m0, m2, m1, q1010 + vshufi32x4 m1, m2, m1, q3232 + SUMSUB_BA w, 0, 1, 2 + HMAXABSW2 0, 1, 
2, 3 + SATD_AVX512_END 1 + %endif ; HIGH_BIT_DEPTH ;============================================================================= diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/pixel.h x264-0.152.2854+gite9a5903/common/x86/pixel.h --- x264-0.148.2795+gitaaa9aa8/common/x86/pixel.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/pixel.h 2017-12-31 12:50:51.000000000 +0000 @@ -52,6 +52,7 @@ DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) DECL_X1( sad, avx2 ) +DECL_X1( sad, avx512 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) @@ -59,6 +60,7 @@ DECL_X4( sad, xop ) DECL_X4( sad, avx ) DECL_X4( sad, avx2 ) +DECL_X4( sad, avx512 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) @@ -75,6 +77,7 @@ DECL_X1( satd, avx ) DECL_X1( satd, xop ) DECL_X1( satd, avx2 ) +DECL_X1( satd, avx512 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) @@ -83,6 +86,7 @@ DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) DECL_X1( sa8d, avx2 ) +DECL_X1( sa8d, avx512 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); @@ -92,11 +96,10 @@ DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); -DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) @@ -165,16 +168,14 @@ const pixel *pix2, intptr_t stride2, int sums[2][4] ); float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width ); -int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] ); +int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int 
ssd[2] ); +int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] ); int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/quant-a.asm x264-0.152.2854+gite9a5903/common/x86/quant-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/quant-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/quant-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -30,7 +30,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 + +%if HIGH_BIT_DEPTH +decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15 +%else +dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30 + dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62 +%endif %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -42,14 +49,6 @@ dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro -dequant4_scale: - DQM4 10, 13, 16 - DQM4 11, 14, 18 - DQM4 13, 16, 20 - DQM4 14, 18, 23 - DQM4 16, 20, 25 - DQM4 18, 23, 29 - dequant8_scale: DQM8 20, 18, 32, 19, 25, 24 DQM8 22, 19, 35, 21, 28, 26 @@ -58,6 +57,14 @@ DQM8 32, 28, 51, 30, 40, 38 DQM8 36, 32, 58, 34, 46, 43 +dequant4_scale: + DQM4 10, 13, 16 + DQM4 11, 14, 18 + DQM4 13, 16, 20 + DQM4 14, 18, 23 + DQM4 16, 20, 25 + DQM4 18, 23, 29 + decimate_mask_table4: db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14 @@ -743,6 +750,163 @@ DEQUANT 8, 6, 4 %endif +%macro DEQUANT_START_AVX512 1-2 0 ; shift, flat +%if %2 == 0 + movifnidn t2d, r2m +%endif + imul t0d, t2d, 0x2b + shr t0d, 8 ; i_qbits = i_qp / 6 + lea t1d, [t0*5] + sub t2d, t0d + sub t2d, t1d ; i_mf = i_qp % 6 + shl t2d, %1 +%if %2 +%ifdef PIC +%define dmf r1+t2 + lea r1, [dequant8_scale] +%else +%define dmf t2+dequant8_scale +%endif +%elif ARCH_X86_64 +%define dmf r1+t2 +%else +%define dmf r1 + add r1, r1mp ; dequant_mf[i_mf] +%endif + movifnidn r0, r0mp +%endmacro + +INIT_ZMM avx512 +cglobal dequant_4x4, 0,3 + DEQUANT_START_AVX512 6 + mova m0, [dmf] +%if HIGH_BIT_DEPTH + pmaddwd m0, [r0] +%endif + sub t0d, 4 + jl .rshift +%if HIGH_BIT_DEPTH + vpbroadcastd m1, t0d + vpsllvd m0, m1 + mova [r0], m0 +%else + vpbroadcastw ym1, t0d + vpmovsdw ym0, m0 + pmullw ym0, [r0] + vpsllvw ym0, ym1 + mova [r0], ym0 +%endif + RET +.rshift: +%if HIGH_BIT_DEPTH == 0 + pmovzxwd m1, [r0] + pmaddwd m0, m1 +%endif + mov r1d, 1<<31 + shrx r1d, r1d, t0d ; 1 << (-i_qbits-1) + neg t0d + vpbroadcastd m1, r1d + vpbroadcastd m2, t0d + paddd m0, m1 + vpsravd m0, m2 +%if HIGH_BIT_DEPTH + mova [r0], m0 +%else + vpmovsdw [r0], m0 +%endif + RET + +cglobal dequant_8x8, 0,3 + DEQUANT_START_AVX512 8 + mova m0, [dmf+0*64] + mova m1, [dmf+1*64] + mova m2, [dmf+2*64] + mova m3, [dmf+3*64] +%if HIGH_BIT_DEPTH + pmaddwd m0, [r0+0*64] + pmaddwd m1, [r0+1*64] + pmaddwd m2, [r0+2*64] + pmaddwd m3, [r0+3*64] +%else + mova m6, [dequant_shuf_avx512] +%endif + sub t0d, 6 + jl .rshift +%if HIGH_BIT_DEPTH + vpbroadcastd m4, t0d + vpsllvd m0, m4 + vpsllvd m1, m4 + vpsllvd m2, m4 + vpsllvd m3, m4 + jmp .end +.rshift: +%else + vpbroadcastw m4, t0d + vpermt2w m0, m6, m1 + vpermt2w m2, m6, m3 + pmullw m0, [r0] + pmullw m2, [r0+64] + vpsllvw m0, m4 + vpsllvw m2, m4 + mova [r0], m0 + mova [r0+64], m2 + RET +.rshift: + pmovzxwd m4, [r0+0*32] + pmovzxwd 
m5, [r0+1*32] + pmaddwd m0, m4 + pmaddwd m1, m5 + pmovzxwd m4, [r0+2*32] + pmovzxwd m5, [r0+3*32] + pmaddwd m2, m4 + pmaddwd m3, m5 +%endif + mov r1d, 1<<31 + shrx r1d, r1d, t0d ; 1 << (-i_qbits-1) + neg t0d + vpbroadcastd m4, r1d + vpbroadcastd m5, t0d + paddd m0, m4 + paddd m1, m4 + vpsravd m0, m5 + vpsravd m1, m5 + paddd m2, m4 + paddd m3, m4 + vpsravd m2, m5 + vpsravd m3, m5 +%if HIGH_BIT_DEPTH +.end: + mova [r0+0*64], m0 + mova [r0+1*64], m1 + mova [r0+2*64], m2 + mova [r0+3*64], m3 +%else + vpermt2w m0, m6, m1 + vpermt2w m2, m6, m3 + mova [r0], m0 + mova [r0+64], m2 +%endif + RET + +%if HIGH_BIT_DEPTH == 0 +cglobal dequant_8x8_flat16, 0,3 + movifnidn t2d, r2m + cmp t2d, 12 + jl dequant_8x8_avx512 + sub t2d, 12 + DEQUANT_START_AVX512 6, 1 + vpbroadcastw m0, t0d + mova m1, [dmf] + vpsllvw m1, m0 + pmullw m0, m1, [r0] + pmullw m1, [r0+64] + mova [r0], m0 + mova [r0+64], m1 + RET +%endif + +%undef dmf + %macro DEQUANT_DC 2 cglobal dequant_4x4dc, 0,3,6 DEQUANT_START 6, 6 @@ -1208,13 +1372,12 @@ ; int decimate_score( dctcoef *dct ) ;----------------------------------------------------------------------------- -%macro DECIMATE_MASK 5 -%if mmsize==16 +%macro DECIMATE_MASK 4 %if HIGH_BIT_DEPTH - movdqa m0, [%3+ 0] - movdqa m1, [%3+32] - packssdw m0, [%3+16] - packssdw m1, [%3+48] + mova m0, [%3+0*16] + packssdw m0, [%3+1*16] + mova m1, [%3+2*16] + packssdw m1, [%3+3*16] ABSW2 m0, m1, m0, m1, m3, m4 %else ABSW m0, [%3+ 0], m3 @@ -1226,40 +1389,35 @@ pcmpgtb m0, %4 pmovmskb %1, m2 pmovmskb %2, m0 -%else ; mmsize==8 +%endmacro + +%macro DECIMATE_MASK16_AVX512 0 + mova m0, [r0] %if HIGH_BIT_DEPTH - movq m0, [%3+ 0] - movq m1, [%3+16] - movq m2, [%3+32] - movq m3, [%3+48] - packssdw m0, [%3+ 8] - packssdw m1, [%3+24] - packssdw m2, [%3+40] - packssdw m3, [%3+56] -%else - movq m0, [%3+ 0] - movq m1, [%3+ 8] - movq m2, [%3+16] - movq m3, [%3+24] + vptestmd k0, m0, m0 + pabsd m0, m0 + vpcmpud k1, m0, [pd_1] {1to16}, 6 +%else + vptestmw k0, m0, m0 + pabsw m0, m0 + vpcmpuw k1, m0, [pw_1], 6 %endif - ABSW2 m0, m1, m0, m1, m6, m7 - ABSW2 m2, m3, m2, m3, m6, m7 - packsswb m0, m1 - packsswb m2, m3 - pxor m4, m4 - pxor m6, m6 - pcmpeqb m4, m0 - pcmpeqb m6, m2 - pcmpgtb m0, %4 - pcmpgtb m2, %4 - pmovmskb %5, m4 - pmovmskb %1, m6 - shl %1, 8 - or %1, %5 - pmovmskb %5, m0 - pmovmskb %2, m2 - shl %2, 8 - or %2, %5 +%endmacro + +%macro SHRX 2 +%if cpuflag(bmi2) + shrx %1, %1, %2 +%else + shr %1, %2b ; %2 has to be rcx/ecx +%endif +%endmacro + +%macro BLSR 2 +%if cpuflag(bmi1) + blsr %1, %2 +%else + lea %1, [%2-1] + and %1, %2 %endif %endmacro @@ -1269,33 +1427,60 @@ %macro DECIMATE4x4 1 cglobal decimate_score%1, 1,3 -%ifdef PIC - lea r4, [decimate_table4] - lea r5, [decimate_mask_table4] - %define table r4 - %define mask_table r5 +%if cpuflag(avx512) + DECIMATE_MASK16_AVX512 + xor eax, eax + kmovw edx, k0 +%if %1 == 15 + shr edx, 1 %else - %define table decimate_table4 - %define mask_table decimate_mask_table4 + test edx, edx %endif - DECIMATE_MASK edx, eax, r0, [pb_1], ecx + jz .ret + ktestw k1, k1 + jnz .ret9 +%else + DECIMATE_MASK edx, eax, r0, [pb_1] xor edx, 0xffff - je .ret + jz .ret test eax, eax - jne .ret9 -%if %1==15 + jnz .ret9 +%if %1 == 15 shr edx, 1 %endif +%endif +%ifdef PIC + lea r4, [decimate_mask_table4] + %define mask_table r4 +%else + %define mask_table decimate_mask_table4 +%endif movzx ecx, dl movzx eax, byte [mask_table + rcx] +%if ARCH_X86_64 + xor edx, ecx + jz .ret +%if cpuflag(lzcnt) + lzcnt ecx, ecx + lea r5, [decimate_table4-32] + add r5, rcx +%else + bsr ecx, ecx + lea r5, 
[decimate_table4-1] + sub r5, rcx +%endif + %define table r5 +%else cmp edx, ecx - je .ret + jz .ret bsr ecx, ecx shr edx, 1 - shr edx, cl + SHRX edx, ecx + %define table decimate_table4 +%endif tzcnt ecx, edx shr edx, 1 - shr edx, cl + SHRX edx, ecx add al, byte [table + rcx] add al, byte [mask_table + rdx] .ret: @@ -1303,175 +1488,224 @@ .ret9: mov eax, 9 RET - %endmacro -%if ARCH_X86_64 == 0 -INIT_MMX mmx2 -DECIMATE4x4 15 -DECIMATE4x4 16 -%endif -INIT_XMM sse2 -DECIMATE4x4 15 -DECIMATE4x4 16 -INIT_XMM ssse3 -DECIMATE4x4 15 -DECIMATE4x4 16 - -; 2x gt1 output, 2x nz output, 1x mask -%macro DECIMATE_MASK64_AVX2 5 - pabsw m0, [r0+ 0] - pabsw m2, [r0+32] - pabsw m1, [r0+64] - pabsw m3, [r0+96] - packsswb m0, m2 - packsswb m1, m3 - pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so - pcmpgtb m3, m1, %5 ; we can save latency by doing them here - pmovmskb %1, m2 - pmovmskb %2, m3 - or %1, %2 - jne .ret9 +%macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high + mova m0, [r0+0*32] + packsswb m0, [r0+1*32] + mova m1, [r0+2*32] + packsswb m1, [r0+3*32] + mova m4, [pb_1] + pabsb m2, m0 + pabsb m3, m1 + por m2, m3 ; the > 1 checks don't care about order, so + ptest m4, m2 ; we can save latency by doing them here + jnc .ret9 vpermq m0, m0, q3120 vpermq m1, m1, q3120 pxor m4, m4 pcmpeqb m0, m4 pcmpeqb m1, m4 - pmovmskb %3, m0 - pmovmskb %4, m1 + pmovmskb %1, m0 + pmovmskb %2, m1 %endmacro -%macro DECIMATE8x8 0 +%macro DECIMATE_MASK64_AVX512 0 + mova m0, [r0] +%if HIGH_BIT_DEPTH + packssdw m0, [r0+1*64] + mova m1, [r0+2*64] + packssdw m1, [r0+3*64] + packsswb m0, m1 + vbroadcasti32x4 m1, [pb_1] + pabsb m2, m0 + vpcmpub k0, m2, m1, 6 + ktestq k0, k0 + jnz .ret9 + mova m1, [decimate_shuf_avx512] + vpermd m0, m1, m0 + vptestmb k1, m0, m0 +%else + mova m1, [r0+64] + vbroadcasti32x4 m3, [pb_1] + packsswb m2, m0, m1 + pabsb m2, m2 + vpcmpub k0, m2, m3, 6 + ktestq k0, k0 + jnz .ret9 + vptestmw k1, m0, m0 + vptestmw k2, m1, m1 +%endif +%endmacro +%macro DECIMATE8x8 0 %if ARCH_X86_64 cglobal decimate_score64, 1,5 +%if mmsize == 64 + DECIMATE_MASK64_AVX512 + xor eax, eax +%if HIGH_BIT_DEPTH + kmovq r1, k1 + test r1, r1 + jz .ret +%else + kortestd k1, k2 + jz .ret + kunpckdq k1, k2, k1 + kmovq r1, k1 +%endif +%elif mmsize == 32 + DECIMATE_MASK64_AVX2 r1d, eax + not r1 + shl rax, 32 + xor r1, rax + jz .ret +%else + mova m5, [pb_1] + DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5 + test eax, eax + jnz .ret9 + DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5 + shl r2d, 16 + or r1d, r2d + DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5 + shl r2, 32 + or eax, r3d + or r1, r2 + DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5 + not r1 + shl r2, 48 + xor r1, r2 + jz .ret + add eax, r3d + jnz .ret9 +%endif %ifdef PIC lea r4, [decimate_table8] %define table r4 %else %define table decimate_table8 %endif - mova m5, [pb_1] -%if mmsize==32 - DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5 - shl r3, 32 - or r1, r3 - xor r1, -1 - je .ret -%else - DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null - test eax, eax - jne .ret9 - DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null - shl r2d, 16 - or r1d, r2d - DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null - shl r2, 32 - or eax, r3d - or r1, r2 - DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null - shl r2, 48 - or r1, r2 - xor r1, -1 - je .ret - add eax, r3d - jne .ret9 -%endif - mov al, -6 + mov al, -6 .loop: tzcnt rcx, r1 - shr r1, cl - add al, byte [table + rcx] - jge .ret9 - shr r1, 1 - jne .loop - add al, 6 + add al, byte [table + rcx] + jge .ret9 + shr 
r1, 1 + SHRX r1, rcx +%if cpuflag(bmi2) + test r1, r1 +%endif + jnz .loop + add al, 6 .ret: REP_RET .ret9: - mov eax, 9 + mov eax, 9 RET %else ; ARCH -%if mmsize == 8 -cglobal decimate_score64, 1,6 -%else -cglobal decimate_score64, 1,5 +cglobal decimate_score64, 1,4 +%if mmsize == 64 + DECIMATE_MASK64_AVX512 + xor eax, eax +%if HIGH_BIT_DEPTH + kshiftrq k2, k1, 32 %endif - mova m5, [pb_1] -%if mmsize==32 - DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5 - xor r3, -1 - je .tryret - xor r4, -1 -.cont: -%else - DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5 - test r2, r2 - jne .ret9 - DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5 - shl r4, 16 - or r3, r4 - DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5 - or r2, r1 - DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5 - shl r1, 16 - or r4, r1 - xor r3, -1 - je .tryret - xor r4, -1 -.cont: - add r0, r2 - jne .ret9 + kmovd r2, k1 + kmovd r3, k2 + test r2, r2 + jz .tryret +%elif mmsize == 32 + DECIMATE_MASK64_AVX2 r2, r3 + xor eax, eax + not r3 + xor r2, -1 + jz .tryret +%else + mova m5, [pb_1] + DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5 + test r1, r1 + jnz .ret9 + DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5 + not r2 + shl r3, 16 + xor r2, r3 + mov r0m, r2 + DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5 + or r2, r1 + DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5 + add r0, r2 + jnz .ret9 + mov r2, r0m + not r3 + shl r1, 16 + xor r3, r1 + test r2, r2 + jz .tryret %endif - mov al, -6 + mov al, -6 .loop: + tzcnt ecx, r2 + add al, byte [decimate_table8 + ecx] + jge .ret9 + sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well + jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31 + shrd r2, r3, cl + SHRX r3, ecx +%if notcpuflag(bmi2) + test r2, r2 +%endif + jnz .loop + BLSR r2, r3 + jz .end +.largerun: tzcnt ecx, r3 - test r3, r3 - je .largerun - shrd r3, r4, cl - shr r4, cl - add al, byte [decimate_table8 + ecx] - jge .ret9 - shrd r3, r4, 1 - shr r4, 1 - test r3, r3 - jne .loop - test r4, r4 - jne .loop - add al, 6 -.ret: - REP_RET -.tryret: - xor r4, -1 - jne .cont + shr r3, 1 + SHRX r3, ecx +.loop2: + tzcnt ecx, r3 + add al, byte [decimate_table8 + ecx] + jge .ret9 + shr r3, 1 + SHRX r3, ecx +.run31: + test r3, r3 + jnz .loop2 +.end: + add al, 6 RET +.tryret: + BLSR r2, r3 + jz .ret + mov al, -6 + jmp .largerun .ret9: mov eax, 9 - RET -.largerun: - mov r3, r4 - xor r4, r4 - tzcnt ecx, r3 - shr r3, cl - shr r3, 1 - jne .loop - add al, 6 - RET +.ret: + REP_RET %endif ; ARCH - %endmacro -%if ARCH_X86_64 == 0 -INIT_MMX mmx2 -DECIMATE8x8 -%endif INIT_XMM sse2 +DECIMATE4x4 15 +DECIMATE4x4 16 DECIMATE8x8 INIT_XMM ssse3 +DECIMATE4x4 15 +DECIMATE4x4 16 DECIMATE8x8 +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +%else INIT_YMM avx2 DECIMATE8x8 +INIT_YMM avx512 +%endif +DECIMATE4x4 15 +DECIMATE4x4 16 +INIT_ZMM avx512 +DECIMATE8x8 ;----------------------------------------------------------------------------- ; int coeff_last( dctcoef *dct ) @@ -1556,7 +1790,7 @@ INIT_MMX mmx2 COEFF_LAST4 -INIT_MMX mmx2, lzcnt +INIT_MMX lzcnt COEFF_LAST4 %macro COEFF_LAST8 0 @@ -1579,7 +1813,7 @@ %endif INIT_XMM sse2 COEFF_LAST8 -INIT_XMM sse2, lzcnt +INIT_XMM lzcnt COEFF_LAST8 %else ; !HIGH_BIT_DEPTH @@ -1642,7 +1876,7 @@ INIT_MMX mmx2 COEFF_LAST48 -INIT_MMX mmx2, lzcnt +INIT_MMX lzcnt COEFF_LAST48 %endif ; HIGH_BIT_DEPTH @@ -1707,7 +1941,7 @@ %endif INIT_XMM sse2 COEFF_LAST -INIT_XMM sse2, lzcnt +INIT_XMM lzcnt COEFF_LAST %macro LAST_MASK_AVX2 2 @@ -1729,7 +1963,7 @@ %endmacro %if ARCH_X86_64 == 0 -INIT_YMM 
avx2,lzcnt +INIT_YMM avx2 cglobal coeff_last64, 1,2 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32 @@ -1744,7 +1978,7 @@ add eax, 32 RET %else -INIT_YMM avx2,lzcnt +INIT_YMM avx2 cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 @@ -1756,6 +1990,70 @@ RET %endif +%macro COEFF_LAST_AVX512 2 ; num, w/d +cglobal coeff_last%1, 1,2 + mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF] + vptestm%2 k0, m0, m0 +%if %1 == 15 + mov eax, 30 + kmovw r1d, k0 + lzcnt r1d, r1d + sub eax, r1d +%else + kmovw eax, k0 + lzcnt eax, eax + xor eax, 31 +%endif + RET +%endmacro + +%macro COEFF_LAST64_AVX512 1 ; w/d +cglobal coeff_last64, 1,2 + pxor xm0, xm0 + vpcmp%1 k0, m0, [r0+0*64], 4 + vpcmp%1 k1, m0, [r0+1*64], 4 +%if HIGH_BIT_DEPTH + vpcmp%1 k2, m0, [r0+2*64], 4 + vpcmp%1 k3, m0, [r0+3*64], 4 + kunpckwd k0, k1, k0 + kunpckwd k1, k3, k2 +%endif +%if ARCH_X86_64 + kunpckdq k0, k1, k0 + kmovq rax, k0 + lzcnt rax, rax + xor eax, 63 +%else + kmovd r1d, k1 + kmovd eax, k0 + lzcnt r1d, r1d + lzcnt eax, eax + xor r1d, 32 + cmovnz eax, r1d + xor eax, 31 +%endif + RET +%endmacro + +%if HIGH_BIT_DEPTH +INIT_XMM avx512 +COEFF_LAST_AVX512 4, d +INIT_YMM avx512 +COEFF_LAST_AVX512 8, d +INIT_ZMM avx512 +COEFF_LAST_AVX512 15, d +COEFF_LAST_AVX512 16, d +COEFF_LAST64_AVX512 d +%else ; !HIGH_BIT_DEPTH +INIT_XMM avx512 +COEFF_LAST_AVX512 8, w +INIT_YMM avx512 +COEFF_LAST_AVX512 15, w +COEFF_LAST_AVX512 16, w +INIT_ZMM avx512 +COEFF_LAST64_AVX512 w +%endif ; !HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel ) ;----------------------------------------------------------------------------- @@ -1833,15 +2131,17 @@ %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 -INIT_XMM sse2, lzcnt +INIT_MMX lzcnt +COEFF_LEVELRUN 4 +%if HIGH_BIT_DEPTH == 0 +COEFF_LEVELRUN 8 +%endif +INIT_XMM lzcnt %if HIGH_BIT_DEPTH COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 -INIT_MMX mmx2, lzcnt -COEFF_LEVELRUN 4 -COEFF_LEVELRUN 8 ; Similar to the one above, but saves the DCT ; coefficients in m0/m1 so we don't have to load @@ -1968,7 +2268,7 @@ COEFF_LEVELRUN_LUT 8 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 -INIT_XMM avx2, lzcnt +INIT_XMM avx2 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 %endif diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/quant.h x264-0.152.2854+gite9a5903/common/x86/quant.h --- x264-0.148.2795+gitaaa9aa8/common/x86/quant.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/quant.h 2017-12-31 12:50:51.000000000 +0000 @@ -66,12 +66,15 @@ void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void 
x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); @@ -85,16 +88,16 @@ void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); -int x264_decimate_score15_mmx2( dctcoef *dct ); int x264_decimate_score15_sse2( dctcoef *dct ); int x264_decimate_score15_ssse3( dctcoef *dct ); -int x264_decimate_score16_mmx2( dctcoef *dct ); +int x264_decimate_score15_avx512( dctcoef *dct ); int x264_decimate_score16_sse2( dctcoef *dct ); int x264_decimate_score16_ssse3( dctcoef *dct ); -int x264_decimate_score64_mmx2( dctcoef *dct ); +int x264_decimate_score16_avx512( dctcoef *dct ); int x264_decimate_score64_sse2( dctcoef *dct ); int x264_decimate_score64_ssse3( dctcoef *dct ); int x264_decimate_score64_avx2( int16_t *dct ); +int x264_decimate_score64_avx512( dctcoef *dct ); int x264_coeff_last4_mmx2( dctcoef *dct ); int x264_coeff_last8_mmx2( dctcoef *dct ); int x264_coeff_last15_mmx2( dctcoef *dct ); @@ -104,33 +107,37 @@ int x264_coeff_last15_sse2( dctcoef *dct ); int x264_coeff_last16_sse2( dctcoef *dct ); int x264_coeff_last64_sse2( dctcoef *dct ); -int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct ); -int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct ); -int x264_coeff_last8_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); +int x264_coeff_last4_lzcnt( dctcoef *dct ); +int x264_coeff_last8_lzcnt( dctcoef *dct ); +int x264_coeff_last15_lzcnt( dctcoef *dct ); +int x264_coeff_last16_lzcnt( dctcoef *dct ); +int x264_coeff_last64_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2 ( dctcoef *dct ); +int x264_coeff_last4_avx512( int32_t *dct ); +int x264_coeff_last8_avx512( dctcoef *dct ); +int x264_coeff_last15_avx512( dctcoef *dct ); +int x264_coeff_last16_avx512( dctcoef *dct ); +int x264_coeff_last64_avx512( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, 
x264_run_level_t *runlevel ); -int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/sad-a.asm x264-0.152.2854+gite9a5903/common/x86/sad-a.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/sad-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/sad-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -106,8 +106,6 @@ SAD 4, 8 SAD 4, 4 - - ;============================================================================= ; SAD XMM ;============================================================================= @@ -119,118 +117,64 @@ RET %endmacro -%macro SAD_W16 0 ;----------------------------------------------------------------------------- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -cglobal pixel_sad_16x16, 4,4,8 - movu m0, [r2] - movu m1, [r2+r3] - lea r2, [r2+2*r3] - movu m2, [r2] - movu m3, [r2+r3] - lea r2, [r2+2*r3] - psadbw m0, [r0] - psadbw m1, [r0+r1] - lea r0, [r0+2*r1] - movu m4, [r2] - paddw m0, m1 - psadbw m2, [r0] - psadbw m3, [r0+r1] - lea r0, [r0+2*r1] - movu m5, [r2+r3] - lea r2, [r2+2*r3] - paddw m2, m3 - movu m6, [r2] - movu m7, [r2+r3] - lea r2, [r2+2*r3] - paddw m0, m2 - psadbw m4, [r0] - psadbw m5, [r0+r1] - lea r0, [r0+2*r1] - movu m1, [r2] - paddw m4, m5 - psadbw m6, [r0] - psadbw m7, [r0+r1] - lea r0, [r0+2*r1] - movu m2, [r2+r3] - lea r2, [r2+2*r3] - paddw m6, m7 - movu m3, [r2] - paddw m0, m4 - movu m4, [r2+r3] - lea r2, [r2+2*r3] - paddw m0, m6 - psadbw m1, [r0] - psadbw m2, [r0+r1] - lea r0, [r0+2*r1] - movu m5, [r2] - paddw m1, m2 - psadbw m3, [r0] - psadbw m4, [r0+r1] - lea r0, [r0+2*r1] - movu m6, [r2+r3] - lea r2, [r2+2*r3] - paddw m3, m4 - movu m7, [r2] - paddw m0, m1 - movu m1, [r2+r3] - paddw m0, m3 - psadbw m5, [r0] - psadbw m6, [r0+r1] - lea r0, [r0+2*r1] - paddw m5, m6 - psadbw m7, [r0] - psadbw m1, [r0+r1] - paddw m7, m1 - paddw m0, m5 - paddw m0, m7 - SAD_END_SSE2 - -;----------------------------------------------------------------------------- -; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) -;----------------------------------------------------------------------------- -cglobal pixel_sad_16x8, 4,4 - movu m0, [r2] - movu m2, [r2+r3] - lea r2, [r2+2*r3] - movu m3, [r2] - movu m4, [r2+r3] - 
psadbw m0, [r0] - psadbw m2, [r0+r1] - lea r0, [r0+2*r1] - psadbw m3, [r0] - psadbw m4, [r0+r1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - paddw m0, m2 - paddw m3, m4 - paddw m0, m3 - movu m1, [r2] - movu m2, [r2+r3] - lea r2, [r2+2*r3] - movu m3, [r2] - movu m4, [r2+r3] - psadbw m1, [r0] - psadbw m2, [r0+r1] - lea r0, [r0+2*r1] - psadbw m3, [r0] - psadbw m4, [r0+r1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - paddw m1, m2 - paddw m3, m4 - paddw m0, m1 - paddw m0, m3 +%macro SAD_W16 1 ; h +cglobal pixel_sad_16x%1, 4,4 +%ifidn cpuname, sse2 +.skip_prologue: +%endif +%assign %%i 0 +%if ARCH_X86_64 + lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile + lea r5, [3*r3] +%rep %1/4 + movu m1, [r2] + psadbw m1, [r0] + movu m3, [r2+r3] + psadbw m3, [r0+r1] + movu m2, [r2+2*r3] + psadbw m2, [r0+2*r1] + movu m4, [r2+r5] + psadbw m4, [r0+r6] +%if %%i != %1/4-1 + lea r2, [r2+4*r3] + lea r0, [r0+4*r1] +%endif + paddw m1, m3 + paddw m2, m4 + ACCUM paddw, 0, 1, %%i + paddw m0, m2 + %assign %%i %%i+1 +%endrep +%else ; The cost of having to save and restore registers on x86-32 +%rep %1/2 ; nullifies the benefit of having 3*stride in registers. + movu m1, [r2] + psadbw m1, [r0] + movu m2, [r2+r3] + psadbw m2, [r0+r1] +%if %%i != %1/2-1 + lea r2, [r2+2*r3] + lea r0, [r0+2*r1] +%endif + ACCUM paddw, 0, 1, %%i + paddw m0, m2 + %assign %%i %%i+1 +%endrep +%endif SAD_END_SSE2 %endmacro INIT_XMM sse2 -SAD_W16 +SAD_W16 16 +SAD_W16 8 INIT_XMM sse3 -SAD_W16 +SAD_W16 16 +SAD_W16 8 INIT_XMM sse2, aligned -SAD_W16 +SAD_W16 16 +SAD_W16 8 %macro SAD_INC_4x8P_SSE 1 movq m1, [r0] @@ -259,7 +203,132 @@ SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_END_SSE2 + +%macro SAD_W48_AVX512 3 ; w, h, d/q +cglobal pixel_sad_%1x%2, 4,4 + kxnorb k1, k1, k1 + kaddb k1, k1, k1 +%assign %%i 0 +%if ARCH_X86_64 && %2 != 4 + lea r6, [3*r1] + lea r5, [3*r3] +%rep %2/4 + mov%3 m1, [r0] + vpbroadcast%3 m1 {k1}, [r0+r1] + mov%3 m3, [r2] + vpbroadcast%3 m3 {k1}, [r2+r3] + mov%3 m2, [r0+2*r1] + vpbroadcast%3 m2 {k1}, [r0+r6] + mov%3 m4, [r2+2*r3] + vpbroadcast%3 m4 {k1}, [r2+r5] +%if %%i != %2/4-1 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif + psadbw m1, m3 + psadbw m2, m4 + ACCUM paddd, 0, 1, %%i + paddd m0, m2 + %assign %%i %%i+1 +%endrep +%else +%rep %2/2 + mov%3 m1, [r0] + vpbroadcast%3 m1 {k1}, [r0+r1] + mov%3 m2, [r2] + vpbroadcast%3 m2 {k1}, [r2+r3] +%if %%i != %2/2-1 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] +%endif + psadbw m1, m2 + ACCUM paddd, 0, 1, %%i + %assign %%i %%i+1 +%endrep +%endif +%if %1 == 8 + punpckhqdq m1, m0, m0 + paddd m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM avx512 +SAD_W48_AVX512 4, 4, d +SAD_W48_AVX512 4, 8, d +SAD_W48_AVX512 4, 16, d +SAD_W48_AVX512 8, 4, q +SAD_W48_AVX512 8, 8, q +SAD_W48_AVX512 8, 16, q + +%macro SAD_W16_AVX512_START 1 ; h + cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which + jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory + lea r1, [3*r3] +%endmacro + +%macro SAD_W16_AVX512_END 0 + paddd m0, m1 + paddd m0, m2 + paddd m0, m3 +%if mmsize == 64 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 +%endif + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 + movd eax, xmm0 RET +%endmacro + +INIT_YMM avx512 +cglobal pixel_sad_16x8, 4,4 + SAD_W16_AVX512_START 8 + movu xm0, [r2] + vinserti128 m0, [r2+r3], 1 + psadbw m0, [r0+0*32] + movu xm1, [r2+2*r3] + vinserti128 m1, [r2+r1], 1 + lea r2, [r2+4*r3] + psadbw m1, [r0+1*32] + movu xm2, [r2] + vinserti128 m2, [r2+r3], 1 + 
psadbw m2, [r0+2*32] + movu xm3, [r2+2*r3] + vinserti128 m3, [r2+r1], 1 + psadbw m3, [r0+3*32] + SAD_W16_AVX512_END + +INIT_ZMM avx512 +cglobal pixel_sad_16x16, 4,4 + SAD_W16_AVX512_START 16 + movu xm0, [r2] + vinserti128 ym0, [r2+r3], 1 + movu xm1, [r2+4*r3] + vinserti32x4 m0, [r2+2*r3], 2 + vinserti32x4 m1, [r2+2*r1], 2 + vinserti32x4 m0, [r2+r1], 3 + lea r2, [r2+4*r3] + vinserti32x4 m1, [r2+r3], 1 + psadbw m0, [r0+0*64] + vinserti32x4 m1, [r2+r1], 3 + lea r2, [r2+4*r3] + psadbw m1, [r0+1*64] + movu xm2, [r2] + vinserti128 ym2, [r2+r3], 1 + movu xm3, [r2+4*r3] + vinserti32x4 m2, [r2+2*r3], 2 + vinserti32x4 m3, [r2+2*r1], 2 + vinserti32x4 m2, [r2+r1], 3 + lea r2, [r2+4*r3] + vinserti32x4 m3, [r2+r3], 1 + psadbw m2, [r0+2*64] + vinserti32x4 m3, [r2+r1], 3 + psadbw m3, [r0+3*64] + SAD_W16_AVX512_END ;----------------------------------------------------------------------------- ; void pixel_vsad( pixel *src, intptr_t stride ); @@ -1548,6 +1617,225 @@ SAD_X_AVX2 4, 16, 16, 8 SAD_X_AVX2 4, 16, 8, 8 +%macro SAD_X_W4_AVX512 2 ; x, h +cglobal pixel_sad_x%1_4x%2, %1+2,%1+3 + mov t1d, 0xa + kmovb k1, t1d + lea t1, [3*t0] + kaddb k2, k1, k1 + kshiftlb k3, k1, 2 +%assign %%i 0 +%rep %2/4 + movu m6, [r0+%%i*64] + vmovddup m6 {k1}, [r0+%%i*64+32] + movd xmm2, [r1] + movd xmm4, [r1+t0] + vpbroadcastd xmm2 {k1}, [r1+2*t0] + vpbroadcastd xmm4 {k1}, [r1+t1] + vpbroadcastd xmm2 {k2}, [r2+t0] + vpbroadcastd xmm4 {k2}, [r2] + vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3 + vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2 + vpmovqd s1, m6 ; s0 s2 s1 s3 + movd xmm3, [r3] + movd xmm5, [r3+t0] + vpbroadcastd xmm3 {k1}, [r3+2*t0] + vpbroadcastd xmm5 {k1}, [r3+t1] +%if %1 == 4 + vpbroadcastd xmm3 {k2}, [r4+t0] + vpbroadcastd xmm5 {k2}, [r4] + vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3 + vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2 +%endif +%if %%i != %2/4-1 +%assign %%j 1 +%rep %1 + lea r%+%%j, [r%+%%j+4*t0] + %assign %%j %%j+1 +%endrep +%endif + pshufd s2, s1, q1032 + psadbw xmm2, s1 + psadbw xmm4, s2 + psadbw xmm3, s1 + psadbw xmm5, s2 +%if %%i + paddd xmm0, xmm2 + paddd xmm1, xmm3 + paddd xmm0, xmm4 + paddd xmm1, xmm5 +%else + paddd xmm0, xmm2, xmm4 + paddd xmm1, xmm3, xmm5 +%endif + %assign %%i %%i+1 +%endrep +%if %1 == 4 + movifnidn t2, r6mp +%else + movifnidn t2, r5mp +%endif + packusdw xmm0, xmm1 + mova [t2], xmm0 + RET +%endmacro + +%macro SAD_X_W8_AVX512 2 ; x, h +cglobal pixel_sad_x%1_8x%2, %1+2,%1+3 + kxnorb k3, k3, k3 + lea t1, [3*t0] + kaddb k1, k3, k3 + kshiftlb k2, k3, 2 + kshiftlb k3, k3, 3 +%assign %%i 0 +%rep %2/4 + movddup m6, [r0+%%i*64] ; s0 s0 s1 s1 + movq xm2, [r1] + movq xm4, [r1+2*t0] + vpbroadcastq xm2 {k1}, [r2] + vpbroadcastq xm4 {k1}, [r2+2*t0] + vpbroadcastq m2 {k2}, [r1+t0] + vpbroadcastq m4 {k2}, [r1+t1] + vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1 + vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3 + movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3 + movq xm3, [r3] + movq xm5, [r3+2*t0] +%if %1 == 4 + vpbroadcastq xm3 {k1}, [r4] + vpbroadcastq xm5 {k1}, [r4+2*t0] +%endif + vpbroadcastq m3 {k2}, [r3+t0] + vpbroadcastq m5 {k2}, [r3+t1] +%if %1 == 4 + vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1 + vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3 +%endif +%if %%i != %2/4-1 +%assign %%j 1 +%rep %1 + lea r%+%%j, [r%+%%j+4*t0] + %assign %%j %%j+1 +%endrep +%endif + psadbw m2, m6 + psadbw m4, m7 + psadbw m3, m6 + psadbw m5, m7 + ACCUM paddd, 0, 2, %%i + ACCUM paddd, 1, 3, %%i + paddd m0, m4 + paddd m1, m5 + %assign %%i %%i+1 +%endrep +%if %1 == 4 + movifnidn t2, r6mp +%else + movifnidn t2, r5mp +%endif + 
packusdw m0, m1 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + mova [t2], xm0 + RET +%endmacro + +%macro SAD_X_W16_AVX512 2 ; x, h +cglobal pixel_sad_x%1_16x%2, %1+2,%1+3 + lea t1, [3*t0] +%assign %%i 0 +%rep %2/4 + mova m6, [r0+%%i*64] ; s0 s1 s2 s3 + movu xm2, [r3] + movu xm4, [r3+t0] +%if %1 == 4 + vinserti128 ym2, [r4+t0], 1 + vinserti128 ym4, [r4], 1 +%endif + vinserti32x4 m2, [r1+2*t0], 2 + vinserti32x4 m4, [r1+t1], 2 + vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3 + vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2 + vpermq m7, m6, q1032 ; s1 s0 s3 s2 + movu xm3, [r1] + movu xm5, [r1+t0] + vinserti128 ym3, [r2+t0], 1 + vinserti128 ym5, [r2], 1 + vinserti32x4 m3, [r3+2*t0], 2 + vinserti32x4 m5, [r3+t1], 2 +%if %1 == 4 + vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3 + vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2 +%endif +%if %%i != %2/4-1 +%assign %%j 1 +%rep %1 + lea r%+%%j, [r%+%%j+4*t0] + %assign %%j %%j+1 +%endrep +%endif + psadbw m2, m6 + psadbw m4, m7 + psadbw m3, m6 + psadbw m5, m7 + ACCUM paddd, 0, 2, %%i + ACCUM paddd, 1, 3, %%i + paddd m0, m4 + paddd m1, m5 + %assign %%i %%i+1 +%endrep +%if %1 == 4 + movifnidn t2, r6mp +%else + movifnidn t2, r5mp +%endif + mov t1d, 0x1111 + kmovw k1, t1d + vshufi32x4 m0, m0, q1032 + paddd m0, m1 + punpckhqdq m1, m0, m0 + paddd m0, m1 + vpcompressd m0 {k1}{z}, m0 + mova [t2], xm0 + RET +%endmacro + +; t0 = stride, t1 = tmp/stride3, t2 = scores +%if WIN64 + %define s1 xmm16 ; xmm6 and xmm7 reduces code size, but + %define s2 xmm17 ; they're callee-saved on win64 + DECLARE_REG_TMP 4, 6, 0 +%else + %define s1 xmm6 + %define s2 xmm7 +%if ARCH_X86_64 + DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64 +%else + DECLARE_REG_TMP 4, 5, 0 +%endif +%endif + +INIT_YMM avx512 +SAD_X_W4_AVX512 3, 4 ; x3_4x4 +SAD_X_W4_AVX512 3, 8 ; x3_4x8 +SAD_X_W8_AVX512 3, 4 ; x3_8x4 +SAD_X_W8_AVX512 3, 8 ; x3_8x8 +SAD_X_W8_AVX512 3, 16 ; x3_8x16 +INIT_ZMM avx512 +SAD_X_W16_AVX512 3, 8 ; x3_16x8 +SAD_X_W16_AVX512 3, 16 ; x3_16x16 + +DECLARE_REG_TMP 5, 6, 0 +INIT_YMM avx512 +SAD_X_W4_AVX512 4, 4 ; x4_4x4 +SAD_X_W4_AVX512 4, 8 ; x4_4x8 +SAD_X_W8_AVX512 4, 4 ; x4_8x4 +SAD_X_W8_AVX512 4, 8 ; x4_8x8 +SAD_X_W8_AVX512 4, 16 ; x4_8x16 +INIT_ZMM avx512 +SAD_X_W16_AVX512 4, 8 ; x4_16x8 +SAD_X_W16_AVX512 4, 16 ; x4_16x16 + ;============================================================================= ; SAD cacheline split ;============================================================================= diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/x86inc.asm x264-0.152.2854+gite9a5903/common/x86/x86inc.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/x86inc.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/x86inc.asm 2017-12-31 12:50:51.000000000 +0000 @@ -323,6 +323,8 @@ %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 @@ -436,15 +438,16 @@ %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
- %if xmm_regs_used > 6 + %if xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif - %if xmm_regs_used > 7 + %if xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif - %if xmm_regs_used > 8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 %assign %%i 8 - %rep xmm_regs_used-8 + %rep %%xmm_regs_on_stack movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep @@ -453,10 +456,11 @@ %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. - %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif @@ -465,9 +469,10 @@ %macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack %assign %%i %%i-1 movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep @@ -480,10 +485,10 @@ %assign %%pad_size stack_size_padded %endif %endif - %if xmm_regs_used > 7 + %if xmm_regs_used > 7 + high_mm_regs movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif - %if xmm_regs_used > 6 + %if xmm_regs_used > 6 + high_mm_regs movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro @@ -495,12 +500,12 @@ %assign xmm_regs_used 0 %endmacro -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -524,9 +529,10 @@ DECLARE_REG 13, R12, 64 DECLARE_REG 14, R13, 72 -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
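; A minimal usage sketch, assuming the cglobal convention seen elsewhere in this
; diff: "cglobal pixel_sad_16x16, 4,4" omits the xmm-register count, so the new
; default of 0 above still defines xmm_regs_used, which RET now consults through
; vzeroupper_required instead of keying purely off mmsize == 32.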
%assign num_args %1 %assign regs_used %2 + %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 @@ -536,7 +542,7 @@ DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -547,7 +553,7 @@ %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -592,7 +598,7 @@ DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -603,7 +609,7 @@ %endif %endif POP_IF_USED 6, 5, 4, 3 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -713,7 +719,7 @@ %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif @@ -776,24 +782,25 @@ %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_avx (1<<13)| cpuflags_sse42 +%assign cpuflags_xop (1<<14)| cpuflags_avx +%assign cpuflags_fma4 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<16)| cpuflags_avx +%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL + +%assign cpuflags_cache32 (1<<21) +%assign cpuflags_cache64 (1<<22) +%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<24) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. 
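; A rough reading of the expression below: the AND/XOR pair is zero only when every
; bit implied by cpuflags_<x> is already present in cpuflags; subtracting 1 then
; yields a value with bit 31 set exactly in that case, so the shift-and-mask
; produces 1, while any missing feature bit produces 0. With the reshuffled
; definitions above, cpuflags_avx2 now implies bmi2 -> bmi1 -> lzcnt, so a
; hypothetical "%if cpuflag(lzcnt)" guard is also taken in avx2 builds.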
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -836,7 +843,7 @@ %if ARCH_X86_64 || cpuflag(sse2) %ifdef __NASM_VER__ - ALIGNMODE k8 + ALIGNMODE p6 %else CPU amdnop %endif @@ -849,11 +856,12 @@ %endif %endmacro -; Merge mmx and sse* +; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 @@ -863,6 +871,18 @@ %undef %1%2 %endmacro +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 @@ -878,7 +898,7 @@ CAT_XDEFINE nnmm, %%i, %%i %assign %%i %%i+1 %endrep - %rep 8 + %rep 24 CAT_UNDEF m, %%i CAT_UNDEF nnmm, %%i %assign %%i %%i+1 @@ -892,7 +912,7 @@ %define mmsize 16 %define num_mmregs 8 %if ARCH_X86_64 - %define num_mmregs 16 + %define num_mmregs 32 %endif %define mova movdqa %define movu movdqu @@ -905,6 +925,10 @@ %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 + %if WIN64 + ; Swap callee-saved registers with volatile registers + AVX512_MM_PERMUTATION 6 + %endif %endmacro %macro INIT_YMM 0-1+ @@ -913,7 +937,7 @@ %define mmsize 32 %define num_mmregs 8 %if ARCH_X86_64 - %define num_mmregs 16 + %define num_mmregs 32 %endif %define mova movdqa %define movu movdqu @@ -926,6 +950,29 @@ %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 32 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, zmm %+ %%i + CAT_XDEFINE nnzmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 + AVX512_MM_PERMUTATION %endmacro INIT_XMM @@ -934,18 +981,26 @@ %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 %endmacro %assign i 0 -%rep 16 +%rep 32 DECLARE_MMCAST i %assign i i+1 %endrep @@ -1080,12 +1135,17 @@ ;============================================================================= %assign i 0 -%rep 16 +%rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i @@ -1202,7 +1262,7 @@ %endmacro %endmacro -; Instructions with both VEX and non-VEX encodings +; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without 
parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 @@ -1534,15 +1594,48 @@ FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) -%ifdef __YASM_VER__ - %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 - %macro vpbroadcastq 2 - %if sizeof%1 == 16 - movddup %1, %2 - %else - vbroadcastsd %1, %2 +; Macros for converting VEX instructions to equivalent EVEX ones. +%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 %endif - %endmacro - %endif -%endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 diff -Nru x264-0.148.2795+gitaaa9aa8/common/x86/x86util.asm x264-0.152.2854+gite9a5903/common/x86/x86util.asm --- x264-0.148.2795+gitaaa9aa8/common/x86/x86util.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/common/x86/x86util.asm 2017-12-31 12:50:51.000000000 +0000 @@ -303,24 +303,24 @@ %endmacro %macro HADDD 2 ; sum junk -%if sizeof%1 == 32 -%define %2 xmm%2 - vextracti128 %2, %1, 1 -%define %1 xmm%1 - paddd %1, %2 +%if sizeof%1 >= 64 + vextracti32x8 ymm%2, zmm%1, 1 + paddd ymm%1, ymm%2 %endif -%if mmsize >= 16 - MOVHL %2, %1 - paddd %1, %2 +%if sizeof%1 >= 32 + vextracti128 xmm%2, ymm%1, 1 + paddd xmm%1, xmm%2 +%endif +%if sizeof%1 >= 16 + MOVHL xmm%2, xmm%1 + paddd xmm%1, xmm%2 %endif %if cpuflag(xop) && sizeof%1 == 16 - vphadddq %1, %1 + vphadddq xmm%1, xmm%1 %else - PSHUFLW %2, %1, q0032 - paddd %1, %2 + PSHUFLW xmm%2, xmm%1, q1032 + paddd xmm%1, xmm%2 %endif -%undef %1 -%undef %2 %endmacro %macro HADDW 2 ; reg, tmp diff -Nru x264-0.148.2795+gitaaa9aa8/config.guess x264-0.152.2854+gite9a5903/config.guess --- x264-0.148.2795+gitaaa9aa8/config.guess 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/config.guess 2017-12-31 12:50:51.000000000 +0000 @@ -1,8 +1,8 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright 1992-2016 Free Software Foundation, Inc. +# Copyright 1992-2017 Free Software Foundation, Inc. -timestamp='2016-10-02' +timestamp='2017-11-07' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -15,7 +15,7 @@ # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, see . +# along with this program; if not, see . 
# # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -27,7 +27,7 @@ # Originally written by Per Bothner; maintained since 2000 by Ben Elliston. # # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess +# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess # # Please send patches to . @@ -39,7 +39,7 @@ Output the configuration name of the system \`$me' is run on. -Operation modes: +Options: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit @@ -50,7 +50,7 @@ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright 1992-2016 Free Software Foundation, Inc. +Copyright 1992-2017 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -244,6 +244,9 @@ UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'` echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE} exit ;; + *:MidnightBSD:*:*) + echo ${UNAME_MACHINE}-unknown-midnightbsd${UNAME_RELEASE} + exit ;; *:ekkoBSD:*:*) echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} exit ;; @@ -259,6 +262,9 @@ *:Sortix:*:*) echo ${UNAME_MACHINE}-unknown-sortix exit ;; + *:Redox:*:*) + echo ${UNAME_MACHINE}-unknown-redox + exit ;; alpha:OSF1:*:*) case $UNAME_RELEASE in *4.0) @@ -315,15 +321,6 @@ exitcode=$? trap '' 0 exit $exitcode ;; - Alpha\ *:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # Should we change UNAME_MACHINE based on the output of uname instead - # of the specific Alpha model? - echo alpha-pc-interix - exit ;; - 21064:Windows_NT:50:3) - echo alpha-dec-winnt3.5 - exit ;; Amiga*:UNIX_System_V:4.0:*) echo m68k-unknown-sysv4 exit ;; @@ -485,13 +482,13 @@ #endif #if defined (host_mips) && defined (MIPSEB) #if defined (SYSTYPE_SYSV) - printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + printf ("mips-mips-riscos%ssysv\\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_SVR4) - printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + printf ("mips-mips-riscos%ssvr4\\n", argv[1]); exit (0); #endif #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) - printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + printf ("mips-mips-riscos%sbsd\\n", argv[1]); exit (0); #endif #endif exit (-1); @@ -614,7 +611,7 @@ *:AIX:*:*) echo rs6000-ibm-aix exit ;; - ibmrt:4.4BSD:*|romp-ibm:BSD:*) + ibmrt:4.4BSD:*|romp-ibm:4.4BSD:*) echo romp-ibm-bsd4.4 exit ;; ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and @@ -635,8 +632,8 @@ 9000/[34678]??:HP-UX:*:*) HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` case "${UNAME_MACHINE}" in - 9000/31? ) HP_ARCH=m68000 ;; - 9000/[34]?? ) HP_ARCH=m68k ;; + 9000/31?) HP_ARCH=m68000 ;; + 9000/[34]??) 
HP_ARCH=m68k ;; 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` @@ -749,7 +746,7 @@ { echo "$SYSTEM_NAME"; exit; } echo unknown-hitachi-hiuxwe2 exit ;; - 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:*) echo hppa1.1-hp-bsd exit ;; 9000/8??:4.3bsd:*:*) @@ -758,7 +755,7 @@ *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) echo hppa1.0-hp-mpeix exit ;; - hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:*) echo hppa1.1-hp-osf exit ;; hp8??:OSF1:*:*) @@ -837,10 +834,11 @@ UNAME_PROCESSOR=`/usr/bin/uname -p` case ${UNAME_PROCESSOR} in amd64) - echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; - *) - echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + UNAME_PROCESSOR=x86_64 ;; + i386) + UNAME_PROCESSOR=i586 ;; esac + echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; i*:CYGWIN*:*) echo ${UNAME_MACHINE}-pc-cygwin @@ -854,10 +852,6 @@ *:MSYS*:*) echo ${UNAME_MACHINE}-pc-msys exit ;; - i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 - exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; @@ -873,27 +867,12 @@ echo ia64-unknown-interix${UNAME_RELEASE} exit ;; esac ;; - [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) - echo i${UNAME_MACHINE}-pc-mks - exit ;; - 8664:Windows_NT:*) - echo x86_64-pc-mks - exit ;; - i*:Windows_NT*:* | Pentium*:Windows_NT*:*) - # How do we know it's Interix rather than the generic POSIX subsystem? - # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we - # UNAME_MACHINE based on the output of uname instead of i386? - echo i586-pc-interix - exit ;; i*:UWIN*:*) echo ${UNAME_MACHINE}-pc-uwin exit ;; amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) echo x86_64-unknown-cygwin exit ;; - p*:CYGWIN*:*) - echo powerpcle-unknown-cygwin - exit ;; prep*:SunOS:5.*:*) echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; @@ -1096,7 +1075,7 @@ i*86:*DOS:*:*) echo ${UNAME_MACHINE}-pc-msdosdjgpp exit ;; - i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + i*86:*:4.*:*) UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} @@ -1303,14 +1282,21 @@ if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then if [ "$CC_FOR_BUILD" != no_compiler_found ]; then if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null then case $UNAME_PROCESSOR in i386) UNAME_PROCESSOR=x86_64 ;; powerpc) UNAME_PROCESSOR=powerpc64 ;; esac fi + # On 10.4-10.6 one might compile for PowerPC via gcc -arch ppc + if (echo '#ifdef __POWERPC__'; echo IS_PPC; echo '#endif') | \ + (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_PPC >/dev/null + then + UNAME_PROCESSOR=powerpc + fi fi elif test "$UNAME_PROCESSOR" = i386 ; then # Avoid executing cc on OS X 10.9, as it ships with a stub @@ -1334,15 +1320,18 @@ *:QNX:*:4*) echo i386-pc-qnx exit ;; - NEO-?:NONSTOP_KERNEL:*:*) + NEO-*:NONSTOP_KERNEL:*:*) echo neo-tandem-nsk${UNAME_RELEASE} exit ;; NSE-*:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; - NSR-?:NONSTOP_KERNEL:*:*) + NSR-*:NONSTOP_KERNEL:*:*) echo nsr-tandem-nsk${UNAME_RELEASE} exit ;; + 
NSX-*:NONSTOP_KERNEL:*:*) + echo nsx-tandem-nsk${UNAME_RELEASE} + exit ;; *:NonStop-UX:*:*) echo mips-compaq-nonstopux exit ;; @@ -1414,16 +1403,28 @@ exit ;; esac +echo "$0: unable to guess system type" >&2 + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}" in + mips:Linux | mips64:Linux) + # If we got here on MIPS GNU/Linux, output extra information. + cat >&2 <&2 <. +# along with this program; if not, see . # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -33,7 +33,7 @@ # Otherwise, we print the canonical config type on stdout and succeed. # You can get the latest version of this script from: -# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub +# https://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub # This file is supposed to be the same for all GNU packages # and recognize all the CPU types, system types and aliases @@ -57,7 +57,7 @@ Canonicalize a configuration name. -Operation modes: +Options: -h, --help print this help, then exit -t, --time-stamp print date of last modification, then exit -v, --version print version number, then exit @@ -67,7 +67,7 @@ version="\ GNU config.sub ($timestamp) -Copyright 1992-2016 Free Software Foundation, Inc. +Copyright 1992-2017 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -229,9 +229,6 @@ -ptx*) basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` ;; - -windowsnt*) - os=`echo $os | sed -e 's/windowsnt/winnt/'` - ;; -psos*) os=-psos ;; @@ -263,7 +260,7 @@ | fido | fr30 | frv | ft32 \ | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ | hexagon \ - | i370 | i860 | i960 | ia64 \ + | i370 | i860 | i960 | ia16 | ia64 \ | ip2k | iq2000 \ | k1om \ | le32 | le64 \ @@ -315,7 +312,7 @@ | ubicom32 \ | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \ | visium \ - | we32k \ + | wasm32 \ | x86 | xc16x | xstormy16 | xtensa \ | z8k | z80) basic_machine=$basic_machine-unknown @@ -388,7 +385,7 @@ | h8300-* | h8500-* \ | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ | hexagon-* \ - | i*86-* | i860-* | i960-* | ia64-* \ + | i*86-* | i860-* | i960-* | ia16-* | ia64-* \ | ip2k-* | iq2000-* \ | k1om-* \ | le32-* | le64-* \ @@ -446,6 +443,7 @@ | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \ | vax-* \ | visium-* \ + | wasm32-* \ | we32k-* \ | x86-* | x86_64-* | xc16x-* | xps100-* \ | xstormy16-* | xtensa*-* \ @@ -641,7 +639,7 @@ basic_machine=rs6000-bull os=-bosx ;; - dpx2* | dpx2*-bull) + dpx2*) basic_machine=m68k-bull os=-sysv3 ;; @@ -903,7 +901,7 @@ basic_machine=v70-nec os=-sysv ;; - next | m*-next ) + next | m*-next) basic_machine=m68k-next case $os in -nextstep* ) @@ -948,6 +946,9 @@ nsr-tandem) basic_machine=nsr-tandem ;; + nsx-tandem) + basic_machine=nsx-tandem + ;; op50n-* | op60c-*) basic_machine=hppa1.1-oki os=-proelf @@ -1243,6 +1244,9 @@ basic_machine=a29k-wrs os=-vxworks ;; + wasm32) + basic_machine=wasm32-unknown + ;; w65*) basic_machine=w65-wdc os=-none @@ -1251,6 +1255,9 @@ basic_machine=hppa1.1-winbond os=-proelf ;; + x64) + basic_machine=x86_64-pc + ;; xbox) basic_machine=i686-pc os=-mingw32 @@ -1358,8 +1365,8 @@ if [ x"$os" != x"" ] then case $os in - # First match some system type aliases - # that might get confused with valid system types. + # First match some system type aliases that might get confused + # with valid system types. 
# -solaris* is a basic system type, with this one exception. -auroraux) os=-auroraux @@ -1379,9 +1386,9 @@ -gnu/linux*) os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` ;; - # First accept the basic system types. + # Now accept the basic system types. # The portable systems comes first. - # Each alternative MUST END IN A *, to match a version number. + # Each alternative MUST end in a * to match a version number. # -sysv* is not here because it comes later, after sysvr4. -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ @@ -1397,7 +1404,7 @@ | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ - | -chorusos* | -chorusrdb* | -cegcc* \ + | -chorusos* | -chorusrdb* | -cegcc* | -glidix* \ | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ | -midipix* | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \ | -linux-newlib* | -linux-musl* | -linux-uclibc* \ @@ -1409,7 +1416,7 @@ | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \ - | -onefs* | -tirtos* | -phoenix* | -fuchsia*) + | -onefs* | -tirtos* | -phoenix* | -fuchsia* | -redox*) # Remember, each alternative MUST END IN *, to match a version number. ;; -qnx*) @@ -1484,7 +1491,7 @@ -nova*) os=-rtmk-nova ;; - -ns2 ) + -ns2) os=-nextstep2 ;; -nsk*) @@ -1539,6 +1546,19 @@ -dicos*) os=-dicos ;; + -pikeos*) + # Until real need of OS specific support for + # particular features comes up, bare metal + # configurations are quite functional. + case $basic_machine in + arm*) + os=-eabi + ;; + *) + os=-elf + ;; + esac + ;; -nacl*) ;; -ios) @@ -1638,6 +1658,9 @@ sparc-* | *-sun) os=-sunos4.1.1 ;; + pru-*) + os=-elf + ;; *-be) os=-beos ;; @@ -1683,7 +1706,7 @@ m88k-omron*) os=-luna ;; - *-next ) + *-next) os=-nextstep ;; *-sequent) @@ -1818,7 +1841,7 @@ exit # Local variables: -# eval: (add-hook 'write-file-hooks 'time-stamp) +# eval: (add-hook 'write-file-functions 'time-stamp) # time-stamp-start: "timestamp='" # time-stamp-format: "%:y-%02m-%02d" # time-stamp-end: "'" diff -Nru x264-0.148.2795+gitaaa9aa8/configure x264-0.152.2854+gite9a5903/configure --- x264-0.148.2795+gitaaa9aa8/configure 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/configure 2017-12-31 12:50:51.000000000 +0000 @@ -554,9 +554,12 @@ compiler_style=MS CFLAGS="$CFLAGS -Qstd=c99 -nologo -Qms0 -DHAVE_STRING_H -I\$(SRCPATH)/extras" QPRE="-Q" - `$CC 2>&1 | grep -q IA-32` && host_cpu=i486 - `$CC 2>&1 | grep -q "Intel(R) 64"` && host_cpu=x86_64 - cpp_check "" "" "_MSC_VER >= 1400" || die "Windows Intel Compiler support requires Visual Studio 2005 or newer" + cpp_check '' '' '_MSC_VER >= 1400' || die "Windows Intel Compiler support requires Visual Studio 2005 or newer" + if cpp_check '' '' 'defined(_M_AMD64) || defined(_M_X64)' ; then + host_cpu=x86_64 + elif cpp_check '' '' 'defined(_M_IX86)' ; then + host_cpu=i486 + fi if cc_check '' -Qdiag-error:10006,10157 ; then CHECK_CFLAGS="$CHECK_CFLAGS -Qdiag-error:10006,10157" fi @@ -565,9 +568,16 @@ compiler=CL compiler_style=MS CFLAGS="$CFLAGS -nologo -GS- -DHAVE_STRING_H -I\$(SRCPATH)/extras" - `$CC 2>&1 | grep -q 'x86'` && host_cpu=i486 - `$CC 2>&1 | grep -q 'x64'` && host_cpu=x86_64 cpp_check '' '' '_MSC_VER > 1800 || (_MSC_VER == 1800 && _MSC_FULL_VER >= 180030324)' || die 
"Microsoft Visual Studio support requires Visual Studio 2013 Update 2 or newer" + if cpp_check '' '' 'defined(_M_AMD64) || defined(_M_X64)' ; then + host_cpu=x86_64 + elif cpp_check '' '' 'defined(_M_IX86)' ; then + host_cpu=i486 + elif cpp_check '' '' 'defined(_M_ARM64)' ; then + host_cpu=aarch64 + elif cpp_check '' '' 'defined(_M_ARM)' ; then + host_cpu=arm + fi else # MinGW uses broken pre-VS2015 Microsoft printf functions unless it's told to use the POSIX ones. CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L" @@ -579,7 +589,7 @@ fi fi -if [[ "$cc_base" = clang* ]]; then +if [ $compiler = GNU ]; then if cc_check '' -Werror=unknown-warning-option ; then CHECK_CFLAGS="$CHECK_CFLAGS -Werror=unknown-warning-option" fi @@ -678,7 +688,7 @@ case $host_cpu in i*86) ARCH="X86" - AS="${AS-yasm}" + AS="${AS-nasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=0 -I\$(SRCPATH)/common/x86/" if [ $compiler = GNU ]; then @@ -704,7 +714,7 @@ ;; x86_64) ARCH="X86_64" - AS="${AS-yasm}" + AS="${AS-nasm}" AS_EXT=".asm" ASFLAGS="$ASFLAGS -DARCH_X86_64=1 -I\$(SRCPATH)/common/x86/" stack_alignment=16 @@ -853,7 +863,10 @@ fi if [ $compiler = GNU -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then - if cc_check '' -mpreferred-stack-boundary=5 ; then + if cc_check '' -mpreferred-stack-boundary=6 ; then + CFLAGS="$CFLAGS -mpreferred-stack-boundary=6" + stack_alignment=64 + elif cc_check '' -mpreferred-stack-boundary=5 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" stack_alignment=32 elif [ $stack_alignment -lt 16 ] && cc_check '' -mpreferred-stack-boundary=4 ; then @@ -876,15 +889,14 @@ fi if [ $asm = auto -a \( $ARCH = X86 -o $ARCH = X86_64 \) ] ; then - if ! as_check "vpmovzxwd ymm0, xmm0" ; then + if ! as_check "vmovdqa32 [eax]{k1}{z}, zmm0" ; then VER=`($AS --version || echo no assembler) 2>/dev/null | head -n 1` echo "Found $VER" - echo "Minimum version is yasm-1.2.0" + echo "Minimum version is nasm-2.13" echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi cc_check '' '' '__asm__("pabsw %xmm0, %xmm0");' && define HAVE_X86_INLINE_ASM - ASFLAGS="$ASFLAGS -Worphan-labels" define HAVE_MMX fi diff -Nru x264-0.148.2795+gitaaa9aa8/debian/changelog x264-0.152.2854+gite9a5903/debian/changelog --- x264-0.148.2795+gitaaa9aa8/debian/changelog 2017-07-02 13:18:16.000000000 +0000 +++ x264-0.152.2854+gite9a5903/debian/changelog 2018-02-03 17:47:10.000000000 +0000 @@ -1,8 +1,31 @@ -x264 (2:0.148.2795+gitaaa9aa8-1~xenial) xenial; urgency=medium +x264 (2:0.152.2854+gite9a5903-2~xenial) xenial; urgency=medium * For xenial - -- Doug McMahon Sun, 02 Jul 2017 09:17:48 -0400 + -- Doug McMahon Sat, 03 Feb 2018 12:46:17 -0500 + +x264 (2:0.152.2854+gite9a5903-2) unstable; urgency=medium + + * Team upload. + * Upload to unstable. + + -- Sebastian Ramacher Fri, 19 Jan 2018 12:36:28 +0100 + +x264 (2:0.152.2854+gite9a5903-1) experimental; urgency=medium + + [ Rico Tzschichholz ] + * Update to new stable upstream + * New upstream version 0.152.2854+gite9a5903 + * Require nasm (>= 2.13) instead of yasm + * Update debian/control for soname bump + * Regenerate manpage + + [ Sebastian Ramacher ] + * debian/control: + - Bump Standards-Version. + - Remove obsolete Pre-Depends. 
+ + -- Rico Tzschichholz Fri, 05 Jan 2018 12:14:48 +0100 x264 (2:0.148.2795+gitaaa9aa8-1) unstable; urgency=medium diff -Nru x264-0.148.2795+gitaaa9aa8/debian/control x264-0.152.2854+gite9a5903/debian/control --- x264-0.148.2795+gitaaa9aa8/debian/control 2017-07-02 13:18:22.000000000 +0000 +++ x264-0.152.2854+gite9a5903/debian/control 2018-02-03 17:47:32.000000000 +0000 @@ -7,14 +7,13 @@ Fabian Greffrath , Rico Tzschichholz Build-Depends: - debhelper (>= 9.20141010), - autotools-dev, + debhelper (>= 9.20160115), dpkg-dev (>= 1.17.14), libavformat-dev (>= 6:9) , libffms2-dev , libgpac-dev (>= 0.5.0+svn4288~) , - yasm [any-i386 any-amd64] -Standards-Version: 4.0.0 + nasm (>= 2.13) [any-i386 any-amd64] +Standards-Version: 4.1.3 Vcs-Git: https://anonscm.debian.org/git/pkg-multimedia/x264.git Vcs-Browser: https://anonscm.debian.org/cgit/pkg-multimedia/x264.git Homepage: http://www.videolan.org/developers/x264.html @@ -49,10 +48,9 @@ * parallel encoding on multiple CPUs * interlaced streams -Package: libx264-148 +Package: libx264-152 Architecture: any Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends} @@ -67,7 +65,7 @@ Architecture: any Multi-Arch: same Depends: - libx264-148 (= ${binary:Version}), + libx264-152 (= ${binary:Version}), ${misc:Depends} Description: development files for libx264 libx264 is an advanced encoding library for creating H.264 (MPEG-4 AVC) diff -Nru x264-0.148.2795+gitaaa9aa8/debian/control.in x264-0.152.2854+gite9a5903/debian/control.in --- x264-0.148.2795+gitaaa9aa8/debian/control.in 2017-06-28 06:31:44.000000000 +0000 +++ x264-0.152.2854+gite9a5903/debian/control.in 2018-01-19 11:35:17.000000000 +0000 @@ -7,14 +7,13 @@ Fabian Greffrath , Rico Tzschichholz Build-Depends: - debhelper (>= 9.20141010), - autotools-dev, + debhelper (>= 9.20160115), dpkg-dev (>= 1.17.14), libavformat-dev (>= 6:9) , libffms2-dev , libgpac-dev (>= 0.5.0+svn4288~) , - yasm [any-i386 any-amd64] -Standards-Version: 4.0.0 + nasm (>= 2.13) [any-i386 any-amd64] +Standards-Version: 4.1.3 Vcs-Git: https://anonscm.debian.org/git/pkg-multimedia/x264.git Vcs-Browser: https://anonscm.debian.org/cgit/pkg-multimedia/x264.git Homepage: http://www.videolan.org/developers/x264.html @@ -52,7 +51,6 @@ Package: @libx264N@ Architecture: any Multi-Arch: same -Pre-Depends: ${misc:Pre-Depends} Depends: ${misc:Depends}, ${shlibs:Depends} diff -Nru x264-0.148.2795+gitaaa9aa8/debian/rules x264-0.152.2854+gite9a5903/debian/rules --- x264-0.148.2795+gitaaa9aa8/debian/rules 2015-01-17 20:01:19.000000000 +0000 +++ x264-0.152.2854+gite9a5903/debian/rules 2018-01-19 11:35:17.000000000 +0000 @@ -13,7 +13,7 @@ debian/libx264-dev.install %: - dh $@ --parallel --with autotools_dev + dh $@ --parallel .PHONY: debian/control debian/control: diff -Nru x264-0.148.2795+gitaaa9aa8/debian/x264.1 x264-0.152.2854+gite9a5903/debian/x264.1 --- x264-0.148.2795+gitaaa9aa8/debian/x264.1 2017-06-28 06:31:44.000000000 +0000 +++ x264-0.152.2854+gite9a5903/debian/x264.1 2018-01-19 11:35:17.000000000 +0000 @@ -1,9 +1,9 @@ -.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.47.4. -.TH X264 "1" "June 2017" "Videolan project" "User Commands" +.\" DO NOT MODIFY THIS FILE! It was generated by help2man 1.47.5. 
+.TH X264 "1" "December 2017" "Videolan project" "User Commands" .SH NAME x264 \- fast h264 encoder .SH DESCRIPTION -x264 core:148 r2795 aaa9aa8 +x264 core:152 r2854 e9a5903 Syntax: x264 [options] \fB\-o\fR outfile infile .PP Infile can be raw (in which case resolution is required), @@ -564,8 +564,8 @@ Specify input colorspace format for raw input \- valid csps for `raw' demuxer: .TP -i420, yv12, nv12, nv21, i422, yv16, nv16, i444, -yv24, bgr, bgra, rgb +i420, yv12, nv12, nv21, i422, yv16, nv16, yuyv, +uyvy, i444, yv24, bgr, bgra, rgb .TP \- valid csps for `lavf' demuxer: yuv420p, yuyv422, rgb24, bgr24, yuv422p, @@ -612,7 +612,10 @@ bayer_grbg16be, yuv440p10le, yuv440p10be, yuv440p12le, yuv440p12be, ayuv64le, ayuv64be, videotoolbox_vld, p010le, p010be, gbrap12be, -gbrap12le, gbrap10be, gbrap10le, mediacodec +gbrap12le, gbrap10be, gbrap10le, mediacodec, +gray12be, gray12le, gray10be, gray10le, p016le, +p016be, d3d11, gray9be, gray9le, gbrpf32be, +gbrpf32le, gbrapf32be, gbrapf32le, drm_prime .TP \fB\-\-output\-csp\fR Specify output colorspace ["i420"] @@ -759,7 +762,7 @@ \- fittobox and sar: same as above except with specified sar \- csp: convert to the given csp. syntax: [name][:depth] .IP -\- valid csp names [keep current]: i420, yv12, nv12, nv21, i422, yv16, nv16, i444, yv24, bgr, bgra, rgb +\- valid csp names [keep current]: i420, yv12, nv12, nv21, i422, yv16, nv16, yuyv, uyvy, i444, yv24, bgr, bgra, rgb \- depth: 8 or 16 bits per pixel [keep current] .IP note: not all depths are supported by all csps. @@ -775,10 +778,10 @@ offsets: the offset into the step to select a frame see: http://avisynth.nl/index.php/Select#SelectEvery .PP -(libswscale 4.2.100) -(libavformat 57.56.101) +(libswscale 4.8.100) +(libavformat 57.83.100) (ffmpegsource 2.23.0.0) -built on Jun 28 2017, gcc: 6.3.0 20170618 +built on Dec 31 2017, gcc: 7.2.0 x264 configuration: \fB\-\-bit\-depth\fR=\fI\,8\/\fR \fB\-\-chroma\-format\fR=\fI\,all\/\fR libx264 configuration: \fB\-\-bit\-depth\fR=\fI\,10\/\fR \fB\-\-chroma\-format\fR=\fI\,all\/\fR x264 license: GPL version 2 or later diff -Nru x264-0.148.2795+gitaaa9aa8/debian/x264-get-orig-source x264-0.152.2854+gite9a5903/debian/x264-get-orig-source --- x264-0.148.2795+gitaaa9aa8/debian/x264-get-orig-source 2017-06-28 06:31:44.000000000 +0000 +++ x264-0.152.2854+gite9a5903/debian/x264-get-orig-source 2018-01-19 11:35:17.000000000 +0000 @@ -3,8 +3,8 @@ # Script used to generate the orig source tarball for x264. 
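# A quick sanity check of the sed expression below, assuming GNU sed: piping the new
# upstream commit through it,
#   echo e9a5903edf8ca59ef20e6f4894c196f135af735e | sed -e 's/^\([[:xdigit:]]\{,7\}\).*/\1/'
# prints "e9a5903", which matches the +gite9a5903 suffix in the new version string.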
X264_GIT_URL="git://git.videolan.org/x264.git" -X264_GIT_COMMIT="aaa9aa83a111ed6f1db253d5afa91c5fc844583f" -DATE_RETRIEVED="20170521" +X264_GIT_COMMIT="e9a5903edf8ca59ef20e6f4894c196f135af735e" +DATE_RETRIEVED="20171224" COMMIT_SHORT_FORM="$(echo $X264_GIT_COMMIT | \ sed -e 's/^\([[:xdigit:]]\{,7\}\).*/\1/')" diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/analyse.c x264-0.152.2854+gite9a5903/encoder/analyse.c --- x264-0.148.2795+gitaaa9aa8/encoder/analyse.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/analyse.c 2017-12-31 12:50:51.000000000 +0000 @@ -34,37 +34,23 @@ typedef struct { - /* 16x16 */ - int i_rd16x16; x264_me_t me16x16; x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */ - - /* 8x8 */ - int i_cost8x8; - /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */ - ALIGNED_4( int16_t mvc[32][5][2] ); x264_me_t me8x8[4]; - - /* Sub 4x4 */ - int i_cost4x4[4]; /* cost per 8x8 partition */ x264_me_t me4x4[4][4]; - - /* Sub 8x4 */ - int i_cost8x4[4]; /* cost per 8x8 partition */ x264_me_t me8x4[4][2]; - - /* Sub 4x8 */ - int i_cost4x8[4]; /* cost per 8x8 partition */ x264_me_t me4x8[4][2]; - - /* 16x8 */ - int i_cost16x8; x264_me_t me16x8[2]; - - /* 8x16 */ - int i_cost8x16; x264_me_t me8x16[2]; - + int i_rd16x16; + int i_cost8x8; + int i_cost4x4[4]; /* cost per 8x8 partition */ + int i_cost8x4[4]; /* cost per 8x8 partition */ + int i_cost4x8[4]; /* cost per 8x8 partition */ + int i_cost16x8; + int i_cost8x16; + /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */ + ALIGNED_4( int16_t mvc[32][5][2] ); } x264_mb_analysis_list_t; typedef struct @@ -278,29 +264,31 @@ static int init_costs( x264_t *h, float *logs, int qp ) { - int lambda = x264_lambda_tab[qp]; if( h->cost_mv[qp] ) return 0; + + int mv_range = h->param.analyse.i_mv_range; + int lambda = x264_lambda_tab[qp]; /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ - CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) ); - h->cost_mv[qp] += 2*4*2048; - for( int i = 0; i <= 2*4*2048; i++ ) + CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) ); + h->cost_mv[qp] += 2*4*mv_range; + for( int i = 0; i <= 2*4*mv_range; i++ ) { h->cost_mv[qp][-i] = - h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 ); + h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX ); } x264_pthread_mutex_lock( &cost_ref_mutex ); for( int i = 0; i < 3; i++ ) for( int j = 0; j < 33; j++ ) - x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 ); + x264_cost_ref[qp][i][j] = i ? 
X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0; x264_pthread_mutex_unlock( &cost_ref_mutex ); if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] ) { for( int j = 0; j < 4; j++ ) { - CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) ); - h->cost_mv_fpel[qp][j] += 2*2048; - for( int i = -2*2048; i < 2*2048; i++ ) + CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) ); + h->cost_mv_fpel[qp][j] += 2*mv_range; + for( int i = -2*mv_range; i < 2*mv_range; i++ ) h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j]; } } @@ -314,12 +302,13 @@ int x264_analyse_init_costs( x264_t *h ) { - float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) ); + int mv_range = h->param.analyse.i_mv_range; + float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) ); if( !logs ) return -1; logs[0] = 0.718f; - for( int i = 1; i <= 2*4*2048; i++ ) + for( int i = 1; i <= 2*4*mv_range; i++ ) logs[i] = log2f( i+1 ) * 2.0f + 1.718f; for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ ) @@ -338,13 +327,14 @@ void x264_analyse_free_costs( x264_t *h ) { + int mv_range = h->param.analyse.i_mv_range; for( int i = 0; i < QP_MAX+1; i++ ) { if( h->cost_mv[i] ) - x264_free( h->cost_mv[i] - 2*4*2048 ); + x264_free( h->cost_mv[i] - 2*4*mv_range ); if( h->cost_mv_fpel[i][0] ) for( int j = 0; j < 4; j++ ) - x264_free( h->cost_mv_fpel[i][j] - 2*2048 ); + x264_free( h->cost_mv_fpel[i][j] - 2*mv_range ); } } @@ -465,11 +455,10 @@ int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel /* Calculate max allowed MV range */ -#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 ) h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 ); h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 ); - h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] ); - h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] ); + h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range ); + h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 ); if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P ) { int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */ @@ -513,9 +502,8 @@ mb_y = (h->mb.i_mb_y >> j) + (i == 1); h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 ); h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 ); - h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range ); - h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] ); - h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 ); + h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range ); + h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range ); h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border; h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border; } @@ -524,9 +512,8 @@ { h->mb.mv_min[1] = 4*( -16*mb_y - 24 ); h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 ); - h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); - h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); - h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); + h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range ); + h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range ); h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; 
h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } @@ -541,7 +528,6 @@ h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } -#undef CLIP_FMV a->l0.me16x16.cost = a->l0.i_rd16x16 = @@ -713,8 +699,12 @@ x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 ); if( !h->mb.i_psy_rd ) return; - /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */ - h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) ); + + M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO; + M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO; + M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO; + M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO; + h->mb.pic.fenc_hadamard_cache[8] = 0; if( b_satd ) h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) ); } @@ -743,8 +733,8 @@ h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] ); h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] ); } - a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) - + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); return; } @@ -759,8 +749,8 @@ h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); - satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ); - satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ); + satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); for( ; *predict_mode >= 0; predict_mode++ ) { @@ -788,8 +778,8 @@ } /* we calculate the cost */ - i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + - h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) + + i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) + a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] ); a->i_satd_chroma_dir[i_mode] = i_satd; @@ -845,7 +835,7 @@ if( a->i_satd_i16x16 <= i16x16_thresh ) { h->predict_16x16[I_PRED_16x16_P]( p_dst ); - a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); + a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ); a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3); COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 ); } @@ -862,7 +852,7 @@ else h->predict_16x16[i_mode]( p_dst ); - i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) + + i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, 
p_dst, FDEC_STRIDE ) + lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] ); COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode ); a->i_satd_i16x16_dir[i_mode] = i_satd; @@ -1065,7 +1055,7 @@ else h->predict_4x4[i_mode]( p_dst_by ); - i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); + i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE ); if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) { i_satd -= lambda * 3; diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/cabac.c x264-0.152.2854+gite9a5903/encoder/cabac.c --- x264-0.148.2795+gitaaa9aa8/encoder/cabac.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/cabac.c 2017-12-31 12:50:51.000000000 +0000 @@ -801,7 +801,7 @@ static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { -#if ARCH_X86_64 && HAVE_MMX +#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ ) h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); @@ -915,7 +915,7 @@ static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { -#if ARCH_X86_64 && HAVE_MMX +#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ ) h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); @@ -923,7 +923,7 @@ } static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { -#if ARCH_X86_64 && HAVE_MMX +#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ ) h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); @@ -1057,29 +1057,29 @@ src = dst; #define MUNGE_8x8_NNZ( MUNGE )\ -if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\ +if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] && !(h->mb.cbp[h->mb.i_mb_left_xy[0]] & 0x1000) )\ {\ - MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\ - MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\ - MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\ - MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\ - MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\ - MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\ + MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x00 )\ + MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x00 )\ + MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x00 )\ + MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x00 )\ + MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x00 )\ + MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x00 )\ }\ -if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\ +if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] && !(h->mb.cbp[h->mb.i_mb_left_xy[1]] & 0x1000) )\ {\ - MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\ - MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 
0x80 )\ - MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\ - MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\ - MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\ - MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\ + MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x00 )\ + MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x00 )\ + MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x00 )\ + MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x00 )\ + MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x00 )\ + MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x00 )\ }\ -if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\ +if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] && !(h->mb.cbp[h->mb.i_mb_top_xy] & 0x1000) )\ {\ - MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\ - MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\ - MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\ + MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x00000000U )\ + MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x00000000U )\ + MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x00000000U )\ } MUNGE_8x8_NNZ( BACKUP ) diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/encoder.c x264-0.152.2854+gite9a5903/encoder/encoder.c --- x264-0.148.2795+gitaaa9aa8/encoder/encoder.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/encoder.c 2017-12-31 12:50:51.000000000 +0000 @@ -444,11 +444,6 @@ fail = 1; } #endif - if( !fail && !(cpuflags & X264_CPU_CMOV) ) - { - x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n"); - fail = 1; - } if( fail ) { x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); @@ -494,7 +489,8 @@ #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/YUYV/UYVY/" + "I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } @@ -859,6 +855,11 @@ h->param.analyse.inter &= ~X264_ANALYSE_I8x8; h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } + if( i_csp >= X264_CSP_I444 && h->param.b_cabac ) + { + /* Disable 8x8dct during 4:4:4+CABAC encoding for compatibility with libavcodec */ + h->param.analyse.b_transform_8x8 = 0; + } if( h->param.rc.i_rc_method == X264_RC_CQP ) { float qp_p = h->param.rc.i_qp_constant; @@ -1170,7 +1171,7 @@ if( h->param.analyse.i_mv_range <= 0 ) h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED; else - h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED); + h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 8192 >> PARAM_INTERLACED); } h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); @@ -1530,6 +1531,12 @@ x264_rdo_init(); /* init 
CPU functions */ +#if (ARCH_X86 || ARCH_X86_64) && HIGH_BIT_DEPTH + /* FIXME: Only 8-bit has been optimized for AVX-512 so far. The few AVX-512 functions + * enabled in high bit-depth are insignificant and just causes potential issues with + * unnecessary thermal throttling and whatnot, so keep it disabled for now. */ + h->param.cpu &= ~X264_CPU_AVX512; +#endif x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); @@ -1566,9 +1573,15 @@ if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; + if( !strcmp(x264_cpu_names[i].name, "LZCNT") + && (h->param.cpu & X264_CPU_BMI1) ) + continue; if( !strcmp(x264_cpu_names[i].name, "BMI1") && (h->param.cpu & X264_CPU_BMI2) ) continue; + if( !strcmp(x264_cpu_names[i].name, "FMA4") + && (h->param.cpu & X264_CPU_FMA3) ) + continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); @@ -1580,14 +1593,6 @@ if( x264_analyse_init_costs( h ) ) goto fail; - static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 }; - /* Checks for known miscompilation issues. */ - if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] ) - { - x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" ); - goto fail; - } - /* Must be volatile or else GCC will optimize it out. */ volatile int temp = 392; if( x264_clz( temp ) != 23 ) diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/macroblock.c x264-0.152.2854+gite9a5903/encoder/macroblock.c --- x264-0.148.2795+gitaaa9aa8/encoder/macroblock.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/macroblock.c 2017-12-31 12:50:51.000000000 +0000 @@ -128,8 +128,8 @@ pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_32( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -283,13 +283,10 @@ if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction ) { int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6; - int ssd[2]; + ALIGNED_ARRAY_8( int, ssd,[2] ); int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8; - int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); - if( score < thresh*4 ) - score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); - if( score < thresh*4 ) + if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 ) { h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -350,7 +347,7 @@ int i_decimate_score = b_decimate ? 
0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -561,9 +558,16 @@ pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride; if( i_mode == I_PRED_4x4_V ) + { h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 ); + memcpy( p_dst, p_dst-FDEC_STRIDE, 4*sizeof(pixel) ); + } else if( i_mode == I_PRED_4x4_H ) + { h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 ); + for( int i = 0; i < 4; i++ ) + p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; + } else h->predict_4x4[i_mode]( p_dst ); } @@ -574,9 +578,16 @@ pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride; if( i_mode == I_PRED_8x8_V ) + { h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 ); + memcpy( p_dst, &edge[16], 8*sizeof(pixel) ); + } else if( i_mode == I_PRED_8x8_H ) + { h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 ); + for( int i = 0; i < 8; i++ ) + p_dst[i*FDEC_STRIDE] = edge[14-i]; + } else h->predict_8x8[i_mode]( p_dst, edge ); } @@ -584,12 +595,21 @@ void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ) { int stride = h->fenc->i_stride[p] << MB_INTERLACED; + pixel *p_dst = h->mb.pic.p_fdec[p]; + if( i_mode == I_PRED_16x16_V ) - h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 ); + { + h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 ); + memcpy( p_dst, p_dst-FDEC_STRIDE, 16*sizeof(pixel) ); + } else if( i_mode == I_PRED_16x16_H ) - h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 ); + { + h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 ); + for( int i = 0; i < 16; i++ ) + p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; + } else - h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] ); + h->predict_16x16[i_mode]( p_dst ); } /***************************************************************************** @@ -780,7 +800,7 @@ } else if( h->mb.b_transform_8x8 ) { - ALIGNED_ARRAY_32( dctcoef, dct8x8,[4],[64] ); + ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] ); b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) @@ -824,7 +844,7 @@ } else { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? CQM_4PC : CQM_4PY; @@ -965,8 +985,8 @@ *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[8],[16] ); - ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_64( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); int i_qp = h->mb.i_qp; @@ -1219,7 +1239,7 @@ int quant_cat = p ? 
CQM_8PC : CQM_8PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; - ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 ); @@ -1252,7 +1272,7 @@ pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; int i_decimate_8x8 = b_decimate ? 0 : 4; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[4],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] ); int nnz8x8 = 0; h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); @@ -1311,7 +1331,7 @@ i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[2],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] ); pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; @@ -1376,7 +1396,7 @@ } else { - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/macroblock.h x264-0.152.2854+gite9a5903/encoder/macroblock.h --- x264-0.148.2795+gitaaa9aa8/encoder/macroblock.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/macroblock.h 2017-12-31 12:50:51.000000000 +0000 @@ -116,7 +116,7 @@ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_32( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); if( b_predict ) { @@ -154,7 +154,7 @@ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; - ALIGNED_ARRAY_32( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict ) diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/me.c x264-0.152.2854+gite9a5903/encoder/me.c --- x264-0.148.2795+gitaaa9aa8/encoder/me.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/me.c 2017-12-31 12:50:51.000000000 +0000 @@ -1059,7 +1059,7 @@ uint64_t bcostrd = COST_MAX64; uint16_t amvd; /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ - ALIGNED_ARRAY_32( uint8_t, visited,[8],[8][8] ); + ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] ); /* all permutations of an offset in up to 2 of the dimensions */ ALIGNED_4( static const int8_t dia4d[33][4] ) = { diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/me.h x264-0.152.2854+gite9a5903/encoder/me.h --- x264-0.148.2795+gitaaa9aa8/encoder/me.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/me.h 2017-12-31 12:50:51.000000000 +0000 @@ -32,10 +32,10 @@ typedef struct { - /* aligning the first member is a gcc hack to force the struct to be - * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */ + /* aligning the first member is a gcc hack to force the struct to be aligned, + * as well as force sizeof(struct) to be a multiple of the alignment. 
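The comment being edited here (its closing */ follows in the next hunk line) describes the trick of over-aligning the first member so that the whole struct, and therefore sizeof(x264_me_t), becomes a multiple of the alignment. A self-contained sketch of the same idea using C11 alignas instead of x264's ALIGNED_64 macro; the demo type name is made up.

    #include <stdalign.h>
    #include <stdio.h>

    typedef struct
    {
        alignas(64) int i_pixel;   /* over-aligned first member, like ALIGNED_64( int i_pixel ) */
        int other_fields[3];
    } demo_me_t;

    int main( void )
    {
        /* sizeof() is always a multiple of the type's alignment, so arrays of
         * demo_me_t keep every element 64-byte aligned as well. */
        printf( "align=%zu size=%zu\n", alignof(demo_me_t), sizeof(demo_me_t) );
        return 0;
    }

The bump from 16 to 64 bytes lines up with the rest of this patch, which raises many buffer alignments to 64 for the new AVX-512 code paths.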
*/ /* input */ - ALIGNED_16( int i_pixel ); /* PIXEL_WxH */ + ALIGNED_64( int i_pixel ); /* PIXEL_WxH */ uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */ int i_ref_cost; int i_ref; @@ -53,7 +53,7 @@ int cost_mv; /* lambda * nbits for the chosen mv */ int cost; /* satd + lambda * nbits */ ALIGNED_4( int16_t mv[2] ); -} ALIGNED_16( x264_me_t ); +} ALIGNED_64( x264_me_t ); void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh ); #define x264_me_search( h, m, mvc, i_mvc )\ @@ -66,8 +66,6 @@ void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ); uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); -extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4]; - #define COPY1_IF_LT(x,y)\ if( (y) < (x) )\ (x) = (y); diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/ratecontrol.c x264-0.152.2854+gite9a5903/encoder/ratecontrol.c --- x264-0.148.2795+gitaaa9aa8/encoder/ratecontrol.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/ratecontrol.c 2017-12-31 12:50:51.000000000 +0000 @@ -420,7 +420,7 @@ float dstdim[2] = { h->param.i_width / 16.f, h->param.i_height / 16.f}; int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])}; int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])}; - if( PARAM_INTERLACED ) + if( h->param.b_interlaced || h->param.b_fake_interlaced ) { srcdimi[1] = (srcdimi[1]+1)&~1; dstdimi[1] = (dstdimi[1]+1)&~1; @@ -1469,7 +1469,7 @@ if( h->i_frame == 0 ) { //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR - double fr = 1. / 172; + double fr = 1. / (h->param.i_level_idc >= 60 ? 300 : 172); int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height; rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr; } diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/rdo.c x264-0.152.2854+gite9a5903/encoder/rdo.c --- x264-0.148.2795+gitaaa9aa8/encoder/rdo.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/rdo.c 2017-12-31 12:50:51.000000000 +0000 @@ -64,9 +64,8 @@ #include "cabac.c" #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \ - sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) ) -#define COPY_CABAC_PART( pos, size )\ - memcpy( &cb->state[pos], &h->cabac.state[pos], size ) + sizeof(int) + (CHROMA444 ? 1024+12 : 460) ) +#define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size ) static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y ) { @@ -634,8 +633,8 @@ const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { - ALIGNED_ARRAY_32( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_32( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? 
x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; @@ -695,7 +694,7 @@ return !!dct[0]; } -#if HAVE_MMX && ARCH_X86_64 +#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ ) #define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\ cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8) if( num_coefs == 16 && !dc ) diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/set.c x264-0.152.2854+gite9a5903/encoder/set.c --- x264-0.148.2795+gitaaa9aa8/encoder/set.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/set.c 2017-12-31 12:50:51.000000000 +0000 @@ -783,23 +783,26 @@ const x264_level_t x264_levels[] = { - { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 }, - { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */ - { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 }, - { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 }, - { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 }, - { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 }, - { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, - { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, - { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 }, - { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 }, - { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 }, - { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 }, - { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 }, - { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 }, - { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 }, - { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, - { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, + { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 }, + { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */ + { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 }, + { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 }, + { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 }, + { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 }, + { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, + { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, + { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 }, + { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 }, + { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 }, + { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 }, + { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 }, + { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 }, + { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 }, + { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, + { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, + { 60, 4177920, 139264, 696320, 240000, 240000, 8192, 16, 24, 2, 1, 1, 1 }, + { 61, 8355840, 139264, 696320, 480000, 480000, 8192, 16, 24, 2, 1, 1, 1 }, + { 62, 16711680, 139264, 696320, 800000, 800000, 8192, 16, 24, 2, 1, 1, 1 }, { 0 } }; diff -Nru x264-0.148.2795+gitaaa9aa8/encoder/slicetype.c x264-0.152.2854+gite9a5903/encoder/slicetype.c --- x264-0.148.2795+gitaaa9aa8/encoder/slicetype.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/encoder/slicetype.c 2017-12-31 12:50:51.000000000 +0000 @@ -267,7 +267,7 @@ int i_lines = 
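Referring back to the encoder/ratecontrol.c change and the new level 6 rows in encoder/set.c earlier in this hunk group, here is a worked example of the relaxed first-frame size cap; the 8-bit 7680x4320 input is assumed purely for illustration, and the constants come from the level 6.0 table row.

    #include <stdio.h>

    int main( void )
    {
        int bit_depth = 8;
        int pic_size_in_mbs = (7680/16) * (4320/16);   /* 480 * 270 = 129600 */
        double fr = 1.0 / 300;                         /* level_idc >= 60 */
        int mbps = 4177920, mincr = 2;                 /* level 6.0 limits */
        double mbs = pic_size_in_mbs > fr*mbps ? pic_size_in_mbs : fr*mbps;
        double frame_size_maximum = 384.0 * bit_depth * mbs / mincr;
        printf( "%.0f bits (~%.1f MiB)\n", frame_size_maximum, frame_size_maximum/8/1048576 );
        return 0;   /* 199065600 bits, roughly 23.7 MiB */
    }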
fenc->i_lines[p]; int i_width = fenc->i_width[p]; pixel *src = fenc->plane[p]; - ALIGNED_ARRAY_16( pixel, buf, [16*16] ); + ALIGNED_ARRAY_64( pixel, buf, [16*16] ); int pixoff = 0; if( w ) { @@ -544,17 +544,18 @@ if( p0 == p1 ) goto lowres_intra_mb; + int mv_range = 2 * h->param.analyse.i_mv_range; // no need for h->mb.mv_min[] - h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4; - h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; - h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 ); - h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 ); + h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range ); + h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 ); + h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2; + h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2; if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { - h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4; - h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; - h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 ); - h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 ); + h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range ); + h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 ); + h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2; } #define LOAD_HPELS_LUMA(dst, src) \ @@ -728,13 +729,13 @@ if( h->param.analyse.i_subpel_refine > 1 ) { h->predict_8x8c[I_PRED_CHROMA_P]( pix ); - int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ); + int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ); i_icost = X264_MIN( i_icost, satd ); h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); for( int i = 3; i < 9; i++ ) { h->predict_8x8[i]( pix, edge ); - satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ); + satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ); i_icost = X264_MIN( i_icost, satd ); } } diff -Nru x264-0.148.2795+gitaaa9aa8/filters/video/resize.c x264-0.152.2854+gite9a5903/filters/video/resize.c --- x264-0.148.2795+gitaaa9aa8/filters/video/resize.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/filters/video/resize.c 2017-12-31 12:50:51.000000000 +0000 @@ -154,10 +154,12 @@ case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48 : AV_PIX_FMT_RGB24; case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48 : AV_PIX_FMT_BGR24; case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA; - /* the next csp has no equivalent 16bit depth in swscale */ + /* the following has no equivalent 16-bit depth in swscale */ case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12; case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21; - /* the next csp is no supported by swscale at all */ + case X264_CSP_YUYV: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_YUYV422; + case X264_CSP_UYVY: return csp&X264_CSP_HIGH_DEPTH ? 
AV_PIX_FMT_NONE : AV_PIX_FMT_UYVY422; + /* the following is not supported by swscale at all */ case X264_CSP_NV16: default: return AV_PIX_FMT_NONE; } diff -Nru x264-0.148.2795+gitaaa9aa8/input/input.c x264-0.152.2854+gite9a5903/input/input.c --- x264-0.148.2795+gitaaa9aa8/input/input.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/input/input.c 2017-12-31 12:50:51.000000000 +0000 @@ -43,6 +43,8 @@ [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, + [X264_CSP_YUYV] = { "yuyv", 1, { 2 }, { 1 }, 2, 1 }, + [X264_CSP_UYVY] = { "uyvy", 1, { 2 }, { 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }, [X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 }, diff -Nru x264-0.148.2795+gitaaa9aa8/tools/checkasm-a.asm x264-0.152.2854+gite9a5903/tools/checkasm-a.asm --- x264-0.148.2795+gitaaa9aa8/tools/checkasm-a.asm 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/tools/checkasm-a.asm 2017-12-31 12:50:51.000000000 +0000 @@ -225,3 +225,14 @@ leave RET +; Trigger a warmup of vector units +%macro WARMUP 0 +cglobal checkasm_warmup, 0,0 + xorps m0, m0 + RET +%endmacro + +INIT_YMM avx +WARMUP +INIT_ZMM avx512 +WARMUP diff -Nru x264-0.148.2795+gitaaa9aa8/tools/checkasm.c x264-0.152.2854+gite9a5903/tools/checkasm.c --- x264-0.148.2795+gitaaa9aa8/tools/checkasm.c 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/tools/checkasm.c 2017-12-31 12:50:51.000000000 +0000 @@ -57,8 +57,7 @@ if( !ok ) ret = -1; \ } -#define BENCH_RUNS 100 // tradeoff between accuracy and speed -#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff) +#define BENCH_RUNS 2000 // tradeoff between accuracy and speed #define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions #define MAX_CPUS 30 // number of different combinations of cpu flags @@ -178,7 +177,10 @@ continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX + b->cpu&X264_CPU_AVX512 ? "avx512" : b->cpu&X264_CPU_AVX2 ? "avx2" : + b->cpu&X264_CPU_BMI2 ? "bmi2" : + b->cpu&X264_CPU_BMI1 ? "bmi1" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : @@ -187,6 +189,7 @@ b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : + b->cpu&X264_CPU_LZCNT ? "lzcnt" : /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && jcpu&X264_CPU_SSE2 ? "sse2" : @@ -209,10 +212,7 @@ b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : - b->cpu&X264_CPU_LZCNT ? "_lzcnt" : - b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" : + b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" : b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : #elif ARCH_ARM b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : @@ -222,8 +222,18 @@ } } +/* YMM and ZMM registers on x86 are turned off to save power when they haven't been + * used for some period of time. When they are used there will be a "warmup" period + * during which performance will be reduced and inconsistent which is problematic when + * trying to benchmark individual functions. 
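The comment above (it continues in the next hunk line) explains why benchmarking wide SIMD is tricky; the rest of the checkasm changes then route every timed call through simd_warmup(), backed by the tiny x264_checkasm_warmup_avx/avx512 stubs added to checkasm-a.asm. A stand-alone sketch of the same pattern, assuming GCC/Clang on x86 with -mavx; everything other than __rdtsc() and the AVX intrinsics is an illustrative name, and the real warmup stubs are hand-written asm rather than intrinsics.

    #include <stdint.h>
    #include <stdio.h>
    #include <x86intrin.h>   /* __rdtsc() and AVX intrinsics (GCC/Clang) */

    static volatile float sink;

    static void warmup_avx( void )
    {
        __m256 v = _mm256_set1_ps( 1.0f );   /* dummy 256-bit op to power up the vector unit */
        sink = _mm256_cvtss_f32( v );        /* keep it from being optimized away */
    }

    static void (*simd_warmup_hook)( void ) = warmup_avx;   /* would stay NULL without AVX */

    static void nothing( void ) {}

    static uint64_t bench_once( void (*func)( void ) )
    {
        if( simd_warmup_hook )
            simd_warmup_hook();              /* warm up right before the timed region */
        uint64_t t0 = __rdtsc();
        func();
        return __rdtsc() - t0;
    }

    int main( void )
    {
        printf( "%llu cycles\n", (unsigned long long)bench_once( nothing ) );
        return 0;
    }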
We can work around this by periodically + * issuing "dummy" instructions that uses those registers to keep them powered on. */ +static void (*simd_warmup_func)( void ) = NULL; +#define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 ) + #if ARCH_X86 || ARCH_X86_64 int x264_stack_pagealign( int (*func)(), int align ); +void x264_checkasm_warmup_avx( void ); +void x264_checkasm_warmup_avx512( void ); /* detect when callee-saved regs aren't saved * needs an explicit asm check because it only sometimes crashes in normal use. */ @@ -258,6 +268,7 @@ #define call_a1(func,...) ({ \ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \ + simd_warmup(); \ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); }) #elif ARCH_AARCH64 && !defined(__APPLE__) void x264_checkasm_stack_clobber( uint64_t clobber, ... ); @@ -285,6 +296,7 @@ call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ {\ + simd_warmup();\ uint32_t t = read_time();\ func(__VA_ARGS__);\ func(__VA_ARGS__);\ @@ -358,8 +370,9 @@ used_asm = 1; \ for( int j = 0; j < 64; j++ ) \ { \ - res_c = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \ - res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \ + intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; \ + res_c = call_c( pixel_c.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \ + res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ @@ -494,15 +507,17 @@ #define TEST_PIXEL_VAR2( i ) \ if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \ { \ - int res_c, res_asm, ssd_c, ssd_asm; \ + int res_c, res_asm; \ + ALIGNED_ARRAY_8( int, ssd_c, [2] ); \ + ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \ set_func_name( "%s_%s", "var2", pixel_names[i] ); \ used_asm = 1; \ - res_c = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \ - res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \ - if( res_c != res_asm || ssd_c != ssd_asm ) \ + res_c = call_c( pixel_c.var2[i], pbuf1, pbuf2, ssd_c ); \ + res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \ + if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \ { \ ok = 0; \ - fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \ + fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \ } \ } @@ -827,10 +842,10 @@ x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_ARRAY_32( dctcoef, dct1, [16],[16] ); - ALIGNED_ARRAY_32( dctcoef, dct2, [16],[16] ); - ALIGNED_ARRAY_32( dctcoef, dct4, [16],[16] ); - ALIGNED_ARRAY_32( dctcoef, dct8, [4],[64] ); + ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -1032,8 +1047,8 @@ x264_zigzag_function_t zigzag_ref[2]; x264_zigzag_function_t zigzag_asm[2]; - ALIGNED_ARRAY_16( dctcoef, level1,[64] ); - ALIGNED_ARRAY_16( dctcoef, level2,[64] ); + ALIGNED_ARRAY_64( dctcoef, level1,[64] ); + ALIGNED_ARRAY_64( dctcoef, level2,[64] ); #define TEST_ZIGZAG_SCAN( 
name, t1, t2, dct, size ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ @@ -1526,6 +1541,33 @@ } } + if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv ) + { + set_func_name( "plane_copy_deinterleave_yuyv" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t dst_stride = ALIGN( w, 32/sizeof(pixel) ); + intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1; + intptr_t offv = dst_stride*h; + pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1); + memset( pbuf3, 0, 0x1000 ); + memset( pbuf4, 0, 0x1000 ); + /* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */ + call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h ); + call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) || + memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(pixel) ) ) + { + fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb ) { set_func_name( "plane_copy_deinterleave_rgb" ); @@ -1566,7 +1608,7 @@ { int w = (plane_specs[i].w + 1) >> 1; int h = plane_specs[i].h; - intptr_t dst_stride = ALIGN( w, 16 ); + intptr_t dst_stride = ALIGN( w, 32 ); intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t); intptr_t offv = dst_stride*h + 32; memset( pbuf3, 0, 0x1000 ); @@ -1704,7 +1746,7 @@ { ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; if( !ok ) - fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] ); + fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] ); } } } @@ -1723,15 +1765,16 @@ h.mb.i_mb_width = width; h.mb.i_mb_height = height; - uint16_t *ref_costsc = (uint16_t*)buf3; - uint16_t *ref_costsa = (uint16_t*)buf4; - int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size); + uint16_t *ref_costsc = (uint16_t*)buf3 + width; + uint16_t *ref_costsa = (uint16_t*)buf4 + width; + int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size); int16_t *propagate_amount = (int16_t*)(mvs + width); uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width); - h.scratch_buffer2 = (uint8_t*)(ref_costsa + size); + h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size); int bipred_weight = (rand()%63)+1; + int mb_y = rand()&3; int list = i&1; - for( int j = 0; j < size; j++ ) + for( int j = -width; j < size+width; j++ ) ref_costsc[j] = ref_costsa[j] = rand()&32767; for( int j = 0; j < width; j++ ) { @@ -1742,18 +1785,18 @@ lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT; } - call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); - call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); + call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); - for( int j = 0; j < size && ok; j++ ) + for( int j = 
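The new plane_copy_deinterleave_yuyv test near the start of this hunk verifies a kernel that, as its in-test comment notes, performs the same work as plane_copy_deinterleave: splitting interleaved samples into two planes, which for YUYV input yields the luma plane and a still U/V-interleaved chroma plane. A plain scalar model of that split; the function name and byte-unit strides are assumptions for illustration, not x264's implementation.

    #include <stdint.h>

    void deinterleave_pairs( uint8_t *dsta, intptr_t dsta_stride,
                             uint8_t *dstb, intptr_t dstb_stride,
                             const uint8_t *src, intptr_t src_stride,
                             int w, int h )
    {
        for( int y = 0; y < h; y++ )
        {
            for( int x = 0; x < w; x++ )
            {
                dsta[x] = src[2*x];      /* even samples, e.g. Y of YUYV */
                dstb[x] = src[2*x+1];    /* odd samples, e.g. the interleaved U/V */
            }
            dsta += dsta_stride;
            dstb += dstb_stride;
            src  += src_stride;
        }
    }

This kind of split is what lets the new X264_CSP_YUYV and X264_CSP_UYVY inputs declared later in this patch be unpacked into x264's internal planar layout.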
-width; j < size+width && ok; j++ ) { ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1; if( !ok ) fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] ); } - call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); - call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); + call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); } } @@ -1816,12 +1859,14 @@ { set_func_name( "memcpy_aligned" ); ok = 1; used_asm = 1; - for( size_t size = 16; size < 256; size += 16 ) + for( size_t size = 16; size < 512; size += 16 ) { - memset( buf4, 0xAA, size + 1 ); + for( int i = 0; i < size; i++ ) + buf1[i] = rand(); + memset( buf4-1, 0xAA, size + 2 ); call_c( mc_c.memcpy_aligned, buf3, buf1, size ); call_a( mc_a.memcpy_aligned, buf4, buf1, size ); - if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA ) + if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA ) { ok = 0; fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size ); @@ -1837,10 +1882,10 @@ ok = 1; used_asm = 1; for( size_t size = 128; size < 1024; size += 128 ) { - memset( buf4, 0xAA, size + 1 ); + memset( buf4-1, 0xAA, size + 2 ); call_c( mc_c.memzero_aligned, buf3, size ); call_a( mc_a.memzero_aligned, buf4, size ); - if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA ) + if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA ) { ok = 0; fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size ); @@ -1920,9 +1965,12 @@ if( db_a.deblock_strength != db_ref.deblock_strength ) { + set_func_name( "deblock_strength" ); + used_asm = 1; for( int i = 0; i < 100; i++ ) { - ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] ); + ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] ); + uint8_t *nnz = &nnz_buf[8]; ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] ); ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] ); @@ -1934,9 +1982,8 @@ { ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2; for( int l = 0; l < 2; l++ ) - mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512; + mv[j][k][l] = ((rand()&7) != 7) ? 
(rand()&7) - 3 : (rand()&16383) - 8192; } - set_func_name( "deblock_strength" ); call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) ); call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) ); if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) ) @@ -1969,8 +2016,8 @@ x264_quant_function_t qf_c; x264_quant_function_t qf_ref; x264_quant_function_t qf_a; - ALIGNED_ARRAY_32( dctcoef, dct1,[64] ); - ALIGNED_ARRAY_32( dctcoef, dct2,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct1,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct2,[64] ); ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] ); ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] ); ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] ); @@ -2214,7 +2261,7 @@ int max = X264_MIN( i, PIXEL_MAX*16 ); \ for( int j = 0; j < size; j++ ) \ dct1[j] = rand()%(max*2+1) - max; \ - for( int j = 0; i <= size; j += 4 ) \ + for( int j = 0; j <= size; j += 4 ) \ qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \ memcpy( dct2, dct1, size*sizeof(dctcoef) ); \ res_c = call_c1( qf_c.optname, dct1, dmf ); \ @@ -2575,6 +2622,11 @@ x264_quant_init( &h, cpu_new, &h.quantf ); h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4; +/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */ +#define GET_CB( i ) (\ + x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\ + cb[i].f8_bits_encoded = 0, &cb[i] ) + #define CABAC_RESIDUAL(name, start, end, rd)\ {\ if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\ @@ -2587,7 +2639,7 @@ {\ for( int j = 0; j < 256; j++ )\ {\ - ALIGNED_ARRAY_32( dctcoef, dct, [2],[64] );\ + ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\ uint8_t bitstream[2][1<<16];\ static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\ int ac = ctx_ac[ctx_block_cat];\ @@ -2610,13 +2662,9 @@ x264_cabac_t cb[2];\ x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\ x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\ - x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\ - x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\ - cb[0].f8_bits_encoded = 0;\ - cb[1].f8_bits_encoded = 0;\ if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\ - call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ - call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\ + call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\ ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\ if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ if( !ok )\ @@ -2629,8 +2677,8 @@ }\ if( (j&15) == 0 )\ {\ - call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ - call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\ + call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\ }\ }\ }\ @@ -2757,6 +2805,14 @@ int ret = 0; int cpu0 = 0, cpu1 = 0; uint32_t cpu_detect = x264_cpu_detect(); +#if ARCH_X86 || ARCH_X86_64 + if( cpu_detect & X264_CPU_AVX512 ) + simd_warmup_func = x264_checkasm_warmup_avx512; + else if( cpu_detect & X264_CPU_AVX ) + simd_warmup_func = x264_checkasm_warmup_avx; +#endif + simd_warmup(); + #if HAVE_MMX if( 
cpu_detect & X264_CPU_MMX2 ) { @@ -2767,13 +2823,6 @@ ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); - cpu1 &= ~X264_CPU_SLOW_CTZ; } if( cpu_detect & X264_CPU_SSE ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" ); @@ -2785,13 +2834,11 @@ cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" ); cpu1 &= ~X264_CPU_SLOW_SHUFFLE; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); - cpu1 &= ~X264_CPU_SLOW_CTZ; - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } + } + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; } if( cpu_detect & X264_CPU_SSE3 ) { @@ -2805,8 +2852,6 @@ cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" ); cpu1 &= ~X264_CPU_SLOW_SHUFFLE; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" ); - cpu1 &= ~X264_CPU_SLOW_CTZ; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; @@ -2831,29 +2876,15 @@ cpu1 &= ~X264_CPU_FMA4; } if( cpu_detect & X264_CPU_FMA3 ) - { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } - if( cpu_detect & X264_CPU_AVX2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" ); - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } - } if( cpu_detect & X264_CPU_BMI1 ) - { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; - } if( cpu_detect & X264_CPU_BMI2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); - cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); - } + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); + if( cpu_detect & X264_CPU_AVX2 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + if( cpu_detect & X264_CPU_AVX512 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" ); #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { @@ -2883,8 +2914,6 @@ int main(int argc, char *argv[]) { - int ret = 0; - #ifdef _WIN32 /* Disable the Windows Error Reporting dialog */ SetErrorMode( SEM_NOGPFAULTERRORBOX ); @@ -2910,8 +2939,8 @@ fprintf( stderr, "x264: using random seed %u\n", seed ); srand( seed ); - buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS ); - pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS ); + buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) ); + pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) ); if( !buf1 || !pbuf1 ) { fprintf( stderr, "malloc failed, unable to initiate tests!\n" ); @@ -2932,21 +2961,7 @@ } memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) ); - /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ - if( do_bench ) - for( int i = 0; i < BENCH_ALIGNS && !ret; i++ ) - { - INIT_POINTER_OFFSETS; - ret |= x264_stack_pagealign( check_all_flags, i*32 ); - buf1 += 32; - pbuf1 += 32; - quiet = 1; - fprintf( stderr, "%d/%d\r", i+1, 
BENCH_ALIGNS ); - } - else - ret = x264_stack_pagealign( check_all_flags, 0 ); - - if( ret ) + if( x264_stack_pagealign( check_all_flags, 0 ) ) { fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" ); return -1; diff -Nru x264-0.148.2795+gitaaa9aa8/version.sh x264-0.152.2854+gite9a5903/version.sh --- x264-0.148.2795+gitaaa9aa8/version.sh 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/version.sh 2017-12-31 12:50:51.000000000 +0000 @@ -1,5 +1,5 @@ #!/bin/sh # Script modified from upstream source for Debian packaging since packaging # won't include .git repository. -echo '#define X264_VERSION " r2795 aaa9aa8"' -echo '#define X264_POINTVER "0.148.2795 aaa9aa8"' +echo '#define X264_VERSION " r2854 e9a5903"' +echo '#define X264_POINTVER "0.152.2854 e9a5903"' diff -Nru x264-0.148.2795+gitaaa9aa8/x264.h x264-0.152.2854+gite9a5903/x264.h --- x264-0.148.2795+gitaaa9aa8/x264.h 2017-06-28 06:24:56.000000000 +0000 +++ x264-0.152.2854+gite9a5903/x264.h 2017-12-31 12:50:51.000000000 +0000 @@ -45,7 +45,7 @@ #include "x264_config.h" -#define X264_BUILD 148 +#define X264_BUILD 152 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -119,39 +119,38 @@ /* CPU flags */ /* x86 */ -#define X264_CPU_CMOV 0x0000001 -#define X264_CPU_MMX 0x0000002 -#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ -#define X264_CPU_MMXEXT X264_CPU_MMX2 -#define X264_CPU_SSE 0x0000008 -#define X264_CPU_SSE2 0x0000010 -#define X264_CPU_SSE3 0x0000020 -#define X264_CPU_SSSE3 0x0000040 -#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ -#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ -#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */ -#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. 
*/ -#define X264_CPU_XOP 0x0000800 /* AMD XOP */ -#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ -#define X264_CPU_FMA3 0x0002000 /* FMA3 */ -#define X264_CPU_AVX2 0x0004000 /* AVX2 */ -#define X264_CPU_BMI1 0x0008000 /* BMI1 */ -#define X264_CPU_BMI2 0x0010000 /* BMI2 */ +#define X264_CPU_MMX (1<<0) +#define X264_CPU_MMX2 (1<<1) /* MMX2 aka MMXEXT aka ISSE */ +#define X264_CPU_MMXEXT X264_CPU_MMX2 +#define X264_CPU_SSE (1<<2) +#define X264_CPU_SSE2 (1<<3) +#define X264_CPU_LZCNT (1<<4) +#define X264_CPU_SSE3 (1<<5) +#define X264_CPU_SSSE3 (1<<6) +#define X264_CPU_SSE4 (1<<7) /* SSE4.1 */ +#define X264_CPU_SSE42 (1<<8) /* SSE4.2 */ +#define X264_CPU_AVX (1<<9) /* Requires OS support even if YMM registers aren't used */ +#define X264_CPU_XOP (1<<10) /* AMD XOP */ +#define X264_CPU_FMA4 (1<<11) /* AMD FMA4 */ +#define X264_CPU_FMA3 (1<<12) +#define X264_CPU_BMI1 (1<<13) +#define X264_CPU_BMI2 (1<<14) +#define X264_CPU_AVX2 (1<<15) +#define X264_CPU_AVX512 (1<<16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */ /* x86 modifiers */ -#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */ -#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */ -#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */ -#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */ -#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ -#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */ -#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */ -#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow +#define X264_CPU_CACHELINE_32 (1<<17) /* avoid memory loads that span the border between two cachelines */ +#define X264_CPU_CACHELINE_64 (1<<18) /* 32/64 is the size of a cacheline in bytes */ +#define X264_CPU_SSE2_IS_SLOW (1<<19) /* avoid most SSE2 functions on Athlon64 */ +#define X264_CPU_SSE2_IS_FAST (1<<20) /* a few functions are only faster on Core2 and Phenom */ +#define X264_CPU_SLOW_SHUFFLE (1<<21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X264_CPU_STACK_MOD4 (1<<22) /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SLOW_ATOM (1<<23) /* The Atom is terrible: slow SSE unaligned loads, slow * SIMD multiplies, slow SIMD variable shifts, slow pshufb, * cacheline split penalties -- gather everything here that * isn't shared by other CPUs to avoid making half a dozen * new SLOW flags. 
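With the x86 flags renumbered as single bits and X264_CPU_AVX512 added, an application that wants to opt out of a particular extension can still simply mask param.cpu after filling in the defaults. A minimal example using only public API calls; the choice of flag to clear is arbitrary.

    #include <x264.h>

    int main( void )
    {
        x264_param_t param;
        x264_param_default( &param );
        param.cpu &= ~X264_CPU_AVX512;   /* e.g. to sidestep AVX-512 frequency throttling */
        /* ... set up the rest of param and call x264_encoder_open() as usual ... */
        return 0;
    }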
*/ -#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */ -#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */ +#define X264_CPU_SLOW_PSHUFB (1<<24) /* such as on the Intel Atom */ +#define X264_CPU_SLOW_PALIGNR (1<<25) /* such as on the AMD Bobcat */ /* PowerPC */ #define X264_CPU_ALTIVEC 0x0000001 @@ -227,13 +226,15 @@ #define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */ #define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */ #define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */ -#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x000b /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000d /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000e /* end of list */ +#define X264_CSP_YUYV 0x0008 /* yuyv 4:2:2 packed */ +#define X264_CSP_UYVY 0x0009 /* uyvy 4:2:2 packed */ +#define X264_CSP_V210 0x000a /* 10-bit yuv 4:2:2 packed in 32 */ +#define X264_CSP_I444 0x000b /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x000c /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000d /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000e /* packed bgr 32bits */ +#define X264_CSP_RGB 0x000f /* packed rgb 24bits */ +#define X264_CSP_MAX 0x0010 /* end of list */ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ @@ -563,19 +564,19 @@ typedef struct x264_level_t { - int level_idc; - int mbps; /* max macroblock processing rate (macroblocks/sec) */ - int frame_size; /* max frame size (macroblocks) */ - int dpb; /* max decoded picture buffer (mbs) */ - int bitrate; /* max bitrate (kbit/sec) */ - int cpb; /* max vbv buffer (kbit) */ - int mv_range; /* max vertical mv component range (pixels) */ - int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */ - int slice_rate; /* ?? */ - int mincr; /* min compression ratio */ - int bipred8x8; /* limit bipred to >=8x8 */ - int direct8x8; /* limit b_direct to >=8x8 */ - int frame_only; /* forbid interlacing */ + uint8_t level_idc; + uint32_t mbps; /* max macroblock processing rate (macroblocks/sec) */ + uint32_t frame_size; /* max frame size (macroblocks) */ + uint32_t dpb; /* max decoded picture buffer (mbs) */ + uint32_t bitrate; /* max bitrate (kbit/sec) */ + uint32_t cpb; /* max vbv buffer (kbit) */ + uint16_t mv_range; /* max vertical mv component range (pixels) */ + uint8_t mvs_per_2mb; /* max mvs per 2 consecutive mbs. */ + uint8_t slice_rate; /* ?? */ + uint8_t mincr; /* min compression ratio */ + uint8_t bipred8x8; /* limit bipred to >=8x8 */ + uint8_t direct8x8; /* limit b_direct to >=8x8 */ + uint8_t frame_only; /* forbid interlacing */ } x264_level_t; /* all of the levels defined in the standard, terminated by .level_idc=0 */
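Since the level table is public and, per the comment immediately above, terminated by .level_idc == 0, callers can walk it directly with the re-typed fields. A small illustrative helper, not part of the x264 API, that picks the first level whose frame-size and MB-rate limits are sufficient.

    #include <stdint.h>
    #include <stdio.h>
    #include <x264.h>

    /* Hypothetical helper: returns the first table entry (lowest level apart from
     * the "1b" ordering quirk) whose limits cover the request, or 0 if none does. */
    static int smallest_sufficient_level( uint32_t frame_size_mbs, uint32_t mbps )
    {
        for( const x264_level_t *l = x264_levels; l->level_idc; l++ )
            if( l->frame_size >= frame_size_mbs && l->mbps >= mbps )
                return l->level_idc;
        return 0;   /* beyond even the new level 6.2 */
    }

    int main( void )
    {
        /* 3840x2160 at 60 fps: 240*135 = 32400 MBs per frame, 1944000 MBs/sec */
        printf( "level_idc = %d\n", smallest_sufficient_level( 32400, 32400*60 ) );
        return 0;   /* prints 52 */
    }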