diff -Nru x264-0.142.2389+git956c8d8/AUTHORS x264-0.142.2431+gita5831aa/AUTHORS --- x264-0.142.2389+git956c8d8/AUTHORS 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/AUTHORS 2014-07-11 01:16:23.000000000 +0000 @@ -47,8 +47,8 @@ D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes S: Sweden -N: Jason Garrett-Glaser -E: darkshikari AT gmail DOT com +N: Fiona Glaser +E: fiona AT x264 DOT com D: x86 asm, 1pass VBV, adaptive quantization, inline asm D: various speed optimizations, bugfixes S: USA diff -Nru x264-0.142.2389+git956c8d8/common/arm/asm.S x264-0.142.2431+gita5831aa/common/arm/asm.S --- x264-0.142.2389+git956c8d8/common/arm/asm.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/asm.S 2014-07-11 01:16:23.000000000 +0000 @@ -40,32 +40,38 @@ # define ELF @ #endif - .macro require8, val=1 +.macro require8, val=1 ELF .eabi_attribute 24, \val - .endm +.endm - .macro preserve8, val=1 +.macro preserve8, val=1 ELF .eabi_attribute 25, \val - .endm +.endm - .macro function name - .global EXTERN_ASM\name +.macro function name, export=1 .align 2 +.if \export == 1 + .global EXTERN_ASM\name +ELF .hidden EXTERN_ASM\name +ELF .type EXTERN_ASM\name, %function + .func EXTERN_ASM\name EXTERN_ASM\name: +.else ELF .hidden \name ELF .type \name, %function .func \name \name: - .endm +.endif +.endm - .macro movrel rd, val +.macro movrel rd, val #if HAVE_ARMV6T2 && !defined(PIC) movw \rd, #:lower16:\val movt \rd, #:upper16:\val #else ldr \rd, =\val #endif - .endm +.endm .macro movconst rd, val #if HAVE_ARMV6T2 @@ -78,6 +84,10 @@ #endif .endm +#define GLUE(a, b) a ## b +#define JOIN(a, b) GLUE(a, b) +#define X(s) JOIN(EXTERN_ASM, s) + #define FENC_STRIDE 16 #define FDEC_STRIDE 32 diff -Nru x264-0.142.2389+git956c8d8/common/arm/cpu-a.S x264-0.142.2431+gita5831aa/common/arm/cpu-a.S --- x264-0.142.2389+git956c8d8/common/arm/cpu-a.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/cpu-a.S 
2014-07-11 01:16:23.000000000 +0000 @@ -38,7 +38,7 @@ // return: 0 on success // 1 if counters were already enabled // 9 if lo-res counters were already enabled -function x264_cpu_enable_armv7_counter +function x264_cpu_enable_armv7_counter, export=0 mrc p15, 0, r2, c9, c12, 0 // read PMNC ands r0, r2, #1 andne r0, r2, #9 @@ -51,7 +51,7 @@ bx lr .endfunc -function x264_cpu_disable_armv7_counter +function x264_cpu_disable_armv7_counter, export=0 mrc p15, 0, r0, c9, c12, 0 // read PMNC bic r0, r0, #1 // disable counters mcr p15, 0, r0, c9, c12, 0 // write PMNC diff -Nru x264-0.142.2389+git956c8d8/common/arm/dct-a.S x264-0.142.2431+gita5831aa/common/arm/dct-a.S --- x264-0.142.2389+git956c8d8/common/arm/dct-a.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/dct-a.S 2014-07-11 01:16:23.000000000 +0000 @@ -131,7 +131,7 @@ bx lr .endfunc -function x264_sub8x4_dct_neon +function x264_sub8x4_dct_neon, export=0 vld1.64 {d0}, [r1,:64], r3 vld1.64 {d1}, [r2,:64], ip vsubl.u8 q8, d0, d1 @@ -283,17 +283,17 @@ function x264_sub16x16_dct8_neon push {lr} - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) sub r1, r1, #8 sub r2, r2, #8 - bl x264_sub8x8_dct8_neon + bl X(x264_sub8x8_dct8_neon) pop {lr} sub r1, r1, #FENC_STRIDE*8 - 8 sub r2, r2, #FDEC_STRIDE*8 - 8 - b x264_sub8x8_dct8_neon + b X(x264_sub8x8_dct8_neon) .endfunc @@ -338,7 +338,7 @@ bx lr .endfunc -function x264_add8x4_idct_neon +function x264_add8x4_idct_neon, export=0 vld1.64 {d0-d3}, [r1,:128]! IDCT_1D d16, d18, d20, d22, d0, d1, d2, d3 vld1.64 {d4-d7}, [r1,:128]! 
@@ -502,14 +502,14 @@ function x264_add16x16_idct8_neon mov ip, lr - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8 - bl x264_add8x8_idct8_neon + bl X(x264_add8x8_idct8_neon) sub r0, r0, #8*FDEC_STRIDE-8 mov lr, ip - b x264_add8x8_idct8_neon + b X(x264_add8x8_idct8_neon) .endfunc diff -Nru x264-0.142.2389+git956c8d8/common/arm/deblock-a.S x264-0.142.2431+gita5831aa/common/arm/deblock-a.S --- x264-0.142.2389+git956c8d8/common/arm/deblock-a.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/deblock-a.S 2014-07-11 01:16:23.000000000 +0000 @@ -304,3 +304,109 @@ bx lr .endfunc + +function x264_deblock_strength_neon + ldr ip, [sp] + vmov.i8 q8, #0 + lsl ip, ip, #8 + add r3, r3, #32 + sub ip, ip, #(1<<8)-3 + vmov.i8 q9, #0 + vdup.16 q10, ip + ldr ip, [sp, #4] + +lists: + @ load bytes ref + vld1.8 {d31}, [r1]! + add r2, r2, #16 + vld1.8 {q1}, [r1]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r1]! + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + veor q0, q0, q2 + veor q1, q1, q2 + vorr q8, q8, q0 + vorr q9, q9, q1 + + vld1.16 {q11}, [r2,:128]! @ mv + 0x10 + vld1.16 {q3}, [r2,:128]! @ mv + 0x20 + vld1.16 {q12}, [r2,:128]! @ mv + 0x30 + vld1.16 {q2}, [r2,:128]! @ mv + 0x40 + vld1.16 {q13}, [r2,:128]! @ mv + 0x50 + vext.8 q3, q3, q12, #12 + vext.8 q2, q2, q13, #12 + vabd.s16 q0, q12, q3 + vld1.16 {q3}, [r2,:128]! @ mv + 0x60 + vabd.s16 q1, q13, q2 + vld1.16 {q14}, [r2,:128]! @ mv + 0x70 + vqmovn.u16 d0, q0 + vld1.16 {q2}, [r2,:128]! @ mv + 0x80 + vld1.16 {q15}, [r2,:128]! 
@ mv + 0x90 + vqmovn.u16 d1, q1 + vext.8 q3, q3, q14, #12 + vext.8 q2, q2, q15, #12 + vabd.s16 q3, q14, q3 + vabd.s16 q2, q15, q2 + vqmovn.u16 d2, q3 + vqmovn.u16 d3, q2 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + + vabd.s16 q1, q12, q13 + vorr q8, q8, q0 + + vabd.s16 q0, q11, q12 + vabd.s16 q2, q13, q14 + vabd.s16 q3, q14, q15 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + vqmovn.u16 d2, q2 + vqmovn.u16 d3, q3 + + vqsub.u8 q0, q0, q10 + vqsub.u8 q1, q1, q10 + vqmovn.u16 d0, q0 + vqmovn.u16 d1, q1 + subs ip, ip, #1 + vorr q9, q9, q0 + beq lists + + mov ip, #-32 + @ load bytes nnz + vld1.8 {d31}, [r0]! + vld1.8 {q1}, [r0]! + vmov.i8 q0, #0 + vld1.8 {q2}, [r0] + vext.8 q3, q0, q1, #15 + vext.8 q0, q0, q2, #15 + vuzp.32 q1, q2 + vuzp.32 q3, q0 + vext.8 q1, q15, q2, #12 + + vorr q0, q0, q2 + vorr q1, q1, q2 + vmov.u8 q10, #1 + vmin.u8 q0, q0, q10 + vmin.u8 q1, q1, q10 + vmin.u8 q8, q8, q10 @ mv ? 1 : 0 + vmin.u8 q9, q9, q10 + vadd.u8 q0, q0, q0 @ nnz ? 
2 : 0 + vadd.u8 q1, q1, q1 + vmax.u8 q8, q8, q0 + vmax.u8 q9, q9, q1 + vzip.16 d16, d17 + vst1.8 {q9}, [r3,:128], ip @ bs[1] + vtrn.8 d16, d17 + vtrn.32 d16, d17 + + vst1.8 {q8}, [r3,:128] @ bs[0] + bx lr +.endfunc diff -Nru x264-0.142.2389+git956c8d8/common/arm/mc-a.S x264-0.142.2431+gita5831aa/common/arm/mc-a.S --- x264-0.142.2389+git956c8d8/common/arm/mc-a.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/mc-a.S 2014-07-11 01:16:23.000000000 +0000 @@ -88,7 +88,7 @@ .endfunc .macro MEMCPY_ALIGNED srcalign dstalign -function memcpy_aligned_\dstalign\()_\srcalign\()_neon +function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0 mov r3, r0 .if \srcalign == 8 && \dstalign == 8 sub r2, #16 @@ -181,6 +181,7 @@ AVGH 4, 2 AVGH 4, 4 AVGH 4, 8 +AVGH 4, 16 AVGH 8, 4 AVGH 8, 8 AVGH 8, 16 @@ -238,7 +239,7 @@ .endm .macro AVG_WEIGHT ext -function x264_pixel_avg_weight_w4_\ext\()_neon +function x264_pixel_avg_weight_w4_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -254,7 +255,7 @@ pop {r4-r6,pc} .endfunc -function x264_pixel_avg_weight_w8_\ext\()_neon +function x264_pixel_avg_weight_w8_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #4 @@ -278,7 +279,7 @@ pop {r4-r6,pc} .endfunc -function x264_pixel_avg_weight_w16_\ext\()_neon +function x264_pixel_avg_weight_w16_\ext\()_neon, export=0 load_weights_\ext 1: // height loop subs lr, lr, #2 @@ -303,7 +304,7 @@ AVG_WEIGHT add_sub AVG_WEIGHT sub_add -function x264_pixel_avg_w4_neon +function x264_pixel_avg_w4_neon, export=0 subs lr, lr, #2 vld1.32 {d0[]}, [r2], r3 vld1.32 {d2[]}, [r4], r5 @@ -317,7 +318,7 @@ pop {r4-r6,pc} .endfunc -function x264_pixel_avg_w8_neon +function x264_pixel_avg_w8_neon, export=0 subs lr, lr, #4 vld1.64 {d0}, [r2], r3 vld1.64 {d2}, [r4], r5 @@ -339,7 +340,7 @@ pop {r4-r6,pc} .endfunc -function x264_pixel_avg_w16_neon +function x264_pixel_avg_w16_neon, export=0 subs lr, lr, #4 vld1.64 {d0-d1}, [r2], r3 vld1.64 
{d2-d3}, [r4], r5 @@ -1464,3 +1465,148 @@ vpop {d8-d15} pop {r4-r10,pc} .endfunc + +function x264_load_deinterleave_chroma_fdec_neon + mov ip, #FDEC_STRIDE/2 +1: + vld2.8 {d0-d1}, [r1,:128], r2 + subs r3, r3, #1 + pld [r1] + vst1.8 {d0}, [r0,:64], ip + vst1.8 {d1}, [r0,:64], ip + bgt 1b + + bx lr +.endfunc + +function x264_load_deinterleave_chroma_fenc_neon + mov ip, #FENC_STRIDE/2 +1: + vld2.8 {d0-d1}, [r1,:128], r2 + subs r3, r3, #1 + pld [r1] + vst1.8 {d0}, [r0,:64], ip + vst1.8 {d1}, [r0,:64], ip + bgt 1b + + bx lr +.endfunc + +function x264_plane_copy_deinterleave_neon + push {r4-r7, lr} + ldrd r6, r7, [sp, #28] + ldrd r4, r5, [sp, #20] + add lr, r6, #15 + bic lr, lr, #15 + sub r1, r1, lr + sub r3, r3, lr + sub r5, r5, lr, lsl #1 +block: + vld2.8 {d0-d3}, [r4,:128]! + subs lr, lr, #16 + vst1.8 {q0}, [r0]! + vst1.8 {q1}, [r2]! + bgt block + + add r4, r4, r5 + subs r7, r7, #1 + add r0, r0, r1 + add r2, r2, r3 + mov lr, r6 + bgt block + + pop {r4-r7, pc} +.endfunc + +function x264_plane_copy_deinterleave_rgb_neon + push {r4-r8, r10, r11, lr} + ldrd r4, r5, [sp, #32] + ldrd r6, r7, [sp, #40] + ldr r8, [sp, #48] + ldrd r10, r11, [sp, #52] + add lr, r10, #7 + subs r8, r8, #3 + bic lr, lr, #7 + sub r7, r7, lr, lsl #1 + sub r1, r1, lr + sub r3, r3, lr + sub r5, r5, lr + subne r7, r7, lr, lsl #1 + subeq r7, r7, lr + bne block4 +block3: + vld3.8 {d0,d1,d2}, [r6]! + subs lr, lr, #8 + vst1.8 {d0}, [r0]! + vst1.8 {d1}, [r2]! + vst1.8 {d2}, [r4]! + bgt block3 + + subs r11, r11, #1 + add r0, r0, r1 + add r2, r2, r3 + add r4, r4, r5 + add r6, r6, r7 + mov lr, r10 + bgt block3 + + pop {r4-r8, r10, r11, pc} +block4: + vld4.8 {d0,d1,d2,d3}, [r6]! + subs lr, lr, #8 + vst1.8 {d0}, [r0]! + vst1.8 {d1}, [r2]! + vst1.8 {d2}, [r4]! 
+ bgt block4 + + subs r11, r11, #1 + add r0, r0, r1 + add r2, r2, r3 + add r4, r4, r5 + add r6, r6, r7 + mov lr, r10 + bgt block4 + + pop {r4-r8, r10, r11, pc} +.endfunc + +function x264_plane_copy_interleave_neon + push {r4-r7, lr} + ldrd r6, r7, [sp, #28] + ldrd r4, r5, [sp, #20] + add lr, r6, #15 + bic lr, lr, #15 + sub r1, r1, lr, lsl #1 + sub r3, r3, lr + sub r5, r5, lr +blocki: + vld1.8 {q0}, [r2]! + vld1.8 {q1}, [r4]! + subs lr, lr, #16 + vst2.8 {d0,d2}, [r0]! + vst2.8 {d1,d3}, [r0]! + bgt blocki + + subs r7, r7, #1 + add r0, r0, r1 + add r2, r2, r3 + add r4, r4, r5 + mov lr, r6 + bgt blocki + + pop {r4-r7, pc} +.endfunc + +function x264_store_interleave_chroma_neon + push {lr} + ldr lr, [sp, #4] + mov ip, #FDEC_STRIDE +1: + vld1.8 {d0}, [r2], ip + vld1.8 {d1}, [r3], ip + subs lr, lr, #1 + vst2.8 {d0,d1}, [r0,:128], r1 + bgt 1b + + pop {pc} +.endfunc diff -Nru x264-0.142.2389+git956c8d8/common/arm/mc-c.c x264-0.142.2431+gita5831aa/common/arm/mc-c.c --- x264-0.142.2389+git956c8d8/common/arm/mc-c.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/mc-c.c 2014-07-11 01:16:23.000000000 +0000 @@ -37,6 +37,7 @@ void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); +void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int ); @@ -46,13 +47,28 @@ void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); void 
x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ); +void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu, + pixel *dstv, intptr_t i_dstv, + pixel *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, + pixel *dstc, intptr_t i_dstc, + pixel *src, intptr_t i_src, int pw, int w, int h ); +void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst, + pixel *srcu, intptr_t i_srcu, + pixel *srcv, intptr_t i_srcv, int w, int h ); + +void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); +void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); +void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height ); + #define MC_WEIGHT(func)\ void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\ \ -static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\ +static weight_fn_t x264_mc##func##_wtab_neon[6] =\ {\ x264_mc_weight_w4##func##_neon,\ x264_mc_weight_w4##func##_neon,\ @@ -72,7 +88,7 @@ void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int ); -void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); +void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int ); void 
x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int ); void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int ); @@ -224,11 +240,20 @@ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon; + pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon; + pf->plane_copy_interleave = x264_plane_copy_interleave_neon; + + pf->store_interleave_chroma = x264_store_interleave_chroma_neon; + pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon; + pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon; + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon; pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon; pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon; pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon; pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon; + pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon; pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon; diff -Nru x264-0.142.2389+git956c8d8/common/arm/pixel-a.S x264-0.142.2431+gita5831aa/common/arm/pixel-a.S --- x264-0.142.2389+git956c8d8/common/arm/pixel-a.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/pixel-a.S 2014-07-11 01:16:23.000000000 +0000 @@ -519,6 +519,38 @@ b x264_var_end .endfunc +function x264_pixel_var_8x16_neon + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d18}, [r0,:64], r1 + vmull.u8 q1, d16, d16 + vmovl.u8 q0, d16 + vld1.64 {d20}, [r0,:64], r1 + vmull.u8 q2, d18, d18 + vaddw.u8 q0, q0, d18 + + mov ip, #12 + + vld1.64 {d22}, [r0,:64], r1 + VAR_SQR_SUM q1, q1, q14, d20, vpaddl.u16 + vld1.64 {d16}, [r0,:64], r1 + VAR_SQR_SUM q2, q2, q15, d22, vpaddl.u16 + +1: subs ip, ip, #4 + vld1.64 {d18}, [r0,:64], r1 + VAR_SQR_SUM q1, q14, q12, d16 + vld1.64 {d20}, [r0,:64], r1 + 
VAR_SQR_SUM q2, q15, q13, d18 + vld1.64 {d22}, [r0,:64], r1 + VAR_SQR_SUM q1, q12, q14, d20 + beq 2f + vld1.64 {d16}, [r0,:64], r1 + VAR_SQR_SUM q2, q13, q15, d22 + b 1b +2: + VAR_SQR_SUM q2, q13, q15, d22 + b x264_var_end +.endfunc + function x264_pixel_var_16x16_neon vld1.64 {d16-d17}, [r0,:128], r1 vmull.u8 q12, d16, d16 @@ -543,7 +575,7 @@ bgt var16_loop .endfunc -function x264_var_end +function x264_var_end, export=0 vpaddl.u16 q8, q14 vpaddl.u16 q9, q15 vadd.u32 q1, q1, q8 @@ -603,6 +635,49 @@ bx lr .endfunc +function x264_pixel_var2_8x16_neon + vld1.64 {d16}, [r0,:64], r1 + vld1.64 {d17}, [r2,:64], r3 + vld1.64 {d18}, [r0,:64], r1 + vld1.64 {d19}, [r2,:64], r3 + vsubl.u8 q10, d16, d17 + vsubl.u8 q11, d18, d19 + SQR_ACC q1, d20, d21, vmull.s16 + vld1.64 {d16}, [r0,:64], r1 + vadd.s16 q0, q10, q11 + vld1.64 {d17}, [r2,:64], r3 + SQR_ACC q2, d22, d23, vmull.s16 + mov ip, #14 +1: subs ip, ip, #2 + vld1.64 {d18}, [r0,:64], r1 + vsubl.u8 q10, d16, d17 + vld1.64 {d19}, [r2,:64], r3 + vadd.s16 q0, q0, q10 + SQR_ACC q1, d20, d21 + vsubl.u8 q11, d18, d19 + beq 2f + vld1.64 {d16}, [r0,:64], r1 + vadd.s16 q0, q0, q11 + vld1.64 {d17}, [r2,:64], r3 + SQR_ACC q2, d22, d23 + b 1b +2: + vadd.s16 q0, q0, q11 + SQR_ACC q2, d22, d23 + + ldr ip, [sp] + vadd.s16 d0, d0, d1 + vadd.s32 q1, q1, q2 + vpaddl.s16 d0, d0 + vadd.s32 d1, d2, d3 + vpadd.s32 d0, d0, d1 + + vmov r0, r1, d0 + vst1.32 {d0[1]}, [ip,:32] + mul r0, r0, r0 + sub r0, r1, r0, lsr #7 + bx lr +.endfunc .macro LOAD_DIFF_8x4 q0 q1 q2 q3 vld1.32 {d1}, [r2], r3 @@ -685,7 +760,7 @@ SUMSUB_AB q10, q11, q2, q3 .endfunc -function x264_satd_4x8_8x4_end_neon +function x264_satd_4x8_8x4_end_neon, export=0 vadd.s16 q0, q8, q10 vadd.s16 q1, q9, q11 vsub.s16 q2, q8, q10 @@ -748,7 +823,7 @@ bx lr .endfunc -function x264_satd_8x8_neon +function x264_satd_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -769,7 +844,7 @@ .endfunc // one vertical hadamard pass and two horizontal 
-function x264_satd_8x4v_8x8h_neon +function x264_satd_8x4v_8x8h_neon, export=0 SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15 vtrn.16 q8, q9 SUMSUB_AB q12, q14, q0, q2 @@ -853,7 +928,7 @@ bx lr .endfunc -function x264_satd_16x4_neon +function x264_satd_16x4_neon, export=0 vld1.64 {d2-d3}, [r2], r3 vld1.64 {d0-d1}, [r0,:128], r1 vsubl.u8 q8, d0, d2 @@ -927,7 +1002,7 @@ SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4 .endm -function x264_sa8d_8x8_neon +function x264_sa8d_8x8_neon, export=0 LOAD_DIFF_8x4 q8, q9, q10, q11 vld1.64 {d7}, [r2], r3 SUMSUB_AB q0, q1, q8, q9 @@ -1028,7 +1103,7 @@ HADAMARD_AC 16, 16 // q4: satd q5: sa8d q6: mask_ac4 q7: mask_ac8 -function x264_hadamard_ac_8x8_neon +function x264_hadamard_ac_8x8_neon, export=0 vld1.64 {d2}, [r0,:64], r1 vld1.64 {d3}, [r0,:64], r1 vaddl.u8 q0, d2, d3 diff -Nru x264-0.142.2389+git956c8d8/common/arm/pixel.h x264-0.142.2431+gita5831aa/common/arm/pixel.h --- x264-0.142.2389+git956c8d8/common/arm/pixel.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/pixel.h 2014-07-11 01:16:23.000000000 +0000 @@ -56,8 +56,10 @@ int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); +uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); -int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); diff -Nru x264-0.142.2389+git956c8d8/common/arm/predict-a.S x264-0.142.2431+gita5831aa/common/arm/predict-a.S --- x264-0.142.2389+git956c8d8/common/arm/predict-a.S 2014-02-13 23:26:07.000000000 +0000 +++ 
x264-0.142.2431+gita5831aa/common/arm/predict-a.S 2014-07-11 01:16:23.000000000 +0000 @@ -79,6 +79,15 @@ bx lr .endfunc +function x264_predict_4x4_v_armv6 + ldr r1, [r0, #0 - 1 * FDEC_STRIDE] + str r1, [r0, #0 + 0 * FDEC_STRIDE] + str r1, [r0, #0 + 1 * FDEC_STRIDE] + str r1, [r0, #0 + 2 * FDEC_STRIDE] + str r1, [r0, #0 + 3 * FDEC_STRIDE] + bx lr +.endfunc + function x264_predict_4x4_dc_armv6 mov ip, #0 ldr r1, [r0, #-FDEC_STRIDE] diff -Nru x264-0.142.2389+git956c8d8/common/arm/predict-c.c x264-0.142.2431+gita5831aa/common/arm/predict-c.c --- x264-0.142.2389+git956c8d8/common/arm/predict-c.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/predict-c.c 2014-07-11 01:16:23.000000000 +0000 @@ -27,36 +27,6 @@ #include "predict.h" #include "pixel.h" -void x264_predict_4x4_dc_armv6( uint8_t *src ); -void x264_predict_4x4_dc_top_neon( uint8_t *src ); -void x264_predict_4x4_h_armv6( uint8_t *src ); -void x264_predict_4x4_ddr_armv6( uint8_t *src ); -void x264_predict_4x4_ddl_neon( uint8_t *src ); - -void x264_predict_8x8c_dc_neon( uint8_t *src ); -void x264_predict_8x8c_dc_top_neon( uint8_t *src ); -void x264_predict_8x8c_dc_left_neon( uint8_t *src ); -void x264_predict_8x8c_h_neon( uint8_t *src ); -void x264_predict_8x8c_v_neon( uint8_t *src ); -void x264_predict_8x8c_p_neon( uint8_t *src ); - -void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); -void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); - -void x264_predict_16x16_dc_neon( uint8_t *src ); -void 
x264_predict_16x16_dc_top_neon( uint8_t *src ); -void x264_predict_16x16_dc_left_neon( uint8_t *src ); -void x264_predict_16x16_h_neon( uint8_t *src ); -void x264_predict_16x16_v_neon( uint8_t *src ); -void x264_predict_16x16_p_neon( uint8_t *src ); - void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ) { if (!(cpu&X264_CPU_ARMV6)) @@ -64,6 +34,7 @@ #if !HIGH_BIT_DEPTH pf[I_PRED_4x4_H] = x264_predict_4x4_h_armv6; + pf[I_PRED_4x4_V] = x264_predict_4x4_v_armv6; pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_armv6; pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6; diff -Nru x264-0.142.2389+git956c8d8/common/arm/predict.h x264-0.142.2431+gita5831aa/common/arm/predict.h --- x264-0.142.2389+git956c8d8/common/arm/predict.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/predict.h 2014-07-11 01:16:23.000000000 +0000 @@ -26,6 +26,37 @@ #ifndef X264_ARM_PREDICT_H #define X264_ARM_PREDICT_H +void x264_predict_4x4_dc_armv6( uint8_t *src ); +void x264_predict_4x4_dc_top_neon( uint8_t *src ); +void x264_predict_4x4_v_armv6( uint8_t *src ); +void x264_predict_4x4_h_armv6( uint8_t *src ); +void x264_predict_4x4_ddr_armv6( uint8_t *src ); +void x264_predict_4x4_ddl_neon( uint8_t *src ); + +void x264_predict_8x8c_dc_neon( uint8_t *src ); +void x264_predict_8x8c_dc_top_neon( uint8_t *src ); +void x264_predict_8x8c_dc_left_neon( uint8_t *src ); +void x264_predict_8x8c_h_neon( uint8_t *src ); +void x264_predict_8x8c_v_neon( uint8_t *src ); +void x264_predict_8x8c_p_neon( uint8_t *src ); + +void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] ); +void 
x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] ); +void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] ); + +void x264_predict_16x16_dc_neon( uint8_t *src ); +void x264_predict_16x16_dc_top_neon( uint8_t *src ); +void x264_predict_16x16_dc_left_neon( uint8_t *src ); +void x264_predict_16x16_h_neon( uint8_t *src ); +void x264_predict_16x16_v_neon( uint8_t *src ); +void x264_predict_16x16_p_neon( uint8_t *src ); + void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] ); void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter ); void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] ); diff -Nru x264-0.142.2389+git956c8d8/common/arm/quant-a.S x264-0.142.2431+gita5831aa/common/arm/quant-a.S --- x264-0.142.2389+git956c8d8/common/arm/quant-a.S 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/quant-a.S 2014-07-11 01:16:23.000000000 +0000 @@ -321,6 +321,20 @@ bx lr .endfunc +function x264_coeff_last8_arm + ldrd r2, r3, [r0, #8] + orrs ip, r2, r3 + movne r0, #4 + ldrdeq r2, r3, [r0] + moveq r0, #0 + tst r3, r3 + addne r0, #2 + movne r2, r3 + lsrs r2, r2, #16 + addne r0, r0, #1 + bx lr +.endfunc + .macro COEFF_LAST_1x size function x264_coeff_last\size\()_neon .if \size == 15 diff -Nru x264-0.142.2389+git956c8d8/common/arm/quant.h x264-0.142.2431+gita5831aa/common/arm/quant.h --- x264-0.142.2389+git956c8d8/common/arm/quant.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/arm/quant.h 2014-07-11 01:16:23.000000000 +0000 @@ -39,6 +39,7 @@ void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp ); int x264_coeff_last4_arm( int16_t * ); +int x264_coeff_last8_arm( int16_t * ); int x264_coeff_last15_neon( int16_t * ); int x264_coeff_last16_neon( int16_t * ); int x264_coeff_last64_neon( int16_t * ); diff -Nru x264-0.142.2389+git956c8d8/common/bitstream.c x264-0.142.2431+gita5831aa/common/bitstream.c --- 
x264-0.142.2389+git956c8d8/common/bitstream.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/bitstream.c 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/bitstream.h x264-0.142.2431+gita5831aa/common/bitstream.h --- x264-0.142.2389+git956c8d8/common/bitstream.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/bitstream.h 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Laurent Aimar * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/cabac.c x264-0.142.2431+gita5831aa/common/cabac.c --- x264-0.142.2389+git956c8d8/common/cabac.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/cabac.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/common.h x264-0.142.2431+gita5831aa/common/common.h --- x264-0.142.2389+git956c8d8/common/common.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/common.h 2014-07-11 01:16:23.000000000 +0000 @@ -552,15 +552,15 @@ int (*dequant4_mf[4])[16]; /* [4][6][16] */ int (*dequant8_mf[4])[64]; /* [4][6][64] */ /* quantization matrix for trellis, [cqm][qp][coef] */ - int (*unquant4_mf[4])[16]; /* [4][QP_MAX_SPEC][16] */ - int (*unquant8_mf[4])[64]; /* [4][QP_MAX_SPEC][64] */ + int (*unquant4_mf[4])[16]; /* 
[4][QP_MAX_SPEC+1][16] */ + int (*unquant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ /* quantization matrix for deadzone */ - udctcoef (*quant4_mf[4])[16]; /* [4][QP_MAX_SPEC][16] */ - udctcoef (*quant8_mf[4])[64]; /* [4][QP_MAX_SPEC][64] */ - udctcoef (*quant4_bias[4])[16]; /* [4][QP_MAX_SPEC][16] */ - udctcoef (*quant8_bias[4])[64]; /* [4][QP_MAX_SPEC][64] */ - udctcoef (*quant4_bias0[4])[16]; /* [4][QP_MAX_SPEC][16] */ - udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC][64] */ + udctcoef (*quant4_mf[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ + udctcoef (*quant8_mf[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ + udctcoef (*quant4_bias[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ + udctcoef (*quant8_bias[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ + udctcoef (*quant4_bias0[4])[16]; /* [4][QP_MAX_SPEC+1][16] */ + udctcoef (*quant8_bias0[4])[64]; /* [4][QP_MAX_SPEC+1][64] */ udctcoef (*nr_offset_emergency)[4][64]; /* mv/ref cost arrays. */ diff -Nru x264-0.142.2389+git956c8d8/common/cpu.c x264-0.142.2431+gita5831aa/common/cpu.c --- x264-0.142.2389+git956c8d8/common/cpu.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/cpu.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -304,7 +304,7 @@ x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" ); } -#if BROKEN_STACK_ALIGNMENT +#if STACK_ALIGNMENT < 16 cpu |= X264_CPU_STACK_MOD4; #endif @@ -338,6 +338,9 @@ uint32_t x264_cpu_detect( void ) { +#ifdef __NO_FPRS__ + return 0; +#else static void (*oldsig)( int ); oldsig = signal( SIGILL, sigill_handler ); @@ -357,6 +360,7 @@ signal( SIGILL, oldsig ); return X264_CPU_ALTIVEC; +#endif } #endif @@ -426,6 +430,10 @@ return sysconf( _SC_NPROCESSORS_ONLN ); #elif SYS_LINUX +#ifdef __ANDROID__ + // Android NDK does not expose 
sched_getaffinity + return sysconf( _SC_NPROCESSORS_CONF ); +#else cpu_set_t p_aff; memset( &p_aff, 0, sizeof(p_aff) ); if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) ) @@ -438,6 +446,7 @@ np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1; return np; #endif +#endif #elif SYS_BEOS system_info info; diff -Nru x264-0.142.2389+git956c8d8/common/cpu.h x264-0.142.2431+gita5831aa/common/cpu.h --- x264-0.142.2389+git956c8d8/common/cpu.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/cpu.h 2014-07-11 01:16:23.000000000 +0000 @@ -57,8 +57,8 @@ * alignment between functions (osdep.h handles manual alignment of arrays * if it doesn't). */ -#if (ARCH_X86 || HAVE_32B_STACK_ALIGNMENT) && HAVE_MMX -int x264_stack_align( void (*func)(), ... ); +#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX +intptr_t x264_stack_align( void (*func)(), ... ); #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__) #else #define x264_stack_align(func,...) func(__VA_ARGS__) diff -Nru x264-0.142.2389+git956c8d8/common/deblock.c x264-0.142.2431+gita5831aa/common/deblock.c --- x264-0.142.2389+git956c8d8/common/deblock.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/deblock.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify @@ -734,6 +734,9 @@ void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ); +void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); #endif void 
x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) @@ -842,6 +845,7 @@ pf->deblock_luma[0] = x264_deblock_h_luma_neon; pf->deblock_chroma[1] = x264_deblock_v_chroma_neon; pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon; + pf->deblock_strength = x264_deblock_strength_neon; } #endif #endif // !HIGH_BIT_DEPTH diff -Nru x264-0.142.2389+git956c8d8/common/frame.c x264-0.142.2431+gita5831aa/common/frame.c --- x264-0.142.2389+git956c8d8/common/frame.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/frame.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/frame.h x264-0.142.2431+gita5831aa/common/frame.h --- x264-0.142.2389+git956c8d8/common/frame.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/frame.h 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/macroblock.c x264-0.142.2431+gita5831aa/common/macroblock.c --- x264-0.142.2389+git956c8d8/common/macroblock.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/macroblock.c 2014-07-11 01:16:23.000000000 +0000 @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2003-2014 x264 project * - * Authors: Jason Garrett-Glaser + * Authors: Fiona Glaser * Laurent Aimar * Loren Merritt * Henrik Gramner @@ -389,7 +389,7 @@ ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( 
buf_hpel, buf_ssim, buf_tesa ); } - int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int); + int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t); scratch_size = X264_MAX( scratch_size, buf_mbtree ); if( scratch_size ) CHECKED_MALLOC( h->scratch_buffer, scratch_size ); @@ -397,7 +397,9 @@ h->scratch_buffer = NULL; int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2; - CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads ); + int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */ + scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 ); + CHECKED_MALLOC( h->scratch_buffer2, scratch_size ); return 0; fail: @@ -1253,8 +1255,13 @@ } } - if( b_mbaff && mb_x == 0 && !(mb_y&1) && mb_y > 0 ) - h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_xy - h->mb.i_mb_stride]; + if( b_mbaff && mb_x == 0 && !(mb_y&1) ) + { + if( h->mb.i_mb_top_xy >= h->sh.i_first_mb ) + h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_top_xy]; + else + h->mb.field_decoding_flag = 0; + } /* Check whether skip here would cause decoder to predict interlace mode incorrectly. * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. 
*/ @@ -1262,26 +1269,8 @@ if( b_mbaff ) { if( MB_INTERLACED != h->mb.field_decoding_flag && - h->mb.i_mb_prev_xy >= 0 && IS_SKIP(h->mb.type[h->mb.i_mb_prev_xy]) ) + (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) ) h->mb.b_allow_skip = 0; - if( (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) ) - { - if( h->mb.i_neighbour & MB_LEFT ) - { - if( h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED ) - h->mb.b_allow_skip = 0; - } - else if( h->mb.i_neighbour & MB_TOP ) - { - if( h->mb.field[h->mb.i_mb_top_xy] != MB_INTERLACED ) - h->mb.b_allow_skip = 0; - } - else // Frame mb pair is predicted - { - if( MB_INTERLACED ) - h->mb.b_allow_skip = 0; - } - } } if( h->param.b_cabac ) diff -Nru x264-0.142.2389+git956c8d8/common/macroblock.h x264-0.142.2431+gita5831aa/common/macroblock.h --- x264-0.142.2389+git956c8d8/common/macroblock.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/macroblock.h 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/mc.c x264-0.142.2431+gita5831aa/common/mc.c --- x264-0.142.2389+git956c8d8/common/mc.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/mc.c 2014-07-11 01:16:23.000000000 +0000 @@ -483,20 +483,97 @@ /* Estimate the total amount of influence on future quality that could be had if we * were to improve the reference samples used to inter predict any given macroblock. 
*/ -static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) { - float fps = *fps_factor / 256.f; + float fps = *fps_factor; for( int i = 0; i < len; i++ ) { - float intra_cost = intra_costs[i] * inv_qscales[i]; - float propagate_amount = propagate_in[i] + intra_cost*fps; - float propagate_num = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK); - float propagate_denom = intra_costs[i]; - dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f); + int intra_cost = intra_costs[i]; + int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK); + float propagate_intra = intra_cost * inv_qscales[i]; + float propagate_amount = propagate_in[i] + propagate_intra*fps; + float propagate_num = intra_cost - inter_cost; + float propagate_denom = intra_cost; + dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767); } } +static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], + int16_t *propagate_amount, uint16_t *lowres_costs, + int bipred_weight, int mb_y, int len, int list ) +{ + unsigned stride = h->mb.i_mb_stride; + unsigned width = h->mb.i_mb_width; + unsigned height = h->mb.i_mb_height; + + for( unsigned i = 0; i < len; i++ ) + { +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) + int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT; + + if( !(lists_used & (1 << list)) ) + continue; + + int listamount = propagate_amount[i]; + /* Apply bipred weighting. */ + if( lists_used == 3 ) + listamount = (listamount * bipred_weight + 32) >> 6; + + /* Early termination for simple case of mv0. 
*/ + if( !M32( mvs[i] ) ) + { + CLIP_ADD( ref_costs[mb_y*stride + i], listamount ); + continue; + } + + int x = mvs[i][0]; + int y = mvs[i][1]; + unsigned mbx = (x>>5)+i; + unsigned mby = (y>>5)+mb_y; + unsigned idx0 = mbx + mby * stride; + unsigned idx2 = idx0 + stride; + x &= 31; + y &= 31; + int idx0weight = (32-y)*(32-x); + int idx1weight = (32-y)*x; + int idx2weight = y*(32-x); + int idx3weight = y*x; + idx0weight = (idx0weight * listamount + 512) >> 10; + idx1weight = (idx1weight * listamount + 512) >> 10; + idx2weight = (idx2weight * listamount + 512) >> 10; + idx3weight = (idx3weight * listamount + 512) >> 10; + + if( mbx < width-1 && mby < height-1 ) + { + CLIP_ADD( ref_costs[idx0+0], idx0weight ); + CLIP_ADD( ref_costs[idx0+1], idx1weight ); + CLIP_ADD( ref_costs[idx2+0], idx2weight ); + CLIP_ADD( ref_costs[idx2+1], idx3weight ); + } + else + { + /* Note: this takes advantage of unsigned representation to + * catch negative mbx/mby. */ + if( mby < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx0+0], idx0weight ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx0+1], idx1weight ); + } + if( mby+1 < height ) + { + if( mbx < width ) + CLIP_ADD( ref_costs[idx2+0], idx2weight ); + if( mbx+1 < width ) + CLIP_ADD( ref_costs[idx2+1], idx3weight ); + } + } + } +#undef CLIP_ADD +} + void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent ) { pf->mc_luma = mc_luma; @@ -552,6 +629,7 @@ pf->integral_init8v = integral_init8v; pf->mbtree_propagate_cost = mbtree_propagate_cost; + pf->mbtree_propagate_list = mbtree_propagate_list; #if HAVE_MMX x264_mc_init_mmx( cpu, pf ); @@ -565,7 +643,10 @@ #endif if( cpu_independent ) + { pf->mbtree_propagate_cost = mbtree_propagate_cost; + pf->mbtree_propagate_list = mbtree_propagate_list; + } } void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end ) diff -Nru x264-0.142.2389+git956c8d8/common/mc.h x264-0.142.2431+gita5831aa/common/mc.h --- x264-0.142.2389+git956c8d8/common/mc.h 
2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/mc.h 2014-07-11 01:16:23.000000000 +0000 @@ -122,8 +122,12 @@ weight_fn_t *offsetsub; void (*weight_cache)( x264_t *, x264_weight_t * ); - void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, + void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); + + void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], + int16_t *propagate_amount, uint16_t *lowres_costs, + int bipred_weight, int mb_y, int len, int list ); } x264_mc_functions_t; void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent ); diff -Nru x264-0.142.2389+git956c8d8/common/mvpred.c x264-0.142.2431+gita5831aa/common/mvpred.c --- x264-0.142.2389+git956c8d8/common/mvpred.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/mvpred.c 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Laurent Aimar * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/osdep.h x264-0.142.2431+gita5831aa/common/osdep.h --- x264-0.142.2389+git956c8d8/common/osdep.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/osdep.h 2014-07-11 01:16:23.000000000 +0000 @@ -126,7 +126,7 @@ #define EXPAND(x) x -#if HAVE_32B_STACK_ALIGNMENT +#if STACK_ALIGNMENT >= 32 #define ALIGNED_ARRAY_32( type, name, sub1, ... 
)\ ALIGNED_32( type name sub1 __VA_ARGS__ ) #else diff -Nru x264-0.142.2389+git956c8d8/common/pixel.c x264-0.142.2431+gita5831aa/common/pixel.c --- x264-0.142.2389+git956c8d8/common/pixel.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/pixel.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,6 +36,7 @@ #endif #if ARCH_ARM # include "arm/pixel.h" +# include "arm/predict.h" #endif #if ARCH_UltraSPARC # include "sparc/pixel.h" @@ -532,6 +533,10 @@ INTRA_MBCMP_8x8( sad, _mmx2, _c ) INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 ) #endif +#if !HIGH_BIT_DEPTH && HAVE_ARMV6 +INTRA_MBCMP_8x8( sad, _neon, _neon ) +INTRA_MBCMP_8x8(sa8d, _neon, _neon ) +#endif #define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\ void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\ @@ -587,6 +592,16 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 ) #endif #endif +#if !HIGH_BIT_DEPTH && HAVE_ARMV6 +INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _armv6 ) +INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _armv6 ) +INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon ) +INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c ) +INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon ) +INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon ) +#endif // No C implementation of intra_satd_x9. 
See checkasm for its behavior, // or see x264_mb_analyse_intra for the entirely different algorithm we @@ -1006,8 +1021,16 @@ } if( cpu&X264_CPU_XOP ) { + INIT5( sad_x3, _xop ); + INIT5( sad_x4, _xop ); + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; +#if ARCH_X86_64 + pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; +#endif } if( cpu&X264_CPU_AVX2 ) { @@ -1293,6 +1316,7 @@ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop; + pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; @@ -1347,8 +1371,21 @@ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon; pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon; + + pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon; + pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon; + pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon; + pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_neon; + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon; + pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_neon; + pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_neon; + pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon; + pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon; + pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon; pixf->ssim_end4 = x264_pixel_ssim_end4_neon; diff 
-Nru x264-0.142.2389+git956c8d8/common/pixel.h x264-0.142.2431+gita5831aa/common/pixel.h --- x264-0.142.2389+git956c8d8/common/pixel.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/pixel.h 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2004-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser Henrik Gramner * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/predict.c x264-0.142.2431+gita5831aa/common/predict.c --- x264-0.142.2389+git956c8d8/common/predict.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/predict.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/quant.c x264-0.142.2431+gita5831aa/common/quant.c --- x264-0.142.2389+git956c8d8/common/quant.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/quant.c 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Christian Heine * Henrik Gramner * @@ -725,7 +725,10 @@ #if HAVE_ARMV6 if( cpu&X264_CPU_ARMV6 ) + { pf->coeff_last4 = x264_coeff_last4_arm; + pf->coeff_last8 = x264_coeff_last8_arm; + } if( cpu&X264_CPU_NEON ) { diff -Nru x264-0.142.2389+git956c8d8/common/quant.h x264-0.142.2431+gita5831aa/common/quant.h --- x264-0.142.2389+git956c8d8/common/quant.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/quant.h 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of 
the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/rectangle.c x264-0.142.2431+gita5831aa/common/rectangle.c --- x264-0.142.2389+git956c8d8/common/rectangle.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/rectangle.c 2014-07-11 01:16:23.000000000 +0000 @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2010-2014 x264 project * - * Authors: Jason Garrett-Glaser + * Authors: Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/rectangle.h x264-0.142.2431+gita5831aa/common/rectangle.h --- x264-0.142.2389+git956c8d8/common/rectangle.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/rectangle.h 2014-07-11 01:16:23.000000000 +0000 @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2003-2014 x264 project * - * Authors: Jason Garrett-Glaser + * Authors: Fiona Glaser * Loren Merritt * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/set.c x264-0.142.2431+gita5831aa/common/set.c --- x264-0.142.2389+git956c8d8/common/set.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/set.c 2014-07-11 01:16:23.000000000 +0000 @@ -105,9 +105,9 @@ }\ else\ {\ - CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h-> quant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\ CHECKED_MALLOC( h->dequant##w##_mf[i], 6*size*sizeof(int) );\ - CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX+1)*size*sizeof(int) );\ + CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(int) );\ }\ for( j = 0; j < i; j++ )\ if( deadzone[j] == deadzone[i] &&\ @@ -120,8 +120,8 @@ }\ else\ {\ - CHECKED_MALLOC( 
h->quant##w##_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ - CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\ + CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\ }\ } diff -Nru x264-0.142.2389+git956c8d8/common/vlc.c x264-0.142.2431+gita5831aa/common/vlc.c --- x264-0.142.2389+git956c8d8/common/vlc.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/vlc.c 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2003-2014 x264 project * * Authors: Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/x86/bitstream-a.asm x264-0.142.2431+gita5831aa/common/x86/bitstream-a.asm --- x264-0.142.2389+git956c8d8/common/x86/bitstream-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/bitstream-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -3,7 +3,7 @@ ;***************************************************************************** ;* Copyright (C) 2010-2014 x264 project ;* -;* Authors: Jason Garrett-Glaser +;* Authors: Fiona Glaser ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/x86/cabac-a.asm x264-0.142.2431+gita5831aa/common/x86/cabac-a.asm --- x264-0.142.2389+git956c8d8/common/x86/cabac-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/cabac-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2008-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Holger Lubitz ;* ;* This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/x86/const-a.asm x264-0.142.2431+gita5831aa/common/x86/const-a.asm --- 
x264-0.142.2389+git956c8d8/common/x86/const-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/const-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2010-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -36,6 +36,7 @@ const pw_512, times 16 dw 512 const pw_00ff, times 16 dw 0x00ff const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1) +const pw_0to15, dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 const pd_1, times 8 dd 1 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3 diff -Nru x264-0.142.2389+git956c8d8/common/x86/cpu-a.asm x264-0.142.2431+gita5831aa/common/x86/cpu-a.asm --- x264-0.142.2389+git956c8d8/common/x86/cpu-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/cpu-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ ;* ;* Authors: Laurent Aimar ;* Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/x86/dct-a.asm x264-0.142.2431+gita5831aa/common/x86/dct-a.asm --- x264-0.142.2389+git956c8d8/common/x86/dct-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/dct-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -7,7 +7,7 @@ ;* Loren Merritt ;* Laurent Aimar ;* Min Chen -;* Jason Garrett-Glaser +;* Fiona Glaser ;* ;* This program is free software; you can redistribute it and/or modify ;* it under the terms of the GNU General Public License as published by @@ -675,7 +675,7 @@ mova m6, [pw_pixel_max] mova m7, [pd_32] pxor m5, m5 -.loop +.loop: mova m3, [r1] paddd m3, m7 psrad m3, 6 ; dc0 0 dc1 0 dc2 0 dc3 0 diff 
-Nru x264-0.142.2389+git956c8d8/common/x86/dct.h x264-0.142.2431+gita5831aa/common/x86/dct.h --- x264-0.142.2389+git956c8d8/common/x86/dct.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/dct.h 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/x86/deblock-a.asm x264-0.142.2431+gita5831aa/common/x86/deblock-a.asm --- x264-0.142.2389+git956c8d8/common/x86/deblock-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/deblock-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2005-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify @@ -621,7 +621,7 @@ mov r6, 2 mova m0, [pw_2] LOAD_AB aa, bb, r2d, r3d -.loop +.loop: mova p2, [r4+r1] mova p1, [r4+2*r1] mova p0, [r4+r5] @@ -671,7 +671,7 @@ add r4, r0 ; pix+4*stride mov r6, 2 mova m0, [pw_2] -.loop +.loop: movu q3, [r0-8] movu q2, [r0+r1-8] movu q1, [r0+r1*2-8] @@ -804,35 +804,6 @@ %define PASS8ROWS(base, base3, stride, stride3, offset) \ PASS8ROWS(base+offset, base3+offset, stride, stride3) -; in: 8 rows of 4 bytes in %4..%11 -; out: 4 rows of 8 bytes in m0..m3 -%macro TRANSPOSE4x8_LOAD 11 - movh m0, %4 - movh m2, %5 - movh m1, %6 - movh m3, %7 - punpckl%1 m0, m2 - punpckl%1 m1, m3 - mova m2, m0 - punpckl%2 m0, m1 - punpckh%2 m2, m1 - - movh m4, %8 - movh m6, %9 - movh m5, %10 - movh m7, %11 - punpckl%1 m4, m6 - punpckl%1 m5, m7 - mova m6, m4 - punpckl%2 m4, m5 - punpckh%2 m6, m5 - - punpckh%3 m1, m0, m4 - punpckh%3 m3, m2, m6 - punpckl%3 m0, m4 - punpckl%3 m2, m6 -%endmacro - ; in: 4 rows of 8 bytes in m0..m3 ; out: 8 rows of 4 bytes in 
%1..%8 %macro TRANSPOSE8x4B_STORE 8 @@ -844,24 +815,24 @@ punpcklbw m2, m3 punpcklwd m1, m0, m2 punpckhwd m0, m2 - movh %1, m1 + movd %1, m1 punpckhdq m1, m1 - movh %2, m1 - movh %3, m0 + movd %2, m1 + movd %3, m0 punpckhdq m0, m0 - movh %4, m0 + movd %4, m0 punpckhdq m3, m3 punpcklbw m4, m5 punpcklbw m6, m3 punpcklwd m5, m4, m6 punpckhwd m4, m6 - movh %5, m5 + movd %5, m5 punpckhdq m5, m5 - movh %6, m5 - movh %7, m4 + movd %6, m5 + movd %7, m4 punpckhdq m4, m4 - movh %8, m4 + movd %8, m4 %endmacro ; in: 8 rows of 4 bytes in %9..%10 @@ -877,34 +848,94 @@ pextrd %8, %10, 3 %endmacro -%macro TRANSPOSE4x8B_LOAD 8 - TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 -%endmacro - -%macro TRANSPOSE4x8W_LOAD 8 -%if mmsize==16 - TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8 -%else +; in: 4 rows of 4 words in %1..%4 +; out: 4 rows of 4 words in m0..m3 +; clobbers: m4 +%macro TRANSPOSE4x4W_LOAD 4-8 +%if mmsize==8 SWAP 1, 4, 2, 3 - mova m0, [t5] - mova m1, [t5+r1] - mova m2, [t5+r1*2] - mova m3, [t5+t6] + movq m0, %1 + movq m1, %2 + movq m2, %3 + movq m3, %4 TRANSPOSE4x4W 0, 1, 2, 3, 4 +%else + movq m0, %1 + movq m2, %2 + movq m1, %3 + movq m3, %4 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpckldq m0, m1 + punpckhdq m2, m1 + movhlps m1, m0 + movhlps m3, m2 %endif %endmacro -%macro TRANSPOSE8x2W_STORE 8 +; in: 2 rows of 4 words in m1..m2 +; out: 4 rows of 2 words in %1..%4 +; clobbers: m0, m1 +%macro TRANSPOSE4x2W_STORE 4-8 +%if mmsize==8 punpckhwd m0, m1, m2 punpcklwd m1, m2 -%if mmsize==8 +%else + punpcklwd m1, m2 + movhlps m0, m1 +%endif movd %3, m0 movd %1, m1 psrlq m1, 32 psrlq m0, 32 movd %2, m1 movd %4, m0 +%endmacro + +; in: 4/8 rows of 4 words in %1..%8 +; out: 4 rows of 4/8 words in m0..m3 +; clobbers: m4, m5, m6, m7 +%macro TRANSPOSE4x8W_LOAD 8 +%if mmsize==8 + TRANSPOSE4x4W_LOAD %1, %2, %3, %4 %else + movq m0, %1 + movq m2, %2 + movq m1, %3 + movq m3, %4 + punpcklwd m0, m2 + punpcklwd m1, m3 + mova m2, m0 + punpckldq m0, m1 +
punpckhdq m2, m1 + + movq m4, %5 + movq m6, %6 + movq m5, %7 + movq m7, %8 + punpcklwd m4, m6 + punpcklwd m5, m7 + mova m6, m4 + punpckldq m4, m5 + punpckhdq m6, m5 + + punpckhqdq m1, m0, m4 + punpckhqdq m3, m2, m6 + punpcklqdq m0, m4 + punpcklqdq m2, m6 +%endif +%endmacro + +; in: 2 rows of 4/8 words in m1..m2 +; out: 4/8 rows of 2 words in %1..%8 +; clobbers: m0, m1 +%macro TRANSPOSE8x2W_STORE 8 +%if mmsize==8 + TRANSPOSE4x2W_STORE %1, %2, %3, %4 +%else + punpckhwd m0, m1, m2 + punpcklwd m1, m2 movd %5, m0 movd %1, m1 psrldq m1, 4 @@ -1118,7 +1149,7 @@ %endif mova m6, [pb_1] psubusb m4, m6 ; alpha - 1 - psubusb m5, m6 ; alpha - 2 + psubusb m5, m6 ; beta - 1 %if %0>2 mova %3, m4 %endif @@ -1361,19 +1392,18 @@ ;----------------------------------------------------------------------------- ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 ) ;----------------------------------------------------------------------------- - %if cpuflag(avx) INIT_XMM cpuname %else INIT_MMX cpuname %endif -cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 - mov r0, r0mp +cglobal deblock_h_luma, 1,5,8,0x60+12 mov r3, r1m lea r4, [r3*3] sub r0, 4 lea r1, [r0+r4] - %define pix_tmp esp+12*HAVE_ALIGNED_STACK + %define pix_tmp esp+12 + ; esp is intentionally misaligned to make it aligned after pushing the arguments for deblock_%1_luma. 
; transpose 6x16 -> tmp space TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp @@ -2098,17 +2128,14 @@ ;----------------------------------------------------------------------------- %macro DEBLOCK_H_CHROMA_420_MBAFF 0 cglobal deblock_h_chroma_mbaff, 5,7,8 - sub r0, 4 - lea t6, [r1*3] - mov t5, r0 - add r0, t6 - TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + CHROMA_H_START + TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6) LOAD_MASK r2d, r3d movd m6, [r4] ; tc0 punpcklbw m6, m6 pand m7, m6 DEBLOCK_P0_Q0 - TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) RET %endmacro @@ -2249,9 +2276,9 @@ INIT_MMX mmx2 cglobal deblock_h_chroma_intra_mbaff, 4,6,8 CHROMA_H_START - TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6) + TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6) call chroma_intra_body - TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) + TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2) RET %endif ; !HIGH_BIT_DEPTH diff -Nru x264-0.142.2389+git956c8d8/common/x86/mc-a2.asm x264-0.142.2431+gita5831aa/common/x86/mc-a2.asm --- x264-0.142.2389+git956c8d8/common/x86/mc-a2.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/mc-a2.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2005-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Holger Lubitz ;* Mathieu Monnier ;* Oskar Arvidsson @@ -32,12 +32,14 @@ SECTION_RODATA 32 +pw_1024: times 16 dw 1024 filt_mul20: times 32 db 20 filt_mul15: times 16 db 1, -5 filt_mul51: times 16 db -5, 1 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15 +%if HIGH_BIT_DEPTH v210_mask: times 4 dq 0xc00ffc003ff003ff v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 @@ -45,18 +47,18 @@ v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800 -%if HIGH_BIT_DEPTH deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15 %else +deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1 + db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1 + deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 -%endif -pw_1024: times 16 dw 1024 +%endif ; !HIGH_BIT_DEPTH pd_16: times 4 dd 16 pd_0f: times 4 dd 0xffff -pf_inv256: times 8 dd 0.00390625 pad10: times 8 dw 10*PIXEL_MAX pad20: times 8 dw 20*PIXEL_MAX @@ -67,16 +69,22 @@ tap2: times 4 dw 20, 20 tap3: times 4 dw -5, 1 +pw_0xc000: times 8 dw 0xc000 +pw_31: times 8 dw 31 +pd_4: times 4 dd 4 + SECTION .text cextern pb_0 cextern pw_1 +cextern pw_8 cextern pw_16 cextern pw_32 cextern pw_512 cextern pw_00ff cextern pw_3fff cextern pw_pixel_max +cextern pw_0to15 cextern pd_ffff %macro LOAD_ADD 4 @@ -1202,6 +1210,105 @@ RET %endmacro ; PLANE_DEINTERLEAVE +%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2 +%if cpuflag(ssse3) + mova m3, [deinterleave_rgb_shuf+(%1-3)*16] +%endif +%%loopy: + mov %8, r6 + mov %9, %6 +%%loopx: + movu m0, [%8] + movu m1, [%8+%1*mmsize/4] +%if cpuflag(ssse3) + pshufb m0, m3 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3 + pshufb m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7 +%elif %1 == 3 + psrldq m2, m0, 6 + punpcklqdq m0, m1 ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5 + psrldq m1, 6 + punpcklqdq m2, m1 ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7 + psrlq m3, m0, 24 + psrlq m4, m2, 24 + punpckhbw m1, m0, m3 ; b4 b5 g4 g5 r4 r5 + punpcklbw m0, m3 ; b0 b1 g0 g1 r0 r1 + punpckhbw m3, m2, m4 ; b6 b7 g6 g7 r6 r7 + punpcklbw m2, m4 ; b2 b3 g2 g3 r2 r3 + punpcklwd m0, m2 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3 + punpcklwd m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7 +%else + pshufd m3, m0, q2301 + pshufd m4, m1, q2301 + punpckhbw m2, m0, 
m3 ; b2 b3 g2 g3 r2 r3 + punpcklbw m0, m3 ; b0 b1 g0 g1 r0 r1 + punpckhbw m3, m1, m4 ; b6 b7 g6 g7 r6 r7 + punpcklbw m1, m4 ; b4 b5 g4 g5 r4 r5 + punpcklwd m0, m2 ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3 + punpcklwd m1, m3 ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7 +%endif + punpckldq m2, m0, m1 ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7 + punpckhdq m0, m1 ; r0 r1 r2 r3 r4 r5 r6 r7 + movh [r0+%9], m2 + movhps [r2+%9], m2 + movh [r4+%9], m0 + add %8, %1*mmsize/2 + add %9, mmsize/2 + jl %%loopx + add r0, %2 + add r2, %3 + add r4, %4 + add r6, %5 + dec %7d + jg %%loopy +%endmacro + +%macro PLANE_DEINTERLEAVE_RGB 0 +;----------------------------------------------------------------------------- +; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta, +; pixel *dstb, intptr_t i_dstb, +; pixel *dstc, intptr_t i_dstc, +; pixel *src, intptr_t i_src, int pw, int w, int h ) +;----------------------------------------------------------------------------- +%if ARCH_X86_64 +cglobal plane_copy_deinterleave_rgb, 8,12 + %define %%args r1, r3, r5, r7, r8, r9, r10, r11 + mov r8d, r9m + mov r9d, r10m + add r0, r8 + add r2, r8 + add r4, r8 + neg r8 +%else +cglobal plane_copy_deinterleave_rgb, 1,7 + %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5 + mov r1, r9m + mov r2, r2m + mov r4, r4m + mov r6, r6m + add r0, r1 + add r2, r1 + add r4, r1 + neg r1 + mov r9m, r1 + mov r1, r10m +%endif + cmp dword r8m, 4 + je .pw4 + PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR + jmp .ret +.pw4: + PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA +.ret: + REP_RET +%endmacro + +%if HIGH_BIT_DEPTH == 0 +INIT_XMM sse2 +PLANE_DEINTERLEAVE_RGB +INIT_XMM ssse3 +PLANE_DEINTERLEAVE_RGB +%endif ; !HIGH_BIT_DEPTH + %macro PLANE_DEINTERLEAVE_V210 0 ;----------------------------------------------------------------------------- ; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty, @@ -1881,62 +1988,64 @@ ; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ) 
;----------------------------------------------------------------------------- %macro MBTREE 0 -cglobal mbtree_propagate_cost, 7,7,7 - add r6d, r6d - lea r0, [r0+r6*2] - add r1, r6 - add r2, r6 - add r3, r6 - add r4, r6 - neg r6 - pxor xmm4, xmm4 - movss xmm6, [r5] - shufps xmm6, xmm6, 0 - mulps xmm6, [pf_inv256] - movdqa xmm5, [pw_3fff] -.loop: - movq xmm2, [r2+r6] ; intra - movq xmm0, [r4+r6] ; invq - movq xmm3, [r3+r6] ; inter - movq xmm1, [r1+r6] ; prop - punpcklwd xmm2, xmm4 - punpcklwd xmm0, xmm4 - pmaddwd xmm0, xmm2 - pand xmm3, xmm5 - punpcklwd xmm1, xmm4 - punpcklwd xmm3, xmm4 +cglobal mbtree_propagate_cost, 6,6,7 + movss m6, [r5] + mov r5d, r6m + lea r0, [r0+r5*2] + add r5d, r5d + add r1, r5 + add r2, r5 + add r3, r5 + add r4, r5 + neg r5 + pxor m4, m4 + shufps m6, m6, 0 + mova m5, [pw_3fff] +.loop: + movq m2, [r2+r5] ; intra + movq m0, [r4+r5] ; invq + movq m3, [r3+r5] ; inter + movq m1, [r1+r5] ; prop + pand m3, m5 + pminsw m3, m2 + punpcklwd m2, m4 + punpcklwd m0, m4 + pmaddwd m0, m2 + punpcklwd m1, m4 + punpcklwd m3, m4 %if cpuflag(fma4) - cvtdq2ps xmm0, xmm0 - cvtdq2ps xmm1, xmm1 - fmaddps xmm0, xmm0, xmm6, xmm1 - cvtdq2ps xmm1, xmm2 - psubd xmm2, xmm3 - cvtdq2ps xmm2, xmm2 - rcpps xmm3, xmm1 - mulps xmm1, xmm3 - mulps xmm0, xmm2 - addps xmm2, xmm3, xmm3 - fnmaddps xmm3, xmm1, xmm3, xmm2 - mulps xmm0, xmm3 -%else - cvtdq2ps xmm0, xmm0 - mulps xmm0, xmm6 ; intra*invq*fps_factor>>8 - cvtdq2ps xmm1, xmm1 ; prop - addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8) - cvtdq2ps xmm1, xmm2 ; intra - psubd xmm2, xmm3 ; intra - inter - cvtdq2ps xmm2, xmm2 ; intra - inter - rcpps xmm3, xmm1 ; 1 / intra 1st approximation - mulps xmm1, xmm3 ; intra * (1/intra 1st approx) - mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2 - mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) - addps xmm3, xmm3 ; 2 * (1/intra 1st approx) - subps xmm3, xmm1 ; 2nd approximation for 1/intra - mulps xmm0, xmm3 ; / intra -%endif - cvtps2dq xmm0, xmm0 - movdqa 
[r0+r6*2], xmm0 - add r6, 8 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + fmaddps m0, m0, m6, m1 + cvtdq2ps m1, m2 + psubd m2, m3 + cvtdq2ps m2, m2 + rcpps m3, m1 + mulps m1, m3 + mulps m0, m2 + addps m2, m3, m3 + fnmaddps m3, m1, m3, m2 + mulps m0, m3 +%else + cvtdq2ps m0, m0 + mulps m0, m6 ; intra*invq*fps_factor>>8 + cvtdq2ps m1, m1 ; prop + addps m0, m1 ; prop + (intra*invq*fps_factor>>8) + cvtdq2ps m1, m2 ; intra + psubd m2, m3 ; intra - inter + cvtdq2ps m2, m2 ; intra - inter + rcpps m3, m1 ; 1 / intra 1st approximation + mulps m1, m3 ; intra * (1/intra 1st approx) + mulps m1, m3 ; intra * (1/intra 1st approx)^2 + mulps m0, m2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter) + addps m3, m3 ; 2 * (1/intra 1st approx) + subps m3, m1 ; 2nd approximation for 1/intra + mulps m0, m3 ; / intra +%endif + cvtps2dq m0, m0 + packssdw m0, m0 + movh [r0+r5], m0 + add r5, 8 jl .loop RET %endmacro @@ -1948,34 +2057,35 @@ MBTREE %macro INT16_UNPACK 1 - vpunpckhwd xm4, xm%1, xm7 - vpunpcklwd xm%1, xm7 - vinsertf128 m%1, m%1, xm4, 1 -%endmacro - -; FIXME: align loads/stores to 16 bytes -%macro MBTREE_AVX 0 -cglobal mbtree_propagate_cost, 7,7,8 - add r6d, r6d - lea r0, [r0+r6*2] - add r1, r6 - add r2, r6 - add r3, r6 - add r4, r6 - neg r6 - mova xm5, [pw_3fff] - vbroadcastss m6, [r5] - mulps m6, [pf_inv256] + punpckhwd xm4, xm%1, xm7 + punpcklwd xm%1, xm7 + vinsertf128 m%1, m%1, xm4, 1 +%endmacro + +; FIXME: align loads to 16 bytes +%macro MBTREE_AVX 1 +cglobal mbtree_propagate_cost, 6,6,%1 + vbroadcastss m6, [r5] + mov r5d, r6m + lea r0, [r0+r5*2] + add r5d, r5d + add r1, r5 + add r2, r5 + add r3, r5 + add r4, r5 + neg r5 + mova xm5, [pw_3fff] %if notcpuflag(avx2) - pxor xm7, xm7 + pxor xm7, xm7 %endif .loop: %if cpuflag(avx2) - pmovzxwd m0, [r2+r6] ; intra - pmovzxwd m1, [r4+r6] ; invq - pmovzxwd m2, [r1+r6] ; prop - pand xm3, xm5, [r3+r6] ; inter + pmovzxwd m0, [r2+r5] ; intra + pmovzxwd m1, [r4+r5] ; invq + pmovzxwd m2, [r1+r5] ; prop + pand xm3, xm5, [r3+r5] ; inter 
pmovzxwd m3, xm3 + pminsd m3, m0 pmaddwd m1, m0 psubd m4, m0, m3 cvtdq2ps m0, m0 @@ -1990,10 +2100,11 @@ fnmaddps m4, m2, m3, m4 mulps m1, m4 %else - movu xm0, [r2+r6] - movu xm1, [r4+r6] - movu xm2, [r1+r6] - pand xm3, xm5, [r3+r6] + movu xm0, [r2+r5] + movu xm1, [r4+r5] + movu xm2, [r1+r5] + pand xm3, xm5, [r3+r5] + pminsw xm3, xm0 INT16_UNPACK 0 INT16_UNPACK 1 INT16_UNPACK 2 @@ -2015,13 +2126,107 @@ mulps m1, m3 ; / intra %endif vcvtps2dq m1, m1 - movu [r0+r6*2], m1 - add r6, 16 + vextractf128 xm2, m1, 1 + packssdw xm1, xm2 + mova [r0+r5], xm1 + add r5, 16 jl .loop RET %endmacro INIT_YMM avx -MBTREE_AVX +MBTREE_AVX 8 INIT_YMM avx2,fma3 -MBTREE_AVX +MBTREE_AVX 7 + +%macro MBTREE_PROPAGATE_LIST 0 +;----------------------------------------------------------------------------- +; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int *propagate_amount, uint16_t *lowres_costs, +; int16_t *output, int bipred_weight, int mb_y, int len ) +;----------------------------------------------------------------------------- +cglobal mbtree_propagate_list_internal, 4,6,8 + movh m6, [pw_0to15] ; mb_x + movd m7, r5m + pshuflw m7, m7, 0 + punpcklwd m6, m7 ; 0 y 1 y 2 y 3 y + movd m7, r4m + SPLATW m7, m7 ; bipred_weight + psllw m7, 9 ; bipred_weight << 9 + + mov r5d, r6m + xor r4d, r4d +.loop: + mova m3, [r1+r4*2] + movu m4, [r2+r4*2] + mova m5, [pw_0xc000] + pand m4, m5 + pcmpeqw m4, m5 + pmulhrsw m5, m3, m7 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 +%if cpuflag(avx) + pblendvb m5, m3, m5, m4 +%else + pand m5, m4 + pandn m4, m3 + por m5, m4 ; if( lists_used == 3 ) + ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 +%endif + + movu m0, [r0+r4*4] ; x,y + movu m1, [r0+r4*4+mmsize] + + psraw m2, m0, 5 + psraw m3, m1, 5 + mova m4, [pd_4] + paddw m2, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y} + paddw m6, m4 ; {mbx, mby} += {4, 0} + paddw m3, m6 ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y} + paddw m6, m4 ; {mbx, mby} += 
{4, 0} + + mova [r3+mmsize*0], m2 + mova [r3+mmsize*1], m3 + + mova m3, [pw_31] + pand m0, m3 ; x &= 31 + pand m1, m3 ; y &= 31 + packuswb m0, m1 + psrlw m1, m0, 3 + pand m0, m3 ; x + SWAP 1, 3 + pandn m1, m3 ; y premultiplied by (1<<5) for later use of pmulhrsw + + mova m3, [pw_32] + psubw m3, m0 ; 32 - x + mova m4, [pw_1024] + psubw m4, m1 ; (32 - y) << 5 + + pmullw m2, m3, m4 ; idx0weight = (32-y)*(32-x) << 5 + pmullw m4, m0 ; idx1weight = (32-y)*x << 5 + pmullw m0, m1 ; idx3weight = y*x << 5 + pmullw m1, m3 ; idx2weight = y*(32-x) << 5 + + ; avoid overflow in the input to pmulhrsw + psrlw m3, m2, 15 + psubw m2, m3 ; idx0weight -= (idx0weight == 32768) + + pmulhrsw m2, m5 ; idx0weight * propagate_amount + 512 >> 10 + pmulhrsw m4, m5 ; idx1weight * propagate_amount + 512 >> 10 + pmulhrsw m1, m5 ; idx2weight * propagate_amount + 512 >> 10 + pmulhrsw m0, m5 ; idx3weight * propagate_amount + 512 >> 10 + + SBUTTERFLY wd, 2, 4, 3 + SBUTTERFLY wd, 1, 0, 3 + mova [r3+mmsize*2], m2 + mova [r3+mmsize*3], m4 + mova [r3+mmsize*4], m1 + mova [r3+mmsize*5], m0 + add r4d, mmsize/2 + add r3, mmsize*6 + cmp r4d, r5d + jl .loop + REP_RET +%endmacro + +INIT_XMM ssse3 +MBTREE_PROPAGATE_LIST +INIT_XMM avx +MBTREE_PROPAGATE_LIST diff -Nru x264-0.142.2389+git956c8d8/common/x86/mc-a.asm x264-0.142.2431+gita5831aa/common/x86/mc-a.asm --- x264-0.142.2389+git956c8d8/common/x86/mc-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/mc-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Laurent Aimar ;* Dylan Yudaken ;* Holger Lubitz diff -Nru x264-0.142.2389+git956c8d8/common/x86/mc-c.c x264-0.142.2431+gita5831aa/common/x86/mc-c.c --- x264-0.142.2389+git956c8d8/common/x86/mc-c.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/mc-c.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar 
* Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -116,6 +116,14 @@ void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint16_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, + pixel *dstc, intptr_t i_dstc, + pixel *src, intptr_t i_src, int pw, int w, int h ); +void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, + pixel *dstc, intptr_t i_dstc, + pixel *src, intptr_t i_src, int pw, int w, int h ); void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu, uint16_t *dstv, intptr_t i_dstv, uint32_t *src, intptr_t i_src, int w, int h ); @@ -153,13 +161,13 @@ void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); -void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void 
x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs, +void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); #define MC_CHROMA(cpu)\ @@ -525,6 +533,113 @@ PLANE_INTERLEAVE(avx) #endif +#if HAVE_X86_INLINE_ASM +#define CLIP_ADD(s,x)\ +do\ +{\ + int temp;\ + asm("movd %0, %%xmm0 \n"\ + "movd %2, %%xmm1 \n"\ + "paddsw %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %1 \n"\ + :"+m"(s), "=&r"(temp)\ + :"m"(x)\ + );\ + s = temp;\ +} while(0) + +#define CLIP_ADD2(s,x)\ +do\ +{\ + asm("movd %0, %%xmm0 \n"\ + "movd %1, %%xmm1 \n"\ + "paddsw %%xmm1, %%xmm0 \n"\ + "movd %%xmm0, %0 \n"\ + :"+m"(M32(s))\ + :"m"(M32(x))\ + );\ +} while(0) +#else +#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) +#define CLIP_ADD2(s,x)\ +do\ +{\ + CLIP_ADD((s)[0], (x)[0]);\ + CLIP_ADD((s)[1], (x)[1]);\ +} while(0) +#endif + +#define PROPAGATE_LIST(cpu)\ +void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\ + uint16_t *lowres_costs, int16_t *output,\ + int bipred_weight, int mb_y, int len );\ +\ +static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\ + int16_t *propagate_amount, uint16_t *lowres_costs,\ + int bipred_weight, int mb_y, int len, int list )\ +{\ + int16_t *current = h->scratch_buffer2;\ +\ + x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\ + current, bipred_weight, mb_y, len );\ +\ + unsigned stride = h->mb.i_mb_stride;\ + unsigned width = h->mb.i_mb_width;\ + unsigned height = h->mb.i_mb_height;\ +\ + for( unsigned i = 0; i < len; current += 32 )\ + {\ + int end = X264_MIN( i+8, len );\ + for( ; i < end; i++, current += 2 )\ + {\ + if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\ + continue;\ +\ + unsigned mbx = current[0];\ + unsigned mby = current[1];\ + unsigned idx0 = mbx + mby * stride;\ + 
unsigned idx2 = idx0 + stride;\ +\ + /* Shortcut for the simple/common case of zero MV */\ + if( !M32( mvs[i] ) )\ + {\ + CLIP_ADD( ref_costs[idx0], current[16] );\ + continue;\ + }\ +\ + if( mbx < width-1 && mby < height-1 )\ + {\ + CLIP_ADD2( ref_costs+idx0, current+16 );\ + CLIP_ADD2( ref_costs+idx2, current+32 );\ + }\ + else\ + {\ + /* Note: this takes advantage of unsigned representation to\ + * catch negative mbx/mby. */\ + if( mby < height )\ + {\ + if( mbx < width )\ + CLIP_ADD( ref_costs[idx0+0], current[16] );\ + if( mbx+1 < width )\ + CLIP_ADD( ref_costs[idx0+1], current[17] );\ + }\ + if( mby+1 < height )\ + {\ + if( mbx < width )\ + CLIP_ADD( ref_costs[idx2+0], current[32] );\ + if( mbx+1 < width )\ + CLIP_ADD( ref_costs[idx2+1], current[33] );\ + }\ + }\ + }\ + }\ +} + +PROPAGATE_LIST(ssse3) +PROPAGATE_LIST(avx) +#undef CLIP_ADD +#undef CLIP_ADD2 + void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) @@ -637,6 +752,7 @@ pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) ) pf->integral_init4v = x264_integral_init4v_ssse3; @@ -688,6 +804,7 @@ pf->integral_init8v = x264_integral_init8v_sse2; pf->hpel_filter = x264_hpel_filter_sse2_amd; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2; if( !(cpu&X264_CPU_SSE2_IS_SLOW) ) { @@ -738,6 +855,8 @@ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_ssse3; pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3; pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3; + pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3; if( !(cpu&X264_CPU_SLOW_PSHUFB) ) { @@ -814,6 +933,7 @@ return; pf->memzero_aligned = 
x264_memzero_aligned_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; + pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx; if( cpu&X264_CPU_FMA4 ) pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4; diff -Nru x264-0.142.2389+git956c8d8/common/x86/pixel-a.asm x264-0.142.2431+gita5831aa/common/x86/pixel-a.asm --- x264-0.142.2389+git956c8d8/common/x86/pixel-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/pixel-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -7,7 +7,7 @@ ;* Holger Lubitz ;* Laurent Aimar ;* Alex Izvorski -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Oskar Arvidsson ;* ;* This program is free software; you can redistribute it and/or modify @@ -205,7 +205,7 @@ mov r4d, %%n %endif pxor m0, m0 -.loop +.loop: mova m1, [r0] mova m2, [r0+offset0_1] mova m3, [r0+offset0_2] @@ -561,10 +561,15 @@ pshufhw m0, m0, q3120 pshufhw m1, m1, q3120 %endif +%if cpuflag(xop) + pmadcswd m2, m0, m0, m2 + pmadcswd m3, m1, m1, m3 +%else pmaddwd m0, m0 pmaddwd m1, m1 paddd m2, m0 paddd m3, m1 +%endif add r6, 2*mmsize jl .loopx %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled @@ -657,10 +662,15 @@ por m0, m1 psrlw m2, m0, 8 pand m0, m5 +%if cpuflag(xop) + pmadcswd m4, m2, m2, m4 + pmadcswd m3, m0, m0, m3 +%else pmaddwd m2, m2 pmaddwd m0, m0 - paddd m3, m0 paddd m4, m2 + paddd m3, m0 +%endif add r6, mmsize jl .loopx %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled @@ -695,6 +705,8 @@ SSD_NV12 INIT_XMM avx SSD_NV12 +INIT_XMM xop +SSD_NV12 INIT_YMM avx2 SSD_NV12 @@ -1265,7 +1277,7 @@ ; clobber: m3..m7 ; out: %1 = satd %macro SATD_4x4_MMX 3 - %xdefine %%n n%1 + %xdefine %%n nn%1 %assign offset %2*SIZEOF_PIXEL LOAD_DIFF m4, m3, none, [r0+ offset], [r2+ offset] LOAD_DIFF m5, m3, none, [r0+ r1+offset], [r2+ r3+offset] diff -Nru x264-0.142.2389+git956c8d8/common/x86/pixel.h x264-0.142.2431+gita5831aa/common/x86/pixel.h --- x264-0.142.2389+git956c8d8/common/x86/pixel.h 
2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/pixel.h 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -56,6 +56,7 @@ DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) DECL_X4( sad, ssse3 ) +DECL_X4( sad, xop ) DECL_X4( sad, avx ) DECL_X4( sad, avx2 ) DECL_X1( ssd, mmx ) @@ -153,6 +154,9 @@ void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); +void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1, + pixel *pixuv2, intptr_t stride2, int width, + int height, uint64_t *ssd_u, uint64_t *ssd_v ); void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1, pixel *pixuv2, intptr_t stride2, int width, int height, uint64_t *ssd_u, uint64_t *ssd_v ); diff -Nru x264-0.142.2389+git956c8d8/common/x86/predict-a.asm x264-0.142.2431+gita5831aa/common/x86/predict-a.asm --- x264-0.142.2389+git956c8d8/common/x86/predict-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/predict-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ ;* ;* Authors: Loren Merritt ;* Holger Lubitz -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Henrik Gramner ;* ;* This program is free software; you can redistribute it and/or modify @@ -31,7 +31,6 @@ SECTION_RODATA 32 -pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4 pw_m3: times 16 dw -3 pw_m7: times 16 dw -7 @@ -56,6 +55,7 @@ cextern pw_16 cextern pw_00ff cextern pw_pixel_max +cextern pw_0to15 %macro STORE8 1 mova [r0+0*FDEC_STRIDEB], %1 diff -Nru x264-0.142.2389+git956c8d8/common/x86/predict-c.c x264-0.142.2431+gita5831aa/common/x86/predict-c.c --- 
x264-0.142.2389+git956c8d8/common/x86/predict-c.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/predict-c.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/common/x86/quant-a.asm x264-0.142.2431+gita5831aa/common/x86/quant-a.asm --- x264-0.142.2389+git956c8d8/common/x86/quant-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/quant-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2005-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Christian Heine ;* Oskar Arvidsson ;* Henrik Gramner diff -Nru x264-0.142.2389+git956c8d8/common/x86/quant.h x264-0.142.2431+gita5831aa/common/x86/quant.h --- x264-0.142.2389+git956c8d8/common/x86/quant.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/quant.h 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Christian Heine * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/x86/sad16-a.asm x264-0.142.2431+gita5831aa/common/x86/sad16-a.asm --- x264-0.142.2389+git956c8d8/common/x86/sad16-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/sad16-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -519,6 +519,19 @@ SAD_X 4, 8, 16 SAD_X 4, 8, 8 SAD_X 4, 8, 4 +INIT_XMM xop +%define XMM_REGS 7 +SAD_X 3, 16, 16 +SAD_X 3, 16, 8 +SAD_X 3, 8, 16 +SAD_X 3, 8, 8 +SAD_X 3, 8, 4 +%define XMM_REGS 9 +SAD_X 4, 16, 16 +SAD_X 4, 16, 8 +SAD_X 4, 8, 16 +SAD_X 4, 8, 8 +SAD_X 4, 8, 4 INIT_YMM avx2 %define XMM_REGS 7 SAD_X 3, 16, 16 
diff -Nru x264-0.142.2389+git956c8d8/common/x86/sad-a.asm x264-0.142.2431+gita5831aa/common/x86/sad-a.asm --- x264-0.142.2389+git956c8d8/common/x86/sad-a.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/sad-a.asm 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ ;* Copyright (C) 2003-2014 x264 project ;* ;* Authors: Loren Merritt -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Laurent Aimar ;* Alex Izvorski ;* diff -Nru x264-0.142.2389+git956c8d8/common/x86/util.h x264-0.142.2431+gita5831aa/common/x86/util.h --- x264-0.142.2389+git956c8d8/common/x86/util.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/util.h 2014-07-11 01:16:23.000000000 +0000 @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2008-2014 x264 project * - * Authors: Jason Garrett-Glaser + * Authors: Fiona Glaser * Loren Merritt * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/common/x86/x86inc.asm x264-0.142.2431+gita5831aa/common/x86/x86inc.asm --- x264-0.142.2389+git956c8d8/common/x86/x86inc.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/x86inc.asm 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ ;* ;* Authors: Loren Merritt ;* Anton Mitrofanov -;* Jason Garrett-Glaser +;* Fiona Glaser ;* Henrik Gramner ;* ;* Permission to use, copy, modify, and/or distribute this software for any @@ -42,6 +42,14 @@ %define public_prefix private_prefix %endif +%ifndef STACK_ALIGNMENT + %if ARCH_X86_64 + %define STACK_ALIGNMENT 16 + %else + %define STACK_ALIGNMENT 4 + %endif +%endif + %define WIN64 0 %define UNIX64 0 %if ARCH_X86_64 @@ -94,8 +102,9 @@ ; %1 = number of arguments. loads them from stack if needed. ; %2 = number of registers used. pushes callee-saved regs if needed. ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. 
-; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x, -; MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes), +; %4 = (optional) stack size to be allocated. The stack will be aligned before +; allocating the specified stack size. If the required stack alignment is +; larger than the known stack alignment the stack will be manually aligned ; and an extra register will be allocated to hold the original stack ; pointer (to not invalidate r0m etc.). To prevent the use of an extra ; register as stack pointer, request a negative stack size. @@ -103,8 +112,10 @@ ; PROLOGUE can also be invoked by adding the same options to cglobal ; e.g. -; cglobal foo, 2,3,0, dst, src, tmp -; declares a function (foo), taking two args (dst and src) and one local variable (tmp) +; cglobal foo, 2,3,7,0x40, dst, src, tmp +; declares a function (foo) that automatically loads two arguments (dst and +; src) into registers, uses one additional register (tmp) plus 7 vector +; registers (m0-m6) and allocates 0x40 bytes of stack space. ; TODO Some functions can use some args directly from the stack. 
If they're the ; last args then you can just not declare them, but if they're in the middle @@ -304,26 +315,28 @@ %assign n_arg_names %0 %endmacro +%define required_stack_alignment ((mmsize + 15) & ~15) + %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 %if %1 != 0 - %assign %%stack_alignment ((mmsize + 15) & ~15) + %assign %%pad 0 %assign stack_size %1 %if stack_size < 0 %assign stack_size -stack_size %endif - %assign stack_size_padded stack_size %if WIN64 - %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space + %assign %%pad %%pad + 32 ; shadow space %if mmsize != 8 %assign xmm_regs_used %2 %if xmm_regs_used > 8 - %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16 + %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers %endif %endif %endif - %if mmsize <= 16 && HAVE_ALIGNED_STACK - %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) + %if required_stack_alignment <= STACK_ALIGNMENT + ; maintain the current stack alignment + %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %else %assign %%reg_num (regs_used - 1) @@ -332,17 +345,17 @@ ; it, i.e. in [rsp+stack_size_padded], so we can restore the ; stack in a single instruction (i.e. 
mov rsp, rstk or mov ; rsp, [rsp+stack_size_padded]) - mov rstk, rsp %if %1 < 0 ; need to store rsp on stack - sub rsp, gprsize+stack_size_padded - and rsp, ~(%%stack_alignment-1) - %xdefine rstkm [rsp+stack_size_padded] - mov rstkm, rstk + %xdefine rstkm [rsp + stack_size + %%pad] + %assign %%pad %%pad + gprsize %else ; can keep rsp in rstk during whole function - sub rsp, stack_size_padded - and rsp, ~(%%stack_alignment-1) %xdefine rstkm rstk %endif + %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1)) + mov rstk, rsp + and rsp, ~(required_stack_alignment-1) + sub rsp, stack_size_padded + movifnidn rstkm, rstk %endif WIN64_PUSH_XMM %endif @@ -351,7 +364,7 @@ %macro SETUP_STACK_POINTER 1 %ifnum %1 - %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) + %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT %if %1 > 0 %assign regs_used (regs_used + 1) %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 @@ -425,7 +438,9 @@ %assign xmm_regs_used %1 ASSERT xmm_regs_used <= 16 %if xmm_regs_used > 8 - %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32 + ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. 
+ %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM @@ -441,7 +456,7 @@ %endrep %endif %if stack_size_padded > 0 - %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) + %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add %1, stack_size_padded @@ -507,7 +522,7 @@ %macro RET 0 %if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 +%if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded @@ -563,7 +578,7 @@ %macro RET 0 %if stack_size_padded > 0 -%if mmsize == 32 || HAVE_ALIGNED_STACK == 0 +%if required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else add rsp, stack_size_padded @@ -803,12 +818,12 @@ %assign %%i 0 %rep 8 CAT_XDEFINE m, %%i, mm %+ %%i - CAT_XDEFINE nmm, %%i, %%i + CAT_XDEFINE nnmm, %%i, %%i %assign %%i %%i+1 %endrep %rep 8 CAT_UNDEF m, %%i - CAT_UNDEF nmm, %%i + CAT_UNDEF nnmm, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 @@ -829,7 +844,7 @@ %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, xmm %+ %%i - CAT_XDEFINE nxmm, %%i, %%i + CAT_XDEFINE nnxmm, %%i, %%i %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 @@ -899,7 +914,7 @@ %endrep %rep %0/2 %xdefine m%1 %%tmp%2 - CAT_XDEFINE n, m%1, %1 + CAT_XDEFINE nn, m%1, %1 %rotate 2 %endrep %endmacro @@ -917,16 +932,16 @@ %xdefine %%tmp m%1 %xdefine m%1 m%2 %xdefine m%2 %%tmp - CAT_XDEFINE n, m%1, %1 - CAT_XDEFINE n, m%2, %2 + CAT_XDEFINE nn, m%1, %1 + CAT_XDEFINE nn, m%2, %2 %rotate 1 %endrep %endmacro %macro SWAP_INTERNAL_NAME 2-* - %xdefine %%args n %+ %1 + %xdefine %%args nn %+ %1 %rep %0-1 - %xdefine %%args %%args, n %+ %2 + %xdefine %%args %%args, nn %+ %2 %rotate 1 %endrep SWAP_INTERNAL_NUM %%args @@ -953,7 +968,7 @@ %assign %%i 0 %rep num_mmregs CAT_XDEFINE m, %%i, %1_m %+ %%i - CAT_XDEFINE n, m %+ %%i, %%i + CAT_XDEFINE nn, m %+ %%i, %%i %assign %%i %%i+1 %endrep %endif 
@@ -1385,15 +1400,18 @@ %macro %1 4-7 %1, %2, %3 %if cpuflag(xop) v%5 %1, %2, %3, %4 - %else + %elifnidn %1, %4 %6 %1, %2, %3 %7 %1, %4 + %else + %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported %endif %endmacro %endmacro -FMA_INSTR pmacsdd, pmulld, paddd FMA_INSTR pmacsww, pmullw, paddw +FMA_INSTR pmacsdd, pmulld, paddd ; sse4 emulation +FMA_INSTR pmacsdql, pmuldq, paddq ; sse4 emulation FMA_INSTR pmadcswd, pmaddwd, paddd ; convert FMA4 to FMA3 if possible diff -Nru x264-0.142.2389+git956c8d8/common/x86/x86util.asm x264-0.142.2431+gita5831aa/common/x86/x86util.asm --- x264-0.142.2389+git956c8d8/common/x86/x86util.asm 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/common/x86/x86util.asm 2014-07-11 01:16:23.000000000 +0000 @@ -298,11 +298,16 @@ paddd %1, %2 %endif %if mmsize >= 16 +%if cpuflag(xop) && sizeof%1 == 16 + vphadddq %1, %1 +%endif movhlps %2, %1 paddd %1, %2 %endif +%if notcpuflag(xop) || sizeof%1 != 16 PSHUFLW %2, %1, q0032 paddd %1, %2 +%endif %undef %1 %undef %2 %endmacro diff -Nru x264-0.142.2389+git956c8d8/config.guess x264-0.142.2431+gita5831aa/config.guess --- x264-0.142.2389+git956c8d8/config.guess 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/config.guess 2014-07-11 01:16:23.000000000 +0000 @@ -1,14 +1,12 @@ #! /bin/sh # Attempt to guess a canonical system name. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012 Free Software Foundation, Inc. +# Copyright 1992-2013 Free Software Foundation, Inc. -timestamp='2012-09-25' +timestamp='2013-06-10' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, but @@ -22,19 +20,17 @@ # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. - - -# Originally written by Per Bothner. Please send patches (context -# diff format) to and include a ChangeLog -# entry. +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). # -# This script attempts to guess a canonical system name similar to -# config.sub. If it succeeds, it prints the system name on stdout, and -# exits with 0. Otherwise, it exits with 1. +# Originally written by Per Bothner. # # You can get the latest version of this script from: # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD +# +# Please send patches with a ChangeLog entry to config-patches@gnu.org. + me=`echo "$0" | sed -e 's,.*/,,'` @@ -54,9 +50,7 @@ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, -2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 -Free Software Foundation, Inc. +Copyright 1992-2013 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -138,6 +132,27 @@ UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown +case "${UNAME_SYSTEM}" in +Linux|GNU|GNU/*) + # If the system lacks a compiler, then just pick glibc. + # We could probably try harder. 
+ LIBC=gnu + + eval $set_cc_for_build + cat <<-EOF > $dummy.c + #include + #if defined(__UCLIBC__) + LIBC=uclibc + #elif defined(__dietlibc__) + LIBC=dietlibc + #else + LIBC=gnu + #endif + EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + ;; +esac + # Note: order is significant - the case branches are not exclusive. case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in @@ -859,21 +874,21 @@ exit ;; *:GNU:*:*) # the GNU system - echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` exit ;; *:GNU/*:*:*) # other systems with GNU libc and userland - echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC} exit ;; i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; aarch64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; aarch64_be:Linux:*:*) UNAME_MACHINE=aarch64_be - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; alpha:Linux:*:*) case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in @@ -886,59 +901,54 @@ EV68*) UNAME_MACHINE=alphaev68 ;; esac objdump --private-headers /bin/sh | grep -q ld.so.1 - if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + if test "$?" 
= 0 ; then LIBC="gnulibc1" ; fi + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; + arc:Linux:*:* | arceb:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_EABI__ then - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} else if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ | grep -q __ARM_PCS_VFP then - echo ${UNAME_MACHINE}-unknown-linux-gnueabi + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi else - echo ${UNAME_MACHINE}-unknown-linux-gnueabihf + echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf fi fi exit ;; avr32*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; cris:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; crisv32:Linux:*:*) - echo ${UNAME_MACHINE}-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-${LIBC} exit ;; frv:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; hexagon:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; i*86:Linux:*:*) - LIBC=gnu - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + echo ${UNAME_MACHINE}-pc-linux-${LIBC} exit ;; ia64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m32r*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; m68*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build @@ -957,54 +967,63 @@ #endif EOF eval `$CC_FOR_BUILD -E $dummy.c 
2>/dev/null | grep '^CPU'` - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; } ;; + or1k:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} + exit ;; or32:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; padre:Linux:*:*) - echo sparc-unknown-linux-gnu + echo sparc-unknown-linux-${LIBC} exit ;; parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu + echo hppa64-unknown-linux-${LIBC} exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in - PA7*) echo hppa1.1-unknown-linux-gnu ;; - PA8*) echo hppa2.0-unknown-linux-gnu ;; - *) echo hppa-unknown-linux-gnu ;; + PA7*) echo hppa1.1-unknown-linux-${LIBC} ;; + PA8*) echo hppa2.0-unknown-linux-${LIBC} ;; + *) echo hppa-unknown-linux-${LIBC} ;; esac exit ;; ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu + echo powerpc64-unknown-linux-${LIBC} exit ;; ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu + echo powerpc-unknown-linux-${LIBC} + exit ;; + ppc64le:Linux:*:*) + echo powerpc64le-unknown-linux-${LIBC} + exit ;; + ppcle:Linux:*:*) + echo powerpcle-unknown-linux-${LIBC} exit ;; s390:Linux:*:* | s390x:Linux:*:*) - echo ${UNAME_MACHINE}-ibm-linux + echo ${UNAME_MACHINE}-ibm-linux-${LIBC} exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sh*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; sparc:Linux:*:* | sparc64:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; tile*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; vax:Linux:*:*) - echo ${UNAME_MACHINE}-dec-linux-gnu + echo ${UNAME_MACHINE}-dec-linux-${LIBC} exit ;; x86_64:Linux:*:*) - 
echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-${LIBC} exit ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. @@ -1237,19 +1256,21 @@ exit ;; *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown - case $UNAME_PROCESSOR in - i386) - eval $set_cc_for_build - if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then - if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ - (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ - grep IS_64BIT_ARCH >/dev/null - then - UNAME_PROCESSOR="x86_64" - fi - fi ;; - unknown) UNAME_PROCESSOR=powerpc ;; - esac + eval $set_cc_for_build + if test "$UNAME_PROCESSOR" = unknown ; then + UNAME_PROCESSOR=powerpc + fi + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + case $UNAME_PROCESSOR in + i386) UNAME_PROCESSOR=x86_64 ;; + powerpc) UNAME_PROCESSOR=powerpc64 ;; + esac + fi + fi echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} exit ;; *:procnto*:*:* | *:QNX:[0123456789]*:*) diff -Nru x264-0.142.2389+git956c8d8/config.sub x264-0.142.2431+gita5831aa/config.sub --- x264-0.142.2389+git956c8d8/config.sub 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/config.sub 2014-07-11 01:16:23.000000000 +0000 @@ -1,24 +1,18 @@ #! /bin/sh # Configuration validation subroutine script. -# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, -# 2011, 2012 Free Software Foundation, Inc. - -timestamp='2012-12-06' - -# This file is (in principle) common to ALL GNU software. -# The presence of a machine in this file suggests that SOME GNU software -# can handle that machine. It does not imply ALL GNU software can. 
-# -# This file is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation; either version 2 of the License, or +# Copyright 1992-2013 Free Software Foundation, Inc. + +timestamp='2013-08-10' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or # (at your option) any later version. # -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, see . @@ -26,11 +20,12 @@ # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a # configuration script generated by Autoconf, you may include it under -# the same distribution terms that you use for the rest of that program. +# the same distribution terms that you use for the rest of that +# program. This Exception is an additional permission under section 7 +# of the GNU General Public License, version 3 ("GPLv3"). -# Please send patches to . Submit a context -# diff and a properly formatted GNU ChangeLog entry. +# Please send patches with a ChangeLog entry to config-patches@gnu.org. # # Configuration subroutine to validate and canonicalize a configuration type. # Supply the specified configuration type as an argument. 
@@ -73,9 +68,7 @@ version="\ GNU config.sub ($timestamp) -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, -2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 -Free Software Foundation, Inc. +Copyright 1992-2013 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -259,12 +252,12 @@ | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ | am33_2.0 \ - | arc \ + | arc | arceb \ | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \ | avr | avr32 \ | be32 | be64 \ | bfin \ - | c4x | clipper \ + | c4x | c8051 | clipper \ | d10v | d30v | dlx | dsp16xx \ | epiphany \ | fido | fr30 | frv \ @@ -293,16 +286,17 @@ | mipsisa64r2 | mipsisa64r2el \ | mipsisa64sb1 | mipsisa64sb1el \ | mipsisa64sr71k | mipsisa64sr71kel \ + | mipsr5900 | mipsr5900el \ | mipstx39 | mipstx39el \ | mn10200 | mn10300 \ | moxie \ | mt \ | msp430 \ | nds32 | nds32le | nds32be \ - | nios | nios2 \ + | nios | nios2 | nios2eb | nios2el \ | ns16k | ns32k \ | open8 \ - | or32 \ + | or1k | or32 \ | pdp10 | pdp11 | pj | pjl \ | powerpc | powerpc64 | powerpc64le | powerpcle \ | pyramid \ @@ -372,13 +366,13 @@ | aarch64-* | aarch64_be-* \ | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ - | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \ | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ | avr-* | avr32-* \ | be32-* | be64-* \ | bfin-* | bs2000-* \ | c[123]* | c30-* | [cjt]90-* | c4x-* \ - | clipper-* | craynv-* | cydra-* \ + | c8051-* | clipper-* | craynv-* | cydra-* \ | d10v-* | d30v-* | dlx-* \ | elxsi-* \ | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \ @@ -410,12 +404,13 @@ | mipsisa64r2-* | 
mipsisa64r2el-* \ | mipsisa64sb1-* | mipsisa64sb1el-* \ | mipsisa64sr71k-* | mipsisa64sr71kel-* \ + | mipsr5900-* | mipsr5900el-* \ | mipstx39-* | mipstx39el-* \ | mmix-* \ | mt-* \ | msp430-* \ | nds32-* | nds32le-* | nds32be-* \ - | nios-* | nios2-* \ + | nios-* | nios2-* | nios2eb-* | nios2el-* \ | none-* | np1-* | ns16k-* | ns32k-* \ | open8-* \ | orion-* \ @@ -799,7 +794,7 @@ os=-mingw64 ;; mingw32) - basic_machine=i386-pc + basic_machine=i686-pc os=-mingw32 ;; mingw32ce) @@ -835,7 +830,7 @@ basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` ;; msys) - basic_machine=i386-pc + basic_machine=i686-pc os=-msys ;; mvs) @@ -1357,7 +1352,7 @@ -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\ | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \ - | -sym* | -kopensolaris* \ + | -sym* | -kopensolaris* | -plan9* \ | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ | -aos* | -aros* \ | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ @@ -1503,9 +1498,6 @@ -aros*) os=-aros ;; - -kaos*) - os=-kaos - ;; -zvmoe) os=-zvmoe ;; @@ -1554,6 +1546,9 @@ c4x-* | tic4x-*) os=-coff ;; + c8051-*) + os=-elf + ;; hexagon-*) os=-elf ;; @@ -1597,6 +1592,9 @@ mips*-*) os=-elf ;; + or1k-*) + os=-elf + ;; or32-*) os=-coff ;; diff -Nru x264-0.142.2389+git956c8d8/configure x264-0.142.2431+gita5831aa/configure --- x264-0.142.2389+git956c8d8/configure 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/configure 2014-07-11 01:16:23.000000000 +0000 @@ -467,7 +467,6 @@ ;; darwin*) SYS="MACOSX" - CFLAGS="$CFLAGS -falign-loops=16" libm="-lm" if [ "$pic" = "no" ]; then cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic" @@ -557,7 +556,7 @@ LDFLAGS="$LDFLAGS $libm" -aligned_stack=1 +stack_alignment=16 case $host_cpu in i*86) ARCH="X86" @@ -577,8 +576,7 @@ if [ $SYS = LINUX ]; then # < 11 is completely incapable of keeping a mod16 stack 
if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then - define BROKEN_STACK_ALIGNMENT - aligned_stack=0 + stack_alignment=4 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so. elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then CFLAGS="$CFLAGS -falign-stack=assume-16-byte" @@ -586,7 +584,7 @@ # >= 12 defaults to a mod16 stack fi # icl on windows has no mod16 stack support - [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT && aligned_stack=0 + [ $SYS = WINDOWS ] && stack_alignment=4 fi if [ "$SYS" = MACOSX ]; then ASFLAGS="$ASFLAGS -f macho -DPREFIX" @@ -681,7 +679,6 @@ ARCH="$(echo $host_cpu | tr a-z A-Z)" ;; esac -ASFLAGS="$ASFLAGS -DHAVE_ALIGNED_STACK=${aligned_stack}" if [ $SYS = WINDOWS ]; then if ! rc_check "0 RCDATA {0}" ; then @@ -713,7 +710,7 @@ fi fi -if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then +if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then pic="yes" fi @@ -733,10 +730,11 @@ echo "If you really want to compile without asm, configure with --disable-asm." exit 1 fi + ASFLAGS="$ASFLAGS -Worphan-labels" define HAVE_MMX - if cc_check '' -mpreferred-stack-boundary=5 ; then + if [ $compiler = GNU ] && cc_check '' -mpreferred-stack-boundary=5 ; then CFLAGS="$CFLAGS -mpreferred-stack-boundary=5" - define HAVE_32B_STACK_ALIGNMENT + stack_alignment=32 fi fi @@ -761,6 +759,9 @@ define ARCH_$ARCH define SYS_$SYS +define STACK_ALIGNMENT $stack_alignment +ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment" + # skip endianness check for Intel Compiler, as all supported platforms are little. 
the -ipo flag will also cause the check to fail if [ $compiler = GNU ]; then echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c @@ -806,10 +807,15 @@ fi ;; QNX) - cc_check pthread.h -lc && thread="posix" && libpthread="-lc" + cc_check pthread.h -lc "pthread_create(0,0,0,0);" && thread="posix" && libpthread="-lc" ;; *) - cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread" + if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then + thread="posix" + libpthread="-lpthread" + else + cc_check pthread.h "" "pthread_create(0,0,0,0);" && thread="posix" && libpthread="" + fi ;; esac fi @@ -998,7 +1004,6 @@ fi if [ "$strip" = "yes" ]; then - CFLAGS="$CFLAGS -s" LDFLAGS="$LDFLAGS -s" fi diff -Nru x264-0.142.2389+git956c8d8/debian/changelog x264-0.142.2431+gita5831aa/debian/changelog --- x264-0.142.2389+git956c8d8/debian/changelog 2014-05-12 23:02:24.000000000 +0000 +++ x264-0.142.2431+gita5831aa/debian/changelog 2014-07-11 01:21:53.000000000 +0000 @@ -1,3 +1,19 @@ +x264 (2:0.142.2431+gita5831aa-1) unstable; urgency=low + + * Update to new upstream snapshot + * Imported Upstream version 0.142.2431+gita5831aa + * Fixes for armel and armhf (Closes: #752168) + + -- Reinhard Tartler Thu, 10 Jul 2014 21:21:51 -0400 + +x264 (2:0.142.2412+gitd7e6896-1) unstable; urgency=medium + + * Update to new upstream snapshot + * Imported Upstream version 0.142.2412+gitd7e6896 + * Drop powerpcspe.patch which got upstreamed + + -- Rico Tzschichholz Sat, 07 Jun 2014 09:16:36 +0200 + x264 (2:0.142.2389+git956c8d8-5) unstable; urgency=medium [ Peter Michael Green ] diff -Nru x264-0.142.2389+git956c8d8/debian/patches/aarch64-pic.patch x264-0.142.2431+gita5831aa/debian/patches/aarch64-pic.patch --- x264-0.142.2389+git956c8d8/debian/patches/aarch64-pic.patch 2014-05-11 19:10:39.000000000 +0000 +++ x264-0.142.2431+gita5831aa/debian/patches/aarch64-pic.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,14 +0,0 @@ -Description: Build with 
-fPIC on aarch64 to fix link failure. -Author: William Grant - ---- a/configure -+++ b/configure -@@ -713,7 +713,7 @@ if [ $compiler != ICL ]; then - fi - fi - --if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then -+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then - pic="yes" - fi - diff -Nru x264-0.142.2389+git956c8d8/debian/patches/link_gpac_dynamically.patch x264-0.142.2431+gita5831aa/debian/patches/link_gpac_dynamically.patch --- x264-0.142.2389+git956c8d8/debian/patches/link_gpac_dynamically.patch 2014-05-12 23:00:26.000000000 +0000 +++ x264-0.142.2431+gita5831aa/debian/patches/link_gpac_dynamically.patch 2014-07-11 01:22:39.000000000 +0000 @@ -1,6 +1,6 @@ --- a/configure +++ b/configure -@@ -940,7 +940,7 @@ fi +@@ -946,7 +946,7 @@ fi if [ "$gpac" = "auto" -a "$lsmash" != "yes" ] ; then gpac="no" diff -Nru x264-0.142.2389+git956c8d8/debian/patches/powerpcspe.patch x264-0.142.2431+gita5831aa/debian/patches/powerpcspe.patch --- x264-0.142.2389+git956c8d8/debian/patches/powerpcspe.patch 2014-05-11 19:10:39.000000000 +0000 +++ x264-0.142.2431+gita5831aa/debian/patches/powerpcspe.patch 1970-01-01 00:00:00.000000000 +0000 @@ -1,20 +0,0 @@ ---- a/common/cpu.c -+++ b/common/cpu.c -@@ -338,6 +338,9 @@ uint32_t x264_cpu_detect( void ) - - uint32_t x264_cpu_detect( void ) - { -+#ifdef __NO_FPRS__ -+ return 0; -+#else - static void (*oldsig)( int ); - - oldsig = signal( SIGILL, sigill_handler ); -@@ -357,6 +360,7 @@ uint32_t x264_cpu_detect( void ) - signal( SIGILL, oldsig ); - - return X264_CPU_ALTIVEC; -+#endif - } - #endif - diff -Nru x264-0.142.2389+git956c8d8/debian/patches/series x264-0.142.2431+gita5831aa/debian/patches/series --- x264-0.142.2389+git956c8d8/debian/patches/series 2014-05-12 22:58:28.000000000 +0000 +++ 
x264-0.142.2431+gita5831aa/debian/patches/series 2014-07-11 01:22:34.000000000 +0000 @@ -1,3 +1 @@ -aarch64-pic.patch -powerpcspe.patch link_gpac_dynamically.patch diff -Nru x264-0.142.2389+git956c8d8/debian/x264-get-orig-source x264-0.142.2431+gita5831aa/debian/x264-get-orig-source --- x264-0.142.2389+git956c8d8/debian/x264-get-orig-source 2014-05-11 19:10:39.000000000 +0000 +++ x264-0.142.2431+gita5831aa/debian/x264-get-orig-source 2014-07-11 01:15:14.000000000 +0000 @@ -3,8 +3,8 @@ # Script used to generate the orig source tarball for x264. X264_GIT_URL="git://git.videolan.org/x264.git" -X264_GIT_COMMIT="956c8d8c2a3c2fb1f2f17807532321e492c75efc" -DATE_RETRIEVED="20140116" +X264_GIT_COMMIT="a5831aa256b3161f898d2577d2eb8daa838d88d2" +DATE_RETRIEVED="20140422" COMMIT_SHORT_FORM="$(echo $X264_GIT_COMMIT | \ sed -e 's/^\([[:xdigit:]]\{,7\}\).*/\1/')" diff -Nru x264-0.142.2389+git956c8d8/encoder/analyse.c x264-0.142.2431+gita5831aa/encoder/analyse.c --- x264-0.142.2389+git956c8d8/encoder/analyse.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/analyse.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/encoder/cabac.c x264-0.142.2431+gita5831aa/encoder/cabac.c --- x264-0.142.2389+git956c8d8/encoder/cabac.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/cabac.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/encoder/cavlc.c x264-0.142.2431+gita5831aa/encoder/cavlc.c --- 
x264-0.142.2389+git956c8d8/encoder/cavlc.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/cavlc.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -500,6 +500,9 @@ && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) ) { bs_write1( s, MB_INTERLACED ); +#if !RDO_SKIP_BS + h->mb.field_decoding_flag = MB_INTERLACED; +#endif } #if !RDO_SKIP_BS diff -Nru x264-0.142.2389+git956c8d8/encoder/encoder.c x264-0.142.2431+gita5831aa/encoder/encoder.c --- x264-0.142.2389+git956c8d8/encoder/encoder.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/encoder.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -2557,7 +2557,7 @@ } } -static int x264_slice_write( x264_t *h ) +static intptr_t x264_slice_write( x264_t *h ) { int i_skip; int mb_xy, i_mb_x, i_mb_y; @@ -2567,7 +2567,8 @@ * other inaccuracies. */ int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 1 + h->param.b_cabac + 5; int slice_max_size = h->param.i_slice_max_size > 0 ? 
(h->param.i_slice_max_size-overhead_guess)*8 : 0; - int back_up_bitstream = slice_max_size || (!h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH); + int back_up_bitstream_cavlc = !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH; + int back_up_bitstream = slice_max_size || back_up_bitstream_cavlc; int starting_bits = bs_pos(&h->out.bs); int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1; int b_hpel = h->fdec->b_kept_as_ref; @@ -2575,9 +2576,10 @@ int thread_last_mb = h->i_threadslice_end * h->mb.i_mb_width - 1; uint8_t *last_emu_check; #define BS_BAK_SLICE_MAX_SIZE 0 -#define BS_BAK_SLICE_MIN_MBS 1 -#define BS_BAK_ROW_VBV 2 - x264_bs_bak_t bs_bak[3]; +#define BS_BAK_CAVLC_OVERFLOW 1 +#define BS_BAK_SLICE_MIN_MBS 2 +#define BS_BAK_ROW_VBV 3 + x264_bs_bak_t bs_bak[4]; b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv; bs_realign( &h->out.bs ); @@ -2630,11 +2632,16 @@ x264_fdec_filter_row( h, i_mb_y, 0 ); } - if( !(i_mb_y & SLICE_MBAFF) && back_up_bitstream ) + if( back_up_bitstream ) { - x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 ); - if( slice_max_size && (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs ) - x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 ); + if( back_up_bitstream_cavlc ) + x264_bitstream_backup( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], i_skip, 0 ); + if( slice_max_size && !(i_mb_y & SLICE_MBAFF) ) + { + x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 ); + if( (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs ) + x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 ); + } } if( PARAM_INTERLACED ) @@ -2698,7 +2705,7 @@ h->mb.i_skip_intra = 0; h->mb.b_skip_mc = 0; h->mb.b_overflow = 0; - x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 ); + x264_bitstream_restore( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], &i_skip, 0 ); goto reencode; } } diff -Nru x264-0.142.2389+git956c8d8/encoder/macroblock.c 
x264-0.142.2431+gita5831aa/encoder/macroblock.c --- x264-0.142.2389+git956c8d8/encoder/macroblock.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/macroblock.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * Henrik Gramner * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/encoder/me.c x264-0.142.2431+gita5831aa/encoder/me.c --- x264-0.142.2389+git956c8d8/encoder/me.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/me.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/encoder/ratecontrol.c x264-0.142.2431+gita5831aa/encoder/ratecontrol.c --- x264-0.142.2389+git956c8d8/encoder/ratecontrol.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/ratecontrol.c 2014-07-11 01:16:23.000000000 +0000 @@ -6,7 +6,7 @@ * Authors: Loren Merritt * Michael Niedermayer * Gabriel Bouvigne - * Jason Garrett-Glaser + * Fiona Glaser * Måns Rullgård * * This program is free software; you can redistribute it and/or modify diff -Nru x264-0.142.2389+git956c8d8/encoder/rdo.c x264-0.142.2431+gita5831aa/encoder/rdo.c --- x264-0.142.2389+git956c8d8/encoder/rdo.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/rdo.c 2014-07-11 01:16:23.000000000 +0000 @@ -4,7 +4,7 @@ * Copyright (C) 2005-2014 x264 project * * Authors: Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff -Nru x264-0.142.2389+git956c8d8/encoder/set.c 
x264-0.142.2431+gita5831aa/encoder/set.c --- x264-0.142.2389+git956c8d8/encoder/set.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/set.c 2014-07-11 01:16:23.000000000 +0000 @@ -228,7 +228,8 @@ } /* FIXME: not sufficient for interlaced video */ - sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5; + sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5 && + sps->i_chroma_format_idc == CHROMA_420; if( sps->vui.b_chroma_loc_info_present ) { sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc; diff -Nru x264-0.142.2389+git956c8d8/encoder/slicetype.c x264-0.142.2431+gita5831aa/encoder/slicetype.c --- x264-0.142.2389+git956c8d8/encoder/slicetype.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/encoder/slicetype.c 2014-07-11 01:16:23.000000000 +0000 @@ -3,7 +3,7 @@ ***************************************************************************** * Copyright (C) 2005-2014 x264 project * - * Authors: Jason Garrett-Glaser + * Authors: Fiona Glaser * Loren Merritt * Dylan Yudaken * @@ -1022,9 +1022,12 @@ return i_score; } +/* Trade off precision in mbtree for increased range */ +#define MBTREE_PRECISION 0.5f + static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance ) { - int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 ); + int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION ); float weightdelta = 0.0; if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 ) weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]); @@ -1051,11 +1054,12 @@ int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 
64 - (dist_scale_factor>>2) : 32; int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] }; int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight}; - int *buf = h->scratch_buffer; + int16_t *buf = h->scratch_buffer; uint16_t *propagate_cost = frames[b]->i_propagate_cost; + uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b]; x264_emms(); - float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration); + float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION; /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */ if( !referenced ) @@ -1065,72 +1069,17 @@ { int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride; h->mc.mbtree_propagate_cost( buf, propagate_cost, - frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index, + frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index, frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width ); if( referenced ) propagate_cost += h->mb.i_mb_width; - for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ ) + + h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index], + bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 ); + if( b != p1 ) { - int propagate_amount = buf[h->mb.i_mb_x]; - /* Don't propagate for an intra block. */ - if( propagate_amount > 0 ) - { - /* Access width-2 bitfield. */ - int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT; - /* Follow the MVs to the previous frame(s). */ - for( int list = 0; list < 2; list++ ) - if( (lists_used >> list)&1 ) - { -#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1) - int listamount = propagate_amount; - /* Apply bipred weighting. 
*/ - if( lists_used == 3 ) - listamount = (listamount * bipred_weights[list] + 32) >> 6; - - /* Early termination for simple case of mv0. */ - if( !M32( mvs[list][mb_index] ) ) - { - CLIP_ADD( ref_costs[list][mb_index], listamount ); - continue; - } - - int x = mvs[list][mb_index][0]; - int y = mvs[list][mb_index][1]; - int mbx = (x>>5)+h->mb.i_mb_x; - int mby = (y>>5)+h->mb.i_mb_y; - int idx0 = mbx + mby * h->mb.i_mb_stride; - int idx1 = idx0 + 1; - int idx2 = idx0 + h->mb.i_mb_stride; - int idx3 = idx0 + h->mb.i_mb_stride + 1; - x &= 31; - y &= 31; - int idx0weight = (32-y)*(32-x); - int idx1weight = (32-y)*x; - int idx2weight = y*(32-x); - int idx3weight = y*x; - - /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't - * be counted. */ - if( mbx < h->mb.i_mb_width-1 && mby < h->mb.i_mb_height-1 && mbx >= 0 && mby >= 0 ) - { - CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 ); - CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 ); - CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 ); - CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 ); - } - else /* Check offsets individually */ - { - if( mbx < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx >= 0 && mby >= 0 ) - CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 ); - if( mbx+1 < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx+1 >= 0 && mby >= 0 ) - CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 ); - if( mbx < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx >= 0 && mby+1 >= 0 ) - CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 ); - if( mbx+1 < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx+1 >= 0 && mby+1 >= 0 ) - CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 ); - } - } - } + h->mc.mbtree_propagate_list( h, ref_costs[1], &mvs[1][mb_index], buf, &lowres_costs[mb_index], + bipred_weights[1], h->mb.i_mb_y, h->mb.i_mb_width, 1 ); } } 
diff -Nru x264-0.142.2389+git956c8d8/output/matroska.c x264-0.142.2431+gita5831aa/output/matroska.c --- x264-0.142.2389+git956c8d8/output/matroska.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/output/matroska.c 2014-07-11 01:16:23.000000000 +0000 @@ -33,6 +33,7 @@ int width, height, d_width, d_height; int display_size_units; + int stereo_mode; int64_t frame_duration; @@ -79,6 +80,7 @@ p_mkv->width = p_mkv->d_width = p_param->i_width; p_mkv->height = p_mkv->d_height = p_param->i_height; p_mkv->display_size_units = DS_PIXELS; + p_mkv->stereo_mode = p_param->i_frame_packing; if( p_param->vui.i_sar_width && p_param->vui.i_sar_height && p_param->vui.i_sar_width != p_param->vui.i_sar_height ) @@ -147,7 +149,7 @@ ret = mk_write_header( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC", avcC, avcC_len, p_mkv->frame_duration, 50000, p_mkv->width, p_mkv->height, - p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units ); + p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode ); if( ret < 0 ) return ret; diff -Nru x264-0.142.2389+git956c8d8/output/matroska_ebml.c x264-0.142.2431+gita5831aa/output/matroska_ebml.c --- x264-0.142.2389+git956c8d8/output/matroska_ebml.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/output/matroska_ebml.c 2014-07-11 01:16:23.000000000 +0000 @@ -317,13 +317,15 @@ return w; } +static const uint8_t mk_stereo_modes[6] = {5,9,7,1,3,13}; + int mk_write_header( mk_writer *w, const char *writing_app, const char *codec_id, const void *codec_private, unsigned codec_private_size, int64_t default_frame_duration, int64_t timescale, unsigned width, unsigned height, - unsigned d_width, unsigned d_height, int display_size_units ) + unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode ) { mk_context *c, *ti, *v; @@ -379,6 +381,8 @@ CHECK( mk_write_uint( v, 0x54b2, display_size_units ) ); CHECK( mk_write_uint( v, 0x54b0, d_width ) ); CHECK( mk_write_uint( v, 0x54ba, 
d_height ) ); + if( stereo_mode >= 0 && stereo_mode <= 5 ) + CHECK( mk_write_uint( v, 0x53b8, mk_stereo_modes[stereo_mode] ) ); CHECK( mk_close_context( v, 0 ) ); CHECK( mk_close_context( ti, 0 ) ); diff -Nru x264-0.142.2389+git956c8d8/output/matroska_ebml.h x264-0.142.2431+gita5831aa/output/matroska_ebml.h --- x264-0.142.2389+git956c8d8/output/matroska_ebml.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/output/matroska_ebml.h 2014-07-11 01:16:23.000000000 +0000 @@ -42,7 +42,7 @@ int64_t default_frame_duration, int64_t timescale, unsigned width, unsigned height, - unsigned d_width, unsigned d_height, int display_size_units ); + unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode ); int mk_start_frame( mk_writer *w ); int mk_add_frame_data( mk_writer *w, const void *data, unsigned size ); diff -Nru x264-0.142.2389+git956c8d8/tools/checkasm.c x264-0.142.2431+gita5831aa/tools/checkasm.c --- x264-0.142.2389+git956c8d8/tools/checkasm.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/tools/checkasm.c 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Loren Merritt * Laurent Aimar - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -200,7 +200,7 @@ b->cpu&X264_CPU_FAST_NEON_MRC ? 
"_fast_mrc" : #endif "", - ((int64_t)10*b->cycles/b->den - nop_time)/4 ); + (int64_t)(10*b->cycles/b->den - nop_time)/4 ); } } @@ -1451,6 +1451,37 @@ } } } + + if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb ) + { + set_func_name( "plane_copy_deinterleave_rgb" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 2) >> 2; + int h = plane_specs[i].h; + intptr_t src_stride = plane_specs[i].src_stride; + intptr_t dst_stride = ALIGN( w, 16 ); + intptr_t offv = dst_stride*h + 16; + + for( int pw = 3; pw <= 4; pw++ ) + { + memset( pbuf3, 0, 0x1000 ); + memset( pbuf4, 0, 0x1000 ); + call_c( mc_c.plane_copy_deinterleave_rgb, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf3+2*offv, dst_stride, pbuf1, src_stride, pw, w, h ); + call_a( mc_a.plane_copy_deinterleave_rgb, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf4+2*offv, dst_stride, pbuf1, src_stride, pw, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride+0*offv, pbuf4+y*dst_stride+0*offv, w ) || + memcmp( pbuf3+y*dst_stride+1*offv, pbuf4+y*dst_stride+1*offv, w ) || + memcmp( pbuf3+y*dst_stride+2*offv, pbuf4+y*dst_stride+2*offv, w ) ) + { + ok = 0; + fprintf( stderr, "plane_copy_deinterleave_rgb FAILED: w=%d h=%d stride=%d pw=%d\n", w, h, (int)src_stride, pw ); + break; + } + } + } + } report( "plane_copy :" ); if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 ) @@ -1567,16 +1598,17 @@ INTEGRAL_INIT( integral_init8v, 9, sum, stride ); report( "integral init :" ); + ok = 1; used_asm = 0; if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost ) { - ok = 1; used_asm = 1; + used_asm = 1; x264_emms(); for( int i = 0; i < 10; i++ ) { - float fps_factor = (rand()&65535) / 256.; - set_func_name( "mbtree_propagate" ); - int *dsta = (int*)buf3; - int *dstc = dsta+400; + float fps_factor = (rand()&65535) / 65535.0f; + set_func_name( "mbtree_propagate_cost" ); + int16_t *dsta = 
(int16_t*)buf3; + int16_t *dstc = dsta+400; uint16_t *prop = (uint16_t*)buf1; uint16_t *intra = (uint16_t*)buf4; uint16_t *inter = intra+128; @@ -1598,12 +1630,60 @@ { ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; if( !ok ) - fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] ); + fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] ); } } - report( "mbtree propagate :" ); } + if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list ) + { + used_asm = 1; + for( int i = 0; i < 8; i++ ) + { + set_func_name( "mbtree_propagate_list" ); + x264_t h; + int height = 4; + int width = 128; + int size = width*height; + h.mb.i_mb_stride = width; + h.mb.i_mb_width = width; + h.mb.i_mb_height = height; + + uint16_t *ref_costsc = (uint16_t*)buf3; + uint16_t *ref_costsa = (uint16_t*)buf4; + int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size); + int16_t *propagate_amount = (int16_t*)(mvs + width); + uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width); + h.scratch_buffer2 = (uint8_t*)(ref_costsa + size); + int bipred_weight = (rand()%63)+1; + int list = i&1; + for( int j = 0; j < size; j++ ) + ref_costsc[j] = ref_costsa[j] = rand()&32767; + for( int j = 0; j < width; j++ ) + { + static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}}; + for( int k = 0; k < 2; k++ ) + mvs[j][k] = (rand()&127) - 64; + propagate_amount[j] = rand()&32767; + lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT; + } + + call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + + for( int j = 0; j < size && ok; j++ ) + { + ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1; + if( !ok ) + fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, 
ref_costsc[j], ref_costsa[j] ); + } + + call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + } + } + report( "mbtree :" ); + if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned ) { set_func_name( "memcpy_aligned" ); @@ -2530,7 +2610,7 @@ { *cpu_ref = *cpu_new; *cpu_new |= flags; -#if BROKEN_STACK_ALIGNMENT +#if STACK_ALIGNMENT < 16 *cpu_new |= X264_CPU_STACK_MOD4; #endif if( *cpu_new & X264_CPU_SSE2_IS_FAST ) diff -Nru x264-0.142.2389+git956c8d8/version.sh x264-0.142.2431+gita5831aa/version.sh --- x264-0.142.2389+git956c8d8/version.sh 2014-02-13 23:26:08.000000000 +0000 +++ x264-0.142.2431+gita5831aa/version.sh 2014-07-11 01:16:23.000000000 +0000 @@ -1,5 +1,5 @@ #!/bin/sh # Script modified from upstream source for Debian packaging since packaging # won't include .git repository. -echo '#define X264_VERSION " r2389 956c8d8"' -echo '#define X264_POINTVER "0.142.2389 956c8d8"' +echo '#define X264_VERSION " r2431 a5831aa"' +echo '#define X264_POINTVER "0.142.2431 a5831aa"' diff -Nru x264-0.142.2389+git956c8d8/x264.c x264-0.142.2431+gita5831aa/x264.c --- x264-0.142.2389+git956c8d8/x264.c 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/x264.c 2014-07-11 01:16:23.000000000 +0000 @@ -6,7 +6,7 @@ * Authors: Loren Merritt * Laurent Aimar * Steven Walters - * Jason Garrett-Glaser + * Fiona Glaser * Kieran Kunhya * Henrik Gramner * diff -Nru x264-0.142.2389+git956c8d8/x264.h x264-0.142.2431+gita5831aa/x264.h --- x264-0.142.2389+git956c8d8/x264.h 2014-02-13 23:26:07.000000000 +0000 +++ x264-0.142.2431+gita5831aa/x264.h 2014-07-11 01:16:23.000000000 +0000 @@ -5,7 +5,7 @@ * * Authors: Laurent Aimar * Loren Merritt - * Jason Garrett-Glaser + * Fiona Glaser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General 
Public License as published by @@ -884,13 +884,15 @@ /* x264_encoder_headers: * return the SPS and PPS that will be used for the whole stream. * *pi_nal is the number of NAL units outputted in pp_nal. + * returns the number of bytes in the returned NALs. * returns negative on error. * the payloads of all output NALs are guaranteed to be sequential in memory. */ int x264_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal ); /* x264_encoder_encode: * encode one picture. * *pi_nal is the number of NAL units outputted in pp_nal. - * returns negative on error, zero if no NAL units returned. + * returns the number of bytes in the returned NALs. + * returns negative on error and zero if no NAL units returned. * the payloads of all output NALs are guaranteed to be sequential in memory. */ int x264_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out ); /* x264_encoder_close: