diff -Nru x264-0.142.2389+git956c8d8/AUTHORS x264-0.142.2431+gita5831aa/AUTHORS
--- x264-0.142.2389+git956c8d8/AUTHORS	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/AUTHORS	2014-07-11 01:16:23.000000000 +0000
@@ -47,8 +47,8 @@
 D: 4:2:2 chroma subsampling, x86 asm, Windows improvements, bugfixes
 S: Sweden
 
-N: Jason Garrett-Glaser
-E: darkshikari AT gmail DOT com
+N: Fiona Glaser
+E: fiona AT x264 DOT com
 D: x86 asm, 1pass VBV, adaptive quantization, inline asm
 D: various speed optimizations, bugfixes
 S: USA
diff -Nru x264-0.142.2389+git956c8d8/common/arm/asm.S x264-0.142.2431+gita5831aa/common/arm/asm.S
--- x264-0.142.2389+git956c8d8/common/arm/asm.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/asm.S	2014-07-11 01:16:23.000000000 +0000
@@ -40,32 +40,38 @@
 #   define ELF @
 #endif
 
-        .macro require8, val=1
+.macro require8, val=1
 ELF     .eabi_attribute 24, \val
-        .endm
+.endm
 
-        .macro preserve8, val=1
+.macro preserve8, val=1
 ELF     .eabi_attribute 25, \val
-        .endm
+.endm
 
-        .macro function name
-        .global EXTERN_ASM\name
+.macro function name, export=1
         .align  2
+.if \export == 1                        @ exported entry point: global symbol carrying the EXTERN_ASM prefix
+        .global EXTERN_ASM\name
+ELF     .hidden EXTERN_ASM\name
+ELF     .type   EXTERN_ASM\name, %function
+        .func   EXTERN_ASM\name
 EXTERN_ASM\name:
+.else                                   @ internal helper: unprefixed label and no .global directive
 ELF     .hidden \name
 ELF     .type   \name, %function
         .func   \name
 \name:
-        .endm
+.endif
+.endm
 
-        .macro movrel rd, val
+.macro movrel rd, val
 #if HAVE_ARMV6T2 && !defined(PIC)
         movw            \rd, #:lower16:\val
         movt            \rd, #:upper16:\val
 #else
         ldr             \rd, =\val
 #endif
-        .endm
+.endm
 
 .macro movconst rd, val
 #if HAVE_ARMV6T2
@@ -78,6 +84,10 @@
 #endif
 .endm
 
+#define GLUE(a, b) a ## b               /* paste the two tokens together as written */
+#define JOIN(a, b) GLUE(a, b)           /* extra level so that macro arguments are expanded before pasting */
+#define X(s) JOIN(EXTERN_ASM, s)        /* name of an exported asm function s, including the EXTERN_ASM prefix */
+
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
 
diff -Nru x264-0.142.2389+git956c8d8/common/arm/cpu-a.S x264-0.142.2431+gita5831aa/common/arm/cpu-a.S
--- x264-0.142.2389+git956c8d8/common/arm/cpu-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/cpu-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -38,7 +38,7 @@
 // return: 0 on success
 //         1 if counters were already enabled
 //         9 if lo-res counters were already enabled
-function x264_cpu_enable_armv7_counter
+function x264_cpu_enable_armv7_counter, export=0
     mrc         p15, 0, r2, c9, c12, 0      // read PMNC
     ands        r0, r2, #1
     andne       r0, r2, #9
@@ -51,7 +51,7 @@
     bx          lr
 .endfunc
 
-function x264_cpu_disable_armv7_counter
+function x264_cpu_disable_armv7_counter, export=0
     mrc         p15, 0, r0, c9, c12, 0      // read PMNC
     bic         r0, r0, #1                  // disable counters
     mcr         p15, 0, r0, c9, c12, 0      // write PMNC
diff -Nru x264-0.142.2389+git956c8d8/common/arm/dct-a.S x264-0.142.2431+gita5831aa/common/arm/dct-a.S
--- x264-0.142.2389+git956c8d8/common/arm/dct-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/dct-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -131,7 +131,7 @@
     bx              lr
 .endfunc
 
-function x264_sub8x4_dct_neon
+function x264_sub8x4_dct_neon, export=0
     vld1.64         {d0}, [r1,:64], r3
     vld1.64         {d1}, [r2,:64], ip
     vsubl.u8        q8,  d0,  d1
@@ -283,17 +283,17 @@
 
 function x264_sub16x16_dct8_neon
     push            {lr}
-    bl              x264_sub8x8_dct8_neon
+    bl              X(x264_sub8x8_dct8_neon)
     sub             r1,  r1,  #FENC_STRIDE*8 - 8
     sub             r2,  r2,  #FDEC_STRIDE*8 - 8
-    bl              x264_sub8x8_dct8_neon
+    bl              X(x264_sub8x8_dct8_neon)
     sub             r1,  r1,  #8
     sub             r2,  r2,  #8
-    bl              x264_sub8x8_dct8_neon
+    bl              X(x264_sub8x8_dct8_neon)
     pop             {lr}
     sub             r1,  r1,  #FENC_STRIDE*8 - 8
     sub             r2,  r2,  #FDEC_STRIDE*8 - 8
-    b               x264_sub8x8_dct8_neon
+    b               X(x264_sub8x8_dct8_neon)
 .endfunc
 
 
@@ -338,7 +338,7 @@
     bx              lr
 .endfunc
 
-function x264_add8x4_idct_neon
+function x264_add8x4_idct_neon, export=0
     vld1.64         {d0-d3}, [r1,:128]!
     IDCT_1D         d16, d18, d20, d22, d0, d1, d2, d3
     vld1.64         {d4-d7}, [r1,:128]!
@@ -502,14 +502,14 @@
 
 function x264_add16x16_idct8_neon
     mov             ip,  lr
-    bl              x264_add8x8_idct8_neon
+    bl              X(x264_add8x8_idct8_neon)
     sub             r0,  r0,  #8*FDEC_STRIDE-8
-    bl              x264_add8x8_idct8_neon
+    bl              X(x264_add8x8_idct8_neon)
     sub             r0,  r0,  #8
-    bl              x264_add8x8_idct8_neon
+    bl              X(x264_add8x8_idct8_neon)
     sub             r0,  r0,  #8*FDEC_STRIDE-8
     mov             lr,  ip
-    b               x264_add8x8_idct8_neon
+    b               X(x264_add8x8_idct8_neon)
 .endfunc
 
 
diff -Nru x264-0.142.2389+git956c8d8/common/arm/deblock-a.S x264-0.142.2431+gita5831aa/common/arm/deblock-a.S
--- x264-0.142.2389+git956c8d8/common/arm/deblock-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/deblock-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -304,3 +304,109 @@
 
     bx              lr
 .endfunc
+
+function x264_deblock_strength_neon
+    ldr             ip,  [sp]           @ first stack argument (presumably the vertical mv limit; confirm against the C prototype)
+    vmov.i8         q8,  #0             @ q8 collects the flags that end up in bs[0]
+    lsl             ip,  ip,  #8
+    add             r3,  r3,  #32       @ r3 now addresses bs[1], which is stored first
+    sub             ip,  ip,  #(1<<8)-3 @ ip = ((limit - 1) << 8) + 3
+    vmov.i8         q9,  #0             @ q9 collects the flags that end up in bs[1]
+    vdup.16         q10, ip             @ per 16-bit lane: low byte 3, high byte limit - 1 (for a small positive limit)
+    ldr             ip,  [sp, #4]       @ second stack argument, used as the pass counter (presumably a B-frame flag; confirm)
+
+lists:
+    @ load bytes ref
+    vld1.8          {d31}, [r1]!
+    add             r2,  r2,  #16       @ skip 16 bytes of mv data before the aligned loads below
+    vld1.8          {q1},  [r1]!
+    vmov.i8         q0,  #0
+    vld1.8          {q2},  [r1]!        @ r1 has advanced 40 bytes in total for this pass
+    vext.8          q3,  q0,  q1,  #15
+    vext.8          q0,  q0,  q2,  #15
+    vuzp.32         q1,  q2
+    vuzp.32         q3,  q0
+    vext.8          q1,  q15, q2,  #12
+
+    veor            q0,  q0,  q2        @ nonzero bytes mark positions where the two compared ref bytes differ
+    veor            q1,  q1,  q2
+    vorr            q8,  q8,  q0
+    vorr            q9,  q9,  q1
+
+    vld1.16         {q11}, [r2,:128]!   @ mv + 0x10
+    vld1.16         {q3},  [r2,:128]!   @ mv + 0x20
+    vld1.16         {q12}, [r2,:128]!   @ mv + 0x30
+    vld1.16         {q2},  [r2,:128]!   @ mv + 0x40
+    vld1.16         {q13}, [r2,:128]!   @ mv + 0x50
+    vext.8          q3,  q3,  q12, #12
+    vext.8          q2,  q2,  q13, #12
+    vabd.s16        q0,  q12, q3
+    vld1.16         {q3},  [r2,:128]!   @ mv + 0x60
+    vabd.s16        q1,  q13, q2
+    vld1.16         {q14}, [r2,:128]!   @ mv + 0x70
+    vqmovn.u16      d0,  q0
+    vld1.16         {q2},  [r2,:128]!   @ mv + 0x80
+    vld1.16         {q15}, [r2,:128]!   @ mv + 0x90
+    vqmovn.u16      d1,  q1
+    vext.8          q3,  q3,  q14, #12
+    vext.8          q2,  q2,  q15, #12
+    vabd.s16        q3,  q14, q3
+    vabd.s16        q2,  q15, q2
+    vqmovn.u16      d2,  q3
+    vqmovn.u16      d3,  q2
+
+    vqsub.u8        q0,  q0,  q10       @ saturating subtract: a byte stays nonzero only if the difference exceeds its limit
+    vqsub.u8        q1,  q1,  q10
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+
+    vabd.s16        q1,  q12, q13
+    vorr            q8,  q8,  q0
+
+    vabd.s16        q0,  q11, q12
+    vabd.s16        q2,  q13, q14
+    vabd.s16        q3,  q14, q15
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+    vqmovn.u16      d2,  q2
+    vqmovn.u16      d3,  q3
+
+    vqsub.u8        q0,  q0,  q10
+    vqsub.u8        q1,  q1,  q10
+    vqmovn.u16      d0,  q0
+    vqmovn.u16      d1,  q1
+    subs            ip,  ip,  #1
+    vorr            q9,  q9,  q0
+    beq             lists               @ repeat only when the counter just reached zero: two passes if it started at 1, otherwise one
+
+    mov             ip,  #-32           @ post-index step that moves r3 from bs[1] back to bs[0]
+    @ load bytes nnz
+    vld1.8          {d31}, [r0]!
+    vld1.8          {q1},  [r0]!
+    vmov.i8         q0,  #0
+    vld1.8          {q2},  [r0]
+    vext.8          q3,  q0,  q1,  #15
+    vext.8          q0,  q0,  q2,  #15
+    vuzp.32         q1,  q2
+    vuzp.32         q3,  q0
+    vext.8          q1,  q15, q2,  #12
+
+    vorr            q0,  q0,  q2
+    vorr            q1,  q1,  q2
+    vmov.u8         q10, #1
+    vmin.u8         q0,  q0,  q10       @ clamp every nnz flag byte to 0 or 1
+    vmin.u8         q1,  q1,  q10
+    vmin.u8         q8,  q8,  q10       @ mv ? 1 : 0
+    vmin.u8         q9,  q9,  q10
+    vadd.u8         q0,  q0,  q0        @ nnz ? 2 : 0
+    vadd.u8         q1,  q1,  q1
+    vmax.u8         q8,  q8,  q0        @ final strength per byte is the larger of the mv/ref and nnz values
+    vmax.u8         q9,  q9,  q1
+    vzip.16         d16, d17
+    vst1.8          {q9}, [r3,:128], ip @ bs[1]
+    vtrn.8          d16, d17
+    vtrn.32         d16, d17
+
+    vst1.8          {q8}, [r3,:128]     @ bs[0]
+    bx              lr
+.endfunc
diff -Nru x264-0.142.2389+git956c8d8/common/arm/mc-a.S x264-0.142.2431+gita5831aa/common/arm/mc-a.S
--- x264-0.142.2389+git956c8d8/common/arm/mc-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/mc-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -88,7 +88,7 @@
 .endfunc
 
 .macro MEMCPY_ALIGNED srcalign dstalign
-function memcpy_aligned_\dstalign\()_\srcalign\()_neon
+function memcpy_aligned_\dstalign\()_\srcalign\()_neon, export=0
     mov         r3, r0
 .if \srcalign == 8 && \dstalign == 8
     sub         r2, #16
@@ -181,6 +181,7 @@
 AVGH  4, 2
 AVGH  4, 4
 AVGH  4, 8
+AVGH  4, 16
 AVGH  8, 4
 AVGH  8, 8
 AVGH  8, 16
@@ -238,7 +239,7 @@
 .endm
 
 .macro AVG_WEIGHT ext
-function x264_pixel_avg_weight_w4_\ext\()_neon
+function x264_pixel_avg_weight_w4_\ext\()_neon, export=0
     load_weights_\ext
 1:  // height loop
     subs            lr,  lr,  #2
@@ -254,7 +255,7 @@
     pop             {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_weight_w8_\ext\()_neon
+function x264_pixel_avg_weight_w8_\ext\()_neon, export=0
     load_weights_\ext
 1:  // height loop
     subs            lr,  lr,  #4
@@ -278,7 +279,7 @@
     pop             {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_weight_w16_\ext\()_neon
+function x264_pixel_avg_weight_w16_\ext\()_neon, export=0
     load_weights_\ext
 1:  // height loop
     subs            lr,  lr,  #2
@@ -303,7 +304,7 @@
 AVG_WEIGHT add_sub
 AVG_WEIGHT sub_add
 
-function x264_pixel_avg_w4_neon
+function x264_pixel_avg_w4_neon, export=0
     subs        lr,  lr,  #2
     vld1.32     {d0[]}, [r2], r3
     vld1.32     {d2[]}, [r4], r5
@@ -317,7 +318,7 @@
     pop         {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_w8_neon
+function x264_pixel_avg_w8_neon, export=0
     subs        lr,  lr,  #4
     vld1.64     {d0}, [r2], r3
     vld1.64     {d2}, [r4], r5
@@ -339,7 +340,7 @@
     pop         {r4-r6,pc}
 .endfunc
 
-function x264_pixel_avg_w16_neon
+function x264_pixel_avg_w16_neon, export=0
     subs        lr,  lr,  #4
     vld1.64     {d0-d1}, [r2], r3
     vld1.64     {d2-d3}, [r4], r5
@@ -1464,3 +1465,148 @@
     vpop            {d8-d15}
     pop             {r4-r10,pc}
 .endfunc
+
+function x264_load_deinterleave_chroma_fdec_neon    // r0 = dst, r1 = src, r2 = i_src, r3 = height (prototype in mc-c.c)
+    mov             ip,  #FDEC_STRIDE/2             // the two planes of one row are written half a stride apart
+1:
+    vld2.8          {d0-d1}, [r1,:128], r2          // split 16 interleaved bytes: even bytes to d0, odd bytes to d1
+    subs            r3,  r3,  #1                    // one source row consumed
+    pld             [r1]                            // prefetch the next source row
+    vst1.8          {d0},    [r0,:64], ip           // first plane at dst
+    vst1.8          {d1},    [r0,:64], ip           // second plane half a stride later; dst ends one full stride further
+    bgt             1b                              // flags still come from the row counter
+
+    bx              lr
+.endfunc
+
+function x264_load_deinterleave_chroma_fenc_neon    // r0 = dst, r1 = src, r2 = i_src, r3 = height (prototype in mc-c.c)
+    mov             ip,  #FENC_STRIDE/2             // same routine as the fdec variant, but using the FENC stride
+1:
+    vld2.8          {d0-d1}, [r1,:128], r2          // split 16 interleaved bytes: even bytes to d0, odd bytes to d1
+    subs            r3,  r3,  #1                    // one source row consumed
+    pld             [r1]                            // prefetch the next source row
+    vst1.8          {d0},    [r0,:64], ip           // first plane at dst
+    vst1.8          {d1},    [r0,:64], ip           // second plane half a stride later; dst ends one full stride further
+    bgt             1b                              // flags still come from the row counter
+
+    bx              lr
+.endfunc
+
+function x264_plane_copy_deinterleave_neon  // r0 = dstu, r1 = i_dstu, r2 = dstv, r3 = i_dstv (prototype in mc-c.c)
+    push            {r4-r7, lr}             // 20 bytes pushed, so the stack arguments start at sp + 20
+    ldrd            r6, r7, [sp, #28]       // r6 = w, r7 = h
+    ldrd            r4, r5, [sp, #20]       // r4 = src, r5 = i_src
+    add             lr,  r6,  #15
+    bic             lr,  lr,  #15           // lr = w rounded up to a multiple of 16
+    sub             r1,  r1,  lr            // turn each stride into the gap left after one processed row
+    sub             r3,  r3,  lr
+    sub             r5,  r5,  lr, lsl #1    // the source advances two bytes per output pixel
+block:
+    vld2.8          {d0-d3}, [r4,:128]!     // 32 interleaved bytes: even bytes to q0, odd bytes to q1
+    subs            lr,  lr,  #16
+    vst1.8          {q0},    [r0]!
+    vst1.8          {q1},    [r2]!
+    bgt             block                   // more 16-pixel groups left in this row
+
+    add             r4,  r4,  r5            // step every pointer to the start of the next row
+    subs            r7,  r7,  #1
+    add             r0,  r0,  r1
+    add             r2,  r2,  r3
+    mov             lr,  r6                 // reload w; the inner loop still runs until the count drops to zero or below
+    bgt             block                   // flags still come from the row counter
+
+    pop             {r4-r7, pc}
+.endfunc
+
+function x264_plane_copy_deinterleave_rgb_neon  // r0 = dsta, r1 = i_dsta, r2 = dstb, r3 = i_dstb (prototype in mc-c.c)
+    push            {r4-r8, r10, r11, lr}   // 32 bytes pushed, so the stack arguments start at sp + 32
+    ldrd            r4,  r5,  [sp, #32]     // r4 = dstc, r5 = i_dstc
+    ldrd            r6,  r7,  [sp, #40]     // r6 = src, r7 = i_src
+    ldr             r8,  [sp, #48]          // r8 = pw, the number of source bytes per pixel
+    ldrd            r10, r11, [sp, #52]     // r10 = w, r11 = h
+    add             lr,  r10, #7
+    subs            r8,  r8,  #3            // Z set when pw is 3; the flags survive until the conditional code below
+    bic             lr,  lr,  #7            // lr = w rounded up to a multiple of 8
+    sub             r7,  r7,  lr, lsl #1    // first part of the source gap: two bytes per pixel
+    sub             r1,  r1,  lr            // turn each destination stride into the gap left after one row
+    sub             r3,  r3,  lr
+    sub             r5,  r5,  lr
+    subne           r7,  r7,  lr, lsl #1    // pw other than 3: four source bytes per pixel in total
+    subeq           r7,  r7,  lr            // pw equal to 3: three source bytes per pixel in total
+    bne             block4
+block3:
+    vld3.8          {d0,d1,d2}, [r6]!       // 8 pixels of 3 bytes, one component per d register
+    subs            lr,  lr,  #8
+    vst1.8          {d0},    [r0]!
+    vst1.8          {d1},    [r2]!
+    vst1.8          {d2},    [r4]!
+    bgt             block3                  // more 8-pixel groups left in this row
+
+    subs            r11, r11, #1
+    add             r0,  r0,  r1            // step every pointer to the start of the next row
+    add             r2,  r2,  r3
+    add             r4,  r4,  r5
+    add             r6,  r6,  r7
+    mov             lr,  r10                // reload w for the next row
+    bgt             block3                  // flags still come from the row counter
+
+    pop             {r4-r8, r10, r11, pc}
+block4:
+    vld4.8          {d0,d1,d2,d3}, [r6]!    // 8 pixels of 4 bytes; the fourth component in d3 is not stored
+    subs            lr,  lr,  #8
+    vst1.8          {d0},    [r0]!
+    vst1.8          {d1},    [r2]!
+    vst1.8          {d2},    [r4]!
+    bgt             block4                  // more 8-pixel groups left in this row
+
+    subs            r11, r11, #1
+    add             r0,  r0,  r1            // step every pointer to the start of the next row
+    add             r2,  r2,  r3
+    add             r4,  r4,  r5
+    add             r6,  r6,  r7
+    mov             lr,  r10                // reload w for the next row
+    bgt             block4                  // flags still come from the row counter
+
+    pop             {r4-r8, r10, r11, pc}
+.endfunc
+
+function x264_plane_copy_interleave_neon    // r0 = dst, r1 = i_dst, r2 = srcu, r3 = i_srcu (prototype in mc-c.c)
+    push            {r4-r7, lr}             // 20 bytes pushed, so the stack arguments start at sp + 20
+    ldrd            r6, r7, [sp, #28]       // r6 = w, r7 = h
+    ldrd            r4, r5, [sp, #20]       // r4 = srcv, r5 = i_srcv
+    add             lr,  r6,  #15
+    bic             lr,  lr,  #15           // lr = w rounded up to a multiple of 16
+    sub             r1,  r1,  lr, lsl #1    // the destination advances two bytes per source pixel
+    sub             r3,  r3,  lr            // turn each source stride into the gap left after one row
+    sub             r5,  r5,  lr
+blocki:
+    vld1.8          {q0}, [r2]!             // 16 bytes from the first source plane
+    vld1.8          {q1}, [r4]!             // 16 bytes from the second source plane
+    subs            lr,  lr,  #16
+    vst2.8          {d0,d2}, [r0]!          // interleave the low halves: 16 output bytes
+    vst2.8          {d1,d3}, [r0]!          // interleave the high halves: 16 more output bytes
+    bgt             blocki                  // more 16-pixel groups left in this row
+
+    subs            r7,  r7,  #1
+    add             r0,  r0,  r1            // step every pointer to the start of the next row
+    add             r2,  r2,  r3
+    add             r4,  r4,  r5
+    mov             lr,  r6                 // reload w for the next row
+    bgt             blocki                  // flags still come from the row counter
+
+    pop             {r4-r7, pc}
+.endfunc
+
+function x264_store_interleave_chroma_neon  // r0 = dst, r1 = i_dst, r2 = srcu, r3 = srcv (prototype in mc-c.c)
+    push            {lr}                    // lr is borrowed as the row counter
+    ldr             lr,  [sp, #4]           // height, the only stack argument, now sits one word above sp
+    mov             ip,  #FDEC_STRIDE       // both source planes are stepped by this stride
+1:
+    vld1.8          {d0}, [r2], ip          // 8 bytes from the first source plane
+    vld1.8          {d1}, [r3], ip          // 8 bytes from the second source plane
+    subs            lr,  lr,  #1
+    vst2.8          {d0,d1}, [r0,:128], r1  // write 16 interleaved bytes, then advance dst by i_dst
+    bgt             1b                      // flags still come from the row counter
+
+    pop             {pc}
+.endfunc
diff -Nru x264-0.142.2389+git956c8d8/common/arm/mc-c.c x264-0.142.2431+gita5831aa/common/arm/mc-c.c
--- x264-0.142.2389+git956c8d8/common/arm/mc-c.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/mc-c.c	2014-07-11 01:16:23.000000000 +0000
@@ -37,6 +37,7 @@
 void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_8x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_4x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_4x4_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_4x2_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
@@ -46,13 +47,28 @@
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
+void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
+                                         pixel *dstv, intptr_t i_dstv,
+                                         pixel *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+                                            pixel *dstb, intptr_t i_dstb,
+                                            pixel *dstc, intptr_t i_dstc,
+                                            pixel *src,  intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst,  intptr_t i_dst,
+                                      pixel *srcu, intptr_t i_srcu,
+                                      pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
 #define MC_WEIGHT(func)\
 void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 \
-static void (* const x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+static weight_fn_t x264_mc##func##_wtab_neon[6] =\
 {\
     x264_mc_weight_w4##func##_neon,\
     x264_mc_weight_w4##func##_neon,\
@@ -72,7 +88,7 @@
 void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_mc_copy_w16_aligned_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 
-void x264_mc_chroma_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
 
 void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
@@ -224,11 +240,20 @@
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
     pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
 
+    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+    pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+
+    pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
+    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
     pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
     pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
     pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
     pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
     pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
     pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
diff -Nru x264-0.142.2389+git956c8d8/common/arm/pixel-a.S x264-0.142.2431+gita5831aa/common/arm/pixel-a.S
--- x264-0.142.2389+git956c8d8/common/arm/pixel-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/pixel-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -519,6 +519,38 @@
     b               x264_var_end
 .endfunc
 
+function x264_pixel_var_8x16_neon                      // r0 = pixels, r1 = stride (prototype in pixel.h)
+    vld1.64         {d16}, [r0,:64], r1                // row 0
+    vld1.64         {d18}, [r0,:64], r1                // row 1
+    vmull.u8        q1,  d16, d16                      // q1 = squares of row 0
+    vmovl.u8        q0,  d16                           // q0 = 16-bit pixel sum, seeded with row 0
+    vld1.64         {d20}, [r0,:64], r1
+    vmull.u8        q2,  d18, d18                      // q2 = squares of row 1
+    vaddw.u8        q0,  q0,  d18
+
+    mov             ip,  #12                           // loop counter, reduced by 4 per iteration (three iterations)
+
+    vld1.64         {d22}, [r0,:64], r1
+    VAR_SQR_SUM     q1,  q1,   q14,  d20, vpaddl.u16
+    vld1.64         {d16}, [r0,:64], r1                // five rows have been loaded before the loop starts
+    VAR_SQR_SUM     q2,  q2,   q15,  d22, vpaddl.u16
+
+1:  subs            ip,  ip,  #4
+    vld1.64         {d18}, [r0,:64], r1                // loads run ahead of the VAR_SQR_SUM that consumes them
+    VAR_SQR_SUM     q1,  q14,  q12, d16
+    vld1.64         {d20}, [r0,:64], r1
+    VAR_SQR_SUM     q2,  q15,  q13, d18
+    vld1.64         {d22}, [r0,:64], r1
+    VAR_SQR_SUM     q1,  q12,  q14, d20
+    beq             2f                                 // final iteration: all 16 rows are loaded, skip the extra load
+    vld1.64         {d16}, [r0,:64], r1
+    VAR_SQR_SUM     q2,  q13,  q15, d22
+    b               1b
+2:
+    VAR_SQR_SUM     q2,  q13,  q15, d22                // process the last loaded row
+    b               x264_var_end                       // branch without link: x264_var_end continues from here (its return path is outside this hunk)
+.endfunc
+
 function x264_pixel_var_16x16_neon
     vld1.64         {d16-d17}, [r0,:128], r1
     vmull.u8        q12, d16, d16
@@ -543,7 +575,7 @@
     bgt             var16_loop
 .endfunc
 
-function x264_var_end
+function x264_var_end, export=0
     vpaddl.u16      q8,  q14
     vpaddl.u16      q9,  q15
     vadd.u32        q1,  q1,  q8
@@ -603,6 +635,49 @@
     bx              lr
 .endfunc
 
+function x264_pixel_var2_8x16_neon                 // r0, r1 = first block and stride; r2, r3 = second block and stride; int pointer on the stack (prototype in pixel.h)
+    vld1.64         {d16}, [r0,:64], r1
+    vld1.64         {d17}, [r2,:64], r3
+    vld1.64         {d18}, [r0,:64], r1
+    vld1.64         {d19}, [r2,:64], r3
+    vsubl.u8        q10, d16, d17                  // q10 = row 0 differences, widened to 16 bits
+    vsubl.u8        q11, d18, d19                  // q11 = row 1 differences
+    SQR_ACC         q1,  d20, d21,  vmull.s16
+    vld1.64         {d16}, [r0,:64], r1
+    vadd.s16        q0,  q10, q11                  // q0 = running sum of differences
+    vld1.64         {d17}, [r2,:64], r3
+    SQR_ACC         q2,  d22, d23,  vmull.s16
+    mov             ip,  #14                       // loop counter, reduced by 2 per iteration (seven iterations)
+1:  subs            ip,  ip,  #2
+    vld1.64         {d18}, [r0,:64], r1
+    vsubl.u8        q10, d16, d17
+    vld1.64         {d19}, [r2,:64], r3
+    vadd.s16        q0,  q0,  q10
+    SQR_ACC         q1,  d20, d21
+    vsubl.u8        q11, d18, d19
+    beq             2f                             // final iteration: all 16 row pairs are loaded, skip the extra loads
+    vld1.64         {d16}, [r0,:64], r1
+    vadd.s16        q0,  q0,  q11
+    vld1.64         {d17}, [r2,:64], r3
+    SQR_ACC         q2,  d22, d23
+    b               1b
+2:
+    vadd.s16        q0,  q0,  q11                  // fold in the last row of differences
+    SQR_ACC         q2,  d22, d23
+
+    ldr             ip,  [sp]                      // fifth argument: the int pointer
+    vadd.s16        d0,  d0,  d1
+    vadd.s32        q1,  q1,  q2                   // merge the two accumulators fed by SQR_ACC
+    vpaddl.s16      d0,  d0
+    vadd.s32        d1,  d2,  d3
+    vpadd.s32       d0,  d0,  d1                   // d0[0] = total of differences, d0[1] = total from the SQR_ACC accumulators
+
+    vmov            r0,  r1,  d0                   // r0 = d0[0], r1 = d0[1]
+    vst1.32         {d0[1]}, [ip,:32]              // store d0[1] through the pointer argument
+    mul             r0,  r0,  r0
+    sub             r0,  r1,  r0,  lsr #7          // return d0[1] minus (sum squared, shifted right by 7); 8x16 is 128 pixels
+    bx              lr
+.endfunc
 
 .macro LOAD_DIFF_8x4 q0 q1 q2 q3
     vld1.32     {d1}, [r2], r3
@@ -685,7 +760,7 @@
     SUMSUB_AB   q10, q11, q2,  q3
 .endfunc
 
-function x264_satd_4x8_8x4_end_neon
+function x264_satd_4x8_8x4_end_neon, export=0
     vadd.s16    q0,  q8,  q10
     vadd.s16    q1,  q9,  q11
     vsub.s16    q2,  q8,  q10
@@ -748,7 +823,7 @@
     bx          lr
 .endfunc
 
-function x264_satd_8x8_neon
+function x264_satd_8x8_neon, export=0
     LOAD_DIFF_8x4 q8,  q9,  q10, q11
     vld1.64     {d7}, [r2], r3
     SUMSUB_AB   q0,  q1,  q8,  q9
@@ -769,7 +844,7 @@
 .endfunc
 
 // one vertical hadamard pass and two horizontal
-function x264_satd_8x4v_8x8h_neon
+function x264_satd_8x4v_8x8h_neon, export=0
     SUMSUB_ABCD q0, q1, q2, q3, q12, q13, q14, q15
     vtrn.16     q8,  q9
     SUMSUB_AB   q12, q14, q0,  q2
@@ -853,7 +928,7 @@
     bx          lr
 .endfunc
 
-function x264_satd_16x4_neon
+function x264_satd_16x4_neon, export=0
     vld1.64     {d2-d3}, [r2], r3
     vld1.64     {d0-d1}, [r0,:128], r1
     vsubl.u8    q8,  d0,  d2
@@ -927,7 +1002,7 @@
     SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
 .endm
 
-function x264_sa8d_8x8_neon
+function x264_sa8d_8x8_neon, export=0
     LOAD_DIFF_8x4   q8,  q9,  q10, q11
     vld1.64         {d7}, [r2], r3
     SUMSUB_AB       q0,  q1,  q8,  q9
@@ -1028,7 +1103,7 @@
 HADAMARD_AC 16, 16
 
 // q4: satd  q5: sa8d  q6: mask_ac4  q7: mask_ac8
-function x264_hadamard_ac_8x8_neon
+function x264_hadamard_ac_8x8_neon, export=0
     vld1.64         {d2},  [r0,:64], r1
     vld1.64         {d3},  [r0,:64], r1
     vaddl.u8        q0,  d2,  d3
diff -Nru x264-0.142.2389+git956c8d8/common/arm/pixel.h x264-0.142.2431+gita5831aa/common/arm/pixel.h
--- x264-0.142.2389+git956c8d8/common/arm/pixel.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/pixel.h	2014-07-11 01:16:23.000000000 +0000
@@ -56,8 +56,10 @@
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
 
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff -Nru x264-0.142.2389+git956c8d8/common/arm/predict-a.S x264-0.142.2431+gita5831aa/common/arm/predict-a.S
--- x264-0.142.2389+git956c8d8/common/arm/predict-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/predict-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -79,6 +79,15 @@
     bx      lr
 .endfunc
 
+function x264_predict_4x4_v_armv6
+    ldr     r1,  [r0, #0 - 1 * FDEC_STRIDE]    @ r0 = src; fetch the four bytes of the row directly above the block
+    str     r1,  [r0, #0 + 0 * FDEC_STRIDE]    @ copy that word into each of the four block rows
+    str     r1,  [r0, #0 + 1 * FDEC_STRIDE]
+    str     r1,  [r0, #0 + 2 * FDEC_STRIDE]
+    str     r1,  [r0, #0 + 3 * FDEC_STRIDE]
+    bx      lr
+.endfunc
+
 function x264_predict_4x4_dc_armv6
     mov     ip, #0
     ldr     r1, [r0, #-FDEC_STRIDE]
diff -Nru x264-0.142.2389+git956c8d8/common/arm/predict-c.c x264-0.142.2431+gita5831aa/common/arm/predict-c.c
--- x264-0.142.2389+git956c8d8/common/arm/predict-c.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/predict-c.c	2014-07-11 01:16:23.000000000 +0000
@@ -27,36 +27,6 @@
 #include "predict.h"
 #include "pixel.h"
 
-void x264_predict_4x4_dc_armv6( uint8_t *src );
-void x264_predict_4x4_dc_top_neon( uint8_t *src );
-void x264_predict_4x4_h_armv6( uint8_t *src );
-void x264_predict_4x4_ddr_armv6( uint8_t *src );
-void x264_predict_4x4_ddl_neon( uint8_t *src );
-
-void x264_predict_8x8c_dc_neon( uint8_t *src );
-void x264_predict_8x8c_dc_top_neon( uint8_t *src );
-void x264_predict_8x8c_dc_left_neon( uint8_t *src );
-void x264_predict_8x8c_h_neon( uint8_t *src );
-void x264_predict_8x8c_v_neon( uint8_t *src );
-void x264_predict_8x8c_p_neon( uint8_t *src );
-
-void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
-void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
-
-void x264_predict_16x16_dc_neon( uint8_t *src );
-void x264_predict_16x16_dc_top_neon( uint8_t *src );
-void x264_predict_16x16_dc_left_neon( uint8_t *src );
-void x264_predict_16x16_h_neon( uint8_t *src );
-void x264_predict_16x16_v_neon( uint8_t *src );
-void x264_predict_16x16_p_neon( uint8_t *src );
-
 void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
 {
     if (!(cpu&X264_CPU_ARMV6))
@@ -64,6 +34,7 @@
 
 #if !HIGH_BIT_DEPTH
     pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
+    pf[I_PRED_4x4_V]   = x264_predict_4x4_v_armv6;
     pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
     pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
 
diff -Nru x264-0.142.2389+git956c8d8/common/arm/predict.h x264-0.142.2431+gita5831aa/common/arm/predict.h
--- x264-0.142.2389+git956c8d8/common/arm/predict.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/predict.h	2014-07-11 01:16:23.000000000 +0000
@@ -26,6 +26,37 @@
 #ifndef X264_ARM_PREDICT_H
 #define X264_ARM_PREDICT_H
 
+void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_v_armv6( uint8_t *src );
+void x264_predict_4x4_h_armv6( uint8_t *src );
+void x264_predict_4x4_ddr_armv6( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_neon( uint8_t *src );
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );
+
 void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
 void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
 void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
diff -Nru x264-0.142.2389+git956c8d8/common/arm/quant-a.S x264-0.142.2431+gita5831aa/common/arm/quant-a.S
--- x264-0.142.2389+git956c8d8/common/arm/quant-a.S	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/quant-a.S	2014-07-11 01:16:23.000000000 +0000
@@ -321,6 +321,20 @@
     bx          lr
 .endfunc
 
+function x264_coeff_last8_arm
+    ldrd        r2,  r3,  [r0, #8]     @ r0 = pointer to eight 16-bit coefficients; r2:r3 = coefficients 4..7
+    orrs        ip,  r2,  r3           @ is anything nonzero in the upper half
+    movne       r0,  #4                @ yes: the result lies in 4..7
+    ldrdeq      r2,  r3,  [r0]         @ no: examine coefficients 0..3 instead (r0 is still the pointer here)
+    moveq       r0,  #0
+    tst         r3,  r3                @ is the upper pair of the chosen group nonzero
+    addne       r0,  #2
+    movne       r2,  r3                @ keep only the pair that holds the last nonzero coefficient
+    lsrs        r2,  r2,  #16          @ nonzero only if the odd coefficient of that pair is set (little-endian halfword order assumed)
+    addne       r0,  r0,  #1           @ r0 = index of the last nonzero coefficient, or 0 when all eight are zero
+    bx          lr
+.endfunc
+
 .macro COEFF_LAST_1x size
 function x264_coeff_last\size\()_neon
 .if \size == 15
diff -Nru x264-0.142.2389+git956c8d8/common/arm/quant.h x264-0.142.2431+gita5831aa/common/arm/quant.h
--- x264-0.142.2389+git956c8d8/common/arm/quant.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/arm/quant.h	2014-07-11 01:16:23.000000000 +0000
@@ -39,6 +39,7 @@
 void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 
 int x264_coeff_last4_arm( int16_t * );
+int x264_coeff_last8_arm( int16_t * );
 int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
diff -Nru x264-0.142.2389+git956c8d8/common/bitstream.c x264-0.142.2431+gita5831aa/common/bitstream.c
--- x264-0.142.2389+git956c8d8/common/bitstream.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/bitstream.c	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2003-2014 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/bitstream.h x264-0.142.2431+gita5831aa/common/bitstream.h
--- x264-0.142.2389+git956c8d8/common/bitstream.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/bitstream.h	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2003-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/cabac.c x264-0.142.2431+gita5831aa/common/cabac.c
--- x264-0.142.2389+git956c8d8/common/cabac.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/cabac.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/common.h x264-0.142.2431+gita5831aa/common/common.h
--- x264-0.142.2389+git956c8d8/common/common.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/common.h	2014-07-11 01:16:23.000000000 +0000
@@ -552,15 +552,15 @@
     int             (*dequant4_mf[4])[16];   /* [4][6][16] */
     int             (*dequant8_mf[4])[64];   /* [4][6][64] */
     /* quantization matrix for trellis, [cqm][qp][coef] */
-    int             (*unquant4_mf[4])[16];   /* [4][QP_MAX_SPEC][16] */
-    int             (*unquant8_mf[4])[64];   /* [4][QP_MAX_SPEC][64] */
+    int             (*unquant4_mf[4])[16];   /* [4][QP_MAX_SPEC+1][16] */
+    int             (*unquant8_mf[4])[64];   /* [4][QP_MAX_SPEC+1][64] */
     /* quantization matrix for deadzone */
-    udctcoef        (*quant4_mf[4])[16];     /* [4][QP_MAX_SPEC][16] */
-    udctcoef        (*quant8_mf[4])[64];     /* [4][QP_MAX_SPEC][64] */
-    udctcoef        (*quant4_bias[4])[16];   /* [4][QP_MAX_SPEC][16] */
-    udctcoef        (*quant8_bias[4])[64];   /* [4][QP_MAX_SPEC][64] */
-    udctcoef        (*quant4_bias0[4])[16];  /* [4][QP_MAX_SPEC][16] */
-    udctcoef        (*quant8_bias0[4])[64];  /* [4][QP_MAX_SPEC][64] */
+    udctcoef        (*quant4_mf[4])[16];     /* [4][QP_MAX_SPEC+1][16] */
+    udctcoef        (*quant8_mf[4])[64];     /* [4][QP_MAX_SPEC+1][64] */
+    udctcoef        (*quant4_bias[4])[16];   /* [4][QP_MAX_SPEC+1][16] */
+    udctcoef        (*quant8_bias[4])[64];   /* [4][QP_MAX_SPEC+1][64] */
+    udctcoef        (*quant4_bias0[4])[16];  /* [4][QP_MAX_SPEC+1][16] */
+    udctcoef        (*quant8_bias0[4])[64];  /* [4][QP_MAX_SPEC+1][64] */
     udctcoef        (*nr_offset_emergency)[4][64];
 
     /* mv/ref cost arrays. */
diff -Nru x264-0.142.2389+git956c8d8/common/cpu.c x264-0.142.2431+gita5831aa/common/cpu.c
--- x264-0.142.2389+git956c8d8/common/cpu.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/cpu.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -304,7 +304,7 @@
             x264_log( NULL, X264_LOG_WARNING, "unable to determine cacheline size\n" );
     }
 
-#if BROKEN_STACK_ALIGNMENT
+#if STACK_ALIGNMENT < 16
     cpu |= X264_CPU_STACK_MOD4;
 #endif
 
@@ -338,6 +338,9 @@
 
 uint32_t x264_cpu_detect( void )
 {
+#ifdef __NO_FPRS__
+    return 0;
+#else
     static void (*oldsig)( int );
 
     oldsig = signal( SIGILL, sigill_handler );
@@ -357,6 +360,7 @@
     signal( SIGILL, oldsig );
 
     return X264_CPU_ALTIVEC;
+#endif
 }
 #endif
 
@@ -426,6 +430,10 @@
     return sysconf( _SC_NPROCESSORS_ONLN );
 
 #elif SYS_LINUX
+#ifdef __ANDROID__
+    // Android NDK does not expose sched_getaffinity
+    return sysconf( _SC_NPROCESSORS_CONF );
+#else
     cpu_set_t p_aff;
     memset( &p_aff, 0, sizeof(p_aff) );
     if( sched_getaffinity( 0, sizeof(p_aff), &p_aff ) )
@@ -438,6 +446,7 @@
         np += (((uint8_t *)&p_aff)[bit / 8] >> (bit % 8)) & 1;
     return np;
 #endif
+#endif
 
 #elif SYS_BEOS
     system_info info;
diff -Nru x264-0.142.2389+git956c8d8/common/cpu.h x264-0.142.2431+gita5831aa/common/cpu.h
--- x264-0.142.2389+git956c8d8/common/cpu.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/cpu.h	2014-07-11 01:16:23.000000000 +0000
@@ -57,8 +57,8 @@
  * alignment between functions (osdep.h handles manual alignment of arrays
  * if it doesn't).
  */
-#if (ARCH_X86 || HAVE_32B_STACK_ALIGNMENT) && HAVE_MMX
-int x264_stack_align( void (*func)(), ... );
+#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
+intptr_t x264_stack_align( void (*func)(), ... );
 #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
 #else
 #define x264_stack_align(func,...) func(__VA_ARGS__)
diff -Nru x264-0.142.2389+git956c8d8/common/deblock.c x264-0.142.2431+gita5831aa/common/deblock.c
--- x264-0.142.2389+git956c8d8/common/deblock.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/deblock.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
@@ -734,6 +734,9 @@
 void x264_deblock_h_luma_neon  ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                 int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                 int mvy_limit, int bframe );
 #endif
 
 void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
@@ -842,6 +845,7 @@
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+        pf->deblock_strength     = x264_deblock_strength_neon;
    }
 #endif
 #endif // !HIGH_BIT_DEPTH
diff -Nru x264-0.142.2389+git956c8d8/common/frame.c x264-0.142.2431+gita5831aa/common/frame.c
--- x264-0.142.2389+git956c8d8/common/frame.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/frame.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/frame.h x264-0.142.2431+gita5831aa/common/frame.h
--- x264-0.142.2389+git956c8d8/common/frame.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/frame.h	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/macroblock.c x264-0.142.2431+gita5831aa/common/macroblock.c
--- x264-0.142.2389+git956c8d8/common/macroblock.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/macroblock.c	2014-07-11 01:16:23.000000000 +0000
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2003-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Henrik Gramner <henrik@gramner.com>
@@ -389,7 +389,7 @@
             ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
         scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
     }
-    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int);
+    int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
     scratch_size = X264_MAX( scratch_size, buf_mbtree );
     if( scratch_size )
         CHECKED_MALLOC( h->scratch_buffer, scratch_size );
@@ -397,7 +397,9 @@
         h->scratch_buffer = NULL;
 
     int buf_lookahead_threads = (h->mb.i_mb_height + (4 + 32) * h->param.i_lookahead_threads) * sizeof(int) * 2;
-    CHECKED_MALLOC( h->scratch_buffer2, buf_lookahead_threads );
+    int buf_mbtree2 = buf_mbtree * 12; /* size of the internal propagate_list asm buffer */
+    scratch_size = X264_MAX( buf_lookahead_threads, buf_mbtree2 );
+    CHECKED_MALLOC( h->scratch_buffer2, scratch_size );
 
     return 0;
 fail:
@@ -1253,8 +1255,13 @@
         }
     }
 
-    if( b_mbaff && mb_x == 0 && !(mb_y&1) && mb_y > 0 )
-        h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_xy - h->mb.i_mb_stride];
+    if( b_mbaff && mb_x == 0 && !(mb_y&1) )
+    {
+        if( h->mb.i_mb_top_xy >= h->sh.i_first_mb )
+            h->mb.field_decoding_flag = h->mb.field[h->mb.i_mb_top_xy];
+        else
+            h->mb.field_decoding_flag = 0;
+    }
 
     /* Check whether skip here would cause decoder to predict interlace mode incorrectly.
      * FIXME: It might be better to change the interlace type rather than forcing a skip to be non-skip. */
@@ -1262,26 +1269,8 @@
     if( b_mbaff )
     {
         if( MB_INTERLACED != h->mb.field_decoding_flag &&
-            h->mb.i_mb_prev_xy >= 0 && IS_SKIP(h->mb.type[h->mb.i_mb_prev_xy]) )
+            (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
             h->mb.b_allow_skip = 0;
-        if( (mb_y&1) && IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride]) )
-        {
-            if( h->mb.i_neighbour & MB_LEFT )
-            {
-                if( h->mb.field[h->mb.i_mb_xy - 1] != MB_INTERLACED )
-                    h->mb.b_allow_skip = 0;
-            }
-            else if( h->mb.i_neighbour & MB_TOP )
-            {
-                if( h->mb.field[h->mb.i_mb_top_xy] != MB_INTERLACED )
-                    h->mb.b_allow_skip = 0;
-            }
-            else // Frame mb pair is predicted
-            {
-                if( MB_INTERLACED )
-                    h->mb.b_allow_skip = 0;
-            }
-        }
     }
 
     if( h->param.b_cabac )
diff -Nru x264-0.142.2389+git956c8d8/common/macroblock.h x264-0.142.2431+gita5831aa/common/macroblock.h
--- x264-0.142.2389+git956c8d8/common/macroblock.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/macroblock.h	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/mc.c x264-0.142.2431+gita5831aa/common/mc.c
--- x264-0.142.2389+git956c8d8/common/mc.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/mc.c	2014-07-11 01:16:23.000000000 +0000
@@ -483,20 +483,97 @@
 
 /* Estimate the total amount of influence on future quality that could be had if we
  * were to improve the reference samples used to inter predict any given macroblock. */
-static void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+static void mbtree_propagate_cost( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 {
-    float fps = *fps_factor / 256.f;
+    float fps = *fps_factor;
     for( int i = 0; i < len; i++ )
     {
-        float intra_cost       = intra_costs[i] * inv_qscales[i];
-        float propagate_amount = propagate_in[i] + intra_cost*fps;
-        float propagate_num    = intra_costs[i] - (inter_costs[i] & LOWRES_COST_MASK);
-        float propagate_denom  = intra_costs[i];
-        dst[i] = (int)(propagate_amount * propagate_num / propagate_denom + 0.5f);
+        int intra_cost = intra_costs[i];
+        int inter_cost = X264_MIN(intra_costs[i], inter_costs[i] & LOWRES_COST_MASK); /* capped at intra_cost so the numerator below is never negative */
+        float propagate_intra  = intra_cost * inv_qscales[i];
+        float propagate_amount = propagate_in[i] + propagate_intra*fps;
+        float propagate_num    = intra_cost - inter_cost;
+        float propagate_denom  = intra_cost; /* NOTE(review): no guard against intra_cost == 0 here - confirm callers never pass 0 */
+        dst[i] = X264_MIN((int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767); /* round, then saturate to the int16_t maximum */
     }
 }
 
+static void mbtree_propagate_list( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+                                   int16_t *propagate_amount, uint16_t *lowres_costs,
+                                   int bipred_weight, int mb_y, int len, int list )
+{   /* Adds propagate_amount[i] for each of the len MBs of row mb_y into ref_costs[], at the MB(s) mvs[i] points to. */
+    unsigned stride = h->mb.i_mb_stride;
+    unsigned width = h->mb.i_mb_width;
+    unsigned height = h->mb.i_mb_height;
+
+    for( unsigned i = 0; i < len; i++ )
+    {
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+        int lists_used = lowres_costs[i]>>LOWRES_COST_SHIFT; /* bitmask: bit n set when list n is used by this MB */
+
+        if( !(lists_used & (1 << list)) )
+            continue;
+
+        int listamount = propagate_amount[i];
+        /* Apply bipred weighting. */
+        if( lists_used == 3 )
+            listamount = (listamount * bipred_weight + 32) >> 6; /* weight is in 1/64 units; +32 rounds */
+
+        /* Early termination for simple case of mv0. */
+        if( !M32( mvs[i] ) )
+        {
+            CLIP_ADD( ref_costs[mb_y*stride + i], listamount );
+            continue;
+        }
+
+        int x = mvs[i][0];
+        int y = mvs[i][1];
+        unsigned mbx = (x>>5)+i; /* whole-MB part of the mv (32 mv units per MB); a negative result wraps to a huge value */
+        unsigned mby = (y>>5)+mb_y;
+        unsigned idx0 = mbx + mby * stride; /* top-left target MB */
+        unsigned idx2 = idx0 + stride; /* MB directly below idx0 */
+        x &= 31; /* fractional part, 0..31 */
+        y &= 31;
+        int idx0weight = (32-y)*(32-x); /* the four bilinear weights sum to 32*32 = 1024 */
+        int idx1weight = (32-y)*x;
+        int idx2weight = y*(32-x);
+        int idx3weight = y*x;
+        idx0weight = (idx0weight * listamount + 512) >> 10; /* scale by the amount; >>10 removes the 1024 factor, +512 rounds */
+        idx1weight = (idx1weight * listamount + 512) >> 10;
+        idx2weight = (idx2weight * listamount + 512) >> 10;
+        idx3weight = (idx3weight * listamount + 512) >> 10;
+
+        if( mbx < width-1 && mby < height-1 ) /* all four target MBs lie inside the frame */
+        {
+            CLIP_ADD( ref_costs[idx0+0], idx0weight );
+            CLIP_ADD( ref_costs[idx0+1], idx1weight );
+            CLIP_ADD( ref_costs[idx2+0], idx2weight );
+            CLIP_ADD( ref_costs[idx2+1], idx3weight );
+        }
+        else
+        {
+            /* Note: this takes advantage of unsigned representation to
+             * catch negative mbx/mby. */
+            if( mby < height )
+            {
+                if( mbx < width )
+                    CLIP_ADD( ref_costs[idx0+0], idx0weight );
+                if( mbx+1 < width )
+                    CLIP_ADD( ref_costs[idx0+1], idx1weight );
+            }
+            if( mby+1 < height )
+            {
+                if( mbx < width )
+                    CLIP_ADD( ref_costs[idx2+0], idx2weight );
+                if( mbx+1 < width )
+                    CLIP_ADD( ref_costs[idx2+1], idx3weight );
+            }
+        }
+    }
+#undef CLIP_ADD
+}
+
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
 {
     pf->mc_luma   = mc_luma;
@@ -552,6 +629,7 @@
     pf->integral_init8v = integral_init8v;
 
     pf->mbtree_propagate_cost = mbtree_propagate_cost;
+    pf->mbtree_propagate_list = mbtree_propagate_list;
 
 #if HAVE_MMX
     x264_mc_init_mmx( cpu, pf );
@@ -565,7 +643,10 @@
 #endif
 
     if( cpu_independent )
+    {
         pf->mbtree_propagate_cost = mbtree_propagate_cost;
+        pf->mbtree_propagate_list = mbtree_propagate_list;
+    }
 }
 
 void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
diff -Nru x264-0.142.2389+git956c8d8/common/mc.h x264-0.142.2431+gita5831aa/common/mc.h
--- x264-0.142.2389+git956c8d8/common/mc.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/mc.h	2014-07-11 01:16:23.000000000 +0000
@@ -122,8 +122,12 @@
     weight_fn_t *offsetsub;
     void (*weight_cache)( x264_t *, x264_weight_t * );
 
-    void (*mbtree_propagate_cost)( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+    void (*mbtree_propagate_cost)( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                    uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
+
+    void (*mbtree_propagate_list)( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
+                                   int16_t *propagate_amount, uint16_t *lowres_costs,
+                                   int bipred_weight, int mb_y, int len, int list );
 } x264_mc_functions_t;
 
 void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent );
diff -Nru x264-0.142.2389+git956c8d8/common/mvpred.c x264-0.142.2431+gita5831aa/common/mvpred.c
--- x264-0.142.2389+git956c8d8/common/mvpred.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/mvpred.c	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2003-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/osdep.h x264-0.142.2431+gita5831aa/common/osdep.h
--- x264-0.142.2389+git956c8d8/common/osdep.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/osdep.h	2014-07-11 01:16:23.000000000 +0000
@@ -126,7 +126,7 @@
 
 #define EXPAND(x) x
 
-#if HAVE_32B_STACK_ALIGNMENT
+#if STACK_ALIGNMENT >= 32
 #define ALIGNED_ARRAY_32( type, name, sub1, ... )\
     ALIGNED_32( type name sub1 __VA_ARGS__ )
 #else
diff -Nru x264-0.142.2389+git956c8d8/common/pixel.c x264-0.142.2431+gita5831aa/common/pixel.c
--- x264-0.142.2389+git956c8d8/common/pixel.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/pixel.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -36,6 +36,7 @@
 #endif
 #if ARCH_ARM
 #   include "arm/pixel.h"
+#   include "arm/predict.h"
 #endif
 #if ARCH_UltraSPARC
 #   include "sparc/pixel.h"
@@ -532,6 +533,10 @@
 INTRA_MBCMP_8x8( sad, _mmx2,  _c )
 INTRA_MBCMP_8x8(sa8d, _sse2,  _sse2 )
 #endif
+#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+INTRA_MBCMP_8x8( sad, _neon, _neon )
+INTRA_MBCMP_8x8(sa8d, _neon, _neon )
+#endif
 
 #define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
 void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
@@ -587,6 +592,16 @@
 INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _xop, _mmx2 )
 #endif
 #endif
+#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+INTRA_MBCMP( sad,  4x4,   v, h, dc,  , _neon, _armv6 )
+INTRA_MBCMP(satd,  4x4,   v, h, dc,  , _neon, _armv6 )
+INTRA_MBCMP( sad,  8x8,  dc, h,  v, c, _neon, _neon )
+INTRA_MBCMP(satd,  8x8,  dc, h,  v, c, _neon, _neon )
+INTRA_MBCMP( sad,  8x16, dc, h,  v, c, _neon, _c )
+INTRA_MBCMP(satd,  8x16, dc, h,  v, c, _neon, _c )
+INTRA_MBCMP( sad, 16x16,  v, h, dc,  , _neon, _neon )
+INTRA_MBCMP(satd, 16x16,  v, h, dc,  , _neon, _neon )
+#endif
 
 // No C implementation of intra_satd_x9. See checkasm for its behavior,
 // or see x264_mb_analyse_intra for the entirely different algorithm we
@@ -1006,8 +1021,16 @@
     }
     if( cpu&X264_CPU_XOP )
     {
+        INIT5( sad_x3, _xop );
+        INIT5( sad_x4, _xop );
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->vsad = x264_pixel_vsad_xop;
         pixf->asd8 = x264_pixel_asd8_xop;
+#if ARCH_X86_64
+        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
+#endif
     }
     if( cpu&X264_CPU_AVX2 )
     {
@@ -1293,6 +1316,7 @@
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_xop;
         pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_xop;
         pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
         pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
@@ -1347,8 +1371,21 @@
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
         pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
+        pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_neon;
+
+        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_neon;
+        pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4_neon;
+        pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_neon;
+        pixf->intra_sa8d_x3_8x8   = x264_intra_sa8d_x3_8x8_neon;
+        pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_neon;
+        pixf->intra_satd_x3_8x8c  = x264_intra_satd_x3_8x8c_neon;
+        pixf->intra_sad_x3_8x16c  = x264_intra_sad_x3_8x16c_neon;
+        pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
+        pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_neon;
+        pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
 
         pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
         pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
diff -Nru x264-0.142.2389+git956c8d8/common/pixel.h x264-0.142.2431+gita5831aa/common/pixel.h
--- x264-0.142.2389+git956c8d8/common/pixel.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/pixel.h	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2004-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
             Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/predict.c x264-0.142.2431+gita5831aa/common/predict.c
--- x264-0.142.2389+git956c8d8/common/predict.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/predict.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/quant.c x264-0.142.2431+gita5831aa/common/quant.c
--- x264-0.142.2389+git956c8d8/common/quant.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/quant.c	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2005-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Christian Heine <sennindemokrit@gmx.net>
  *          Henrik Gramner <henrik@gramner.com>
  *
@@ -725,7 +725,10 @@
 
 #if HAVE_ARMV6
     if( cpu&X264_CPU_ARMV6 )
+    {
         pf->coeff_last4 = x264_coeff_last4_arm;
+        pf->coeff_last8 = x264_coeff_last8_arm;
+    }
 
     if( cpu&X264_CPU_NEON )
     {
diff -Nru x264-0.142.2389+git956c8d8/common/quant.h x264-0.142.2431+gita5831aa/common/quant.h
--- x264-0.142.2389+git956c8d8/common/quant.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/quant.h	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2005-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/rectangle.c x264-0.142.2431+gita5831aa/common/rectangle.c
--- x264-0.142.2389+git956c8d8/common/rectangle.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/rectangle.c	2014-07-11 01:16:23.000000000 +0000
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2010-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/rectangle.h x264-0.142.2431+gita5831aa/common/rectangle.h
--- x264-0.142.2389+git956c8d8/common/rectangle.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/rectangle.h	2014-07-11 01:16:23.000000000 +0000
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2003-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *          Loren Merritt <lorenm@u.washington.edu>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/set.c x264-0.142.2431+gita5831aa/common/set.c
--- x264-0.142.2389+git956c8d8/common/set.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/set.c	2014-07-11 01:16:23.000000000 +0000
@@ -105,9 +105,9 @@
         }\
         else\
         {\
-            CHECKED_MALLOC( h->  quant##w##_mf[i], (QP_MAX+1)*size*sizeof(udctcoef) );\
+            CHECKED_MALLOC( h->  quant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\
             CHECKED_MALLOC( h->dequant##w##_mf[i],  6*size*sizeof(int) );\
-            CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX+1)*size*sizeof(int) );\
+            CHECKED_MALLOC( h->unquant##w##_mf[i], (QP_MAX_SPEC+1)*size*sizeof(int) );\
         }\
         for( j = 0; j < i; j++ )\
             if( deadzone[j] == deadzone[i] &&\
@@ -120,8 +120,8 @@
         }\
         else\
         {\
-            CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX+1)*size*sizeof(udctcoef) );\
-            CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX+1)*size*sizeof(udctcoef) );\
+            CHECKED_MALLOC( h->quant##w##_bias[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\
+            CHECKED_MALLOC( h->quant##w##_bias0[i], (QP_MAX_SPEC+1)*size*sizeof(udctcoef) );\
         }\
     }
 
diff -Nru x264-0.142.2389+git956c8d8/common/vlc.c x264-0.142.2431+gita5831aa/common/vlc.c
--- x264-0.142.2389+git956c8d8/common/vlc.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/vlc.c	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2003-2014 x264 project
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/x86/bitstream-a.asm x264-0.142.2431+gita5831aa/common/x86/bitstream-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/bitstream-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/bitstream-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -3,7 +3,7 @@
 ;*****************************************************************************
 ;* Copyright (C) 2010-2014 x264 project
 ;*
-;* Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Authors: Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/x86/cabac-a.asm x264-0.142.2431+gita5831aa/common/x86/cabac-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/cabac-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/cabac-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2008-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/x86/const-a.asm x264-0.142.2431+gita5831aa/common/x86/const-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/const-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/const-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2010-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -36,6 +36,7 @@
 const pw_512,      times 16 dw 512
 const pw_00ff,     times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1 << BIT_DEPTH)-1)
+const pw_0to15,    dw 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
 const pd_1,        times 8 dd 1
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
 const pb_unpackbd1, times 2 db 0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3
diff -Nru x264-0.142.2389+git956c8d8/common/x86/cpu-a.asm x264-0.142.2431+gita5831aa/common/x86/cpu-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/cpu-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/cpu-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/x86/dct-a.asm x264-0.142.2431+gita5831aa/common/x86/dct-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/dct-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/dct-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -7,7 +7,7 @@
 ;*          Loren Merritt <lorenm@u.washington.edu>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Min Chen <chenm001.163.com>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -675,7 +675,7 @@
     mova        m6, [pw_pixel_max]
     mova        m7, [pd_32]
     pxor        m5, m5
-.loop
+.loop:
     mova        m3, [r1]
     paddd       m3, m7
     psrad       m3, 6         ; dc0   0 dc1   0 dc2   0 dc3   0
diff -Nru x264-0.142.2389+git956c8d8/common/x86/dct.h x264-0.142.2431+gita5831aa/common/x86/dct.h
--- x264-0.142.2389+git956c8d8/common/x86/dct.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/dct.h	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/x86/deblock-a.asm x264-0.142.2431+gita5831aa/common/x86/deblock-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/deblock-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/deblock-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
@@ -621,7 +621,7 @@
     mov     r6, 2
     mova    m0, [pw_2]
     LOAD_AB aa, bb, r2d, r3d
-.loop
+.loop:
     mova    p2, [r4+r1]
     mova    p1, [r4+2*r1]
     mova    p0, [r4+r5]
@@ -671,7 +671,7 @@
     add     r4, r0     ; pix+4*stride
     mov     r6, 2
     mova    m0, [pw_2]
-.loop
+.loop:
     movu    q3, [r0-8]
     movu    q2, [r0+r1-8]
     movu    q1, [r0+r1*2-8]
@@ -804,35 +804,6 @@
 %define PASS8ROWS(base, base3, stride, stride3, offset) \
     PASS8ROWS(base+offset, base3+offset, stride, stride3)
 
-; in: 8 rows of 4 bytes in %4..%11
-; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 11
-    movh       m0, %4
-    movh       m2, %5
-    movh       m1, %6
-    movh       m3, %7
-    punpckl%1  m0, m2
-    punpckl%1  m1, m3
-    mova       m2, m0
-    punpckl%2  m0, m1
-    punpckh%2  m2, m1
-
-    movh       m4, %8
-    movh       m6, %9
-    movh       m5, %10
-    movh       m7, %11
-    punpckl%1  m4, m6
-    punpckl%1  m5, m7
-    mova       m6, m4
-    punpckl%2  m4, m5
-    punpckh%2  m6, m5
-
-    punpckh%3  m1, m0, m4
-    punpckh%3  m3, m2, m6
-    punpckl%3  m0, m4
-    punpckl%3  m2, m6
-%endmacro
-
 ; in: 4 rows of 8 bytes in m0..m3
 ; out: 8 rows of 4 bytes in %1..%8
 %macro TRANSPOSE8x4B_STORE 8
@@ -844,24 +815,24 @@
     punpcklbw  m2, m3
     punpcklwd  m1, m0, m2
     punpckhwd  m0, m2
-    movh       %1, m1
+    movd       %1, m1
     punpckhdq  m1, m1
-    movh       %2, m1
-    movh       %3, m0
+    movd       %2, m1
+    movd       %3, m0
     punpckhdq  m0, m0
-    movh       %4, m0
+    movd       %4, m0
 
     punpckhdq  m3, m3
     punpcklbw  m4, m5
     punpcklbw  m6, m3
     punpcklwd  m5, m4, m6
     punpckhwd  m4, m6
-    movh       %5, m5
+    movd       %5, m5
     punpckhdq  m5, m5
-    movh       %6, m5
-    movh       %7, m4
+    movd       %6, m5
+    movd       %7, m4
     punpckhdq  m4, m4
-    movh       %8, m4
+    movd       %8, m4
 %endmacro
 
 ; in: 8 rows of 4 bytes in %9..%10
@@ -877,34 +848,94 @@
     pextrd %8, %10, 3
 %endmacro
 
-%macro TRANSPOSE4x8B_LOAD 8
-    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
-%endmacro
-
-%macro TRANSPOSE4x8W_LOAD 8
-%if mmsize==16
-    TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
-%else
+; in: 4 rows of 4 words in %1..%4
+; out: 4 rows of 4 words in m0..m3
+; clobbers: m4
+%macro TRANSPOSE4x4W_LOAD 4-8
+%if mmsize==8
     SWAP  1, 4, 2, 3
-    mova  m0, [t5]
-    mova  m1, [t5+r1]
-    mova  m2, [t5+r1*2]
-    mova  m3, [t5+t6]
+    movq  m0, %1
+    movq  m1, %2
+    movq  m2, %3
+    movq  m3, %4
     TRANSPOSE4x4W 0, 1, 2, 3, 4
+%else
+    movq       m0, %1
+    movq       m2, %2
+    movq       m1, %3
+    movq       m3, %4
+    punpcklwd  m0, m2
+    punpcklwd  m1, m3
+    mova       m2, m0
+    punpckldq  m0, m1
+    punpckhdq  m2, m1
+    movhlps    m1, m0
+    movhlps    m3, m2
 %endif
 %endmacro
 
-%macro TRANSPOSE8x2W_STORE 8
+; in: 2 rows of 4 words in m1..m2
+; out: 4 rows of 2 words in %1..%4
+; clobbers: m0, m1
+%macro TRANSPOSE4x2W_STORE 4-8
+%if mmsize==8
     punpckhwd  m0, m1, m2
     punpcklwd  m1, m2
-%if mmsize==8
+%else
+    punpcklwd  m1, m2
+    movhlps    m0, m1
+%endif
     movd       %3, m0
     movd       %1, m1
     psrlq      m1, 32
     psrlq      m0, 32
     movd       %2, m1
     movd       %4, m0
+%endmacro
+
+; in: 4/8 rows of 4 words in %1..%8
+; out: 4 rows of 4/8 words in m0..m3
+; clobbers: m4, m5, m6, m7
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==8
+    TRANSPOSE4x4W_LOAD %1, %2, %3, %4
 %else
+    movq       m0, %1
+    movq       m2, %2
+    movq       m1, %3
+    movq       m3, %4
+    punpcklwd  m0, m2
+    punpcklwd  m1, m3
+    mova       m2, m0
+    punpckldq  m0, m1
+    punpckhdq  m2, m1
+
+    movq       m4, %5
+    movq       m6, %6
+    movq       m5, %7
+    movq       m7, %8
+    punpcklwd  m4, m6
+    punpcklwd  m5, m7
+    mova       m6, m4
+    punpckldq  m4, m5
+    punpckhdq  m6, m5
+
+    punpckhqdq m1, m0, m4
+    punpckhqdq m3, m2, m6
+    punpcklqdq m0, m4
+    punpcklqdq m2, m6
+%endif
+%endmacro
+
+; in: 2 rows of 4/8 words in m1..m2
+; out: 4/8 rows of 2 words in %1..%8
+; clobbers: m0, m1
+%macro TRANSPOSE8x2W_STORE 8
+%if mmsize==8
+    TRANSPOSE4x2W_STORE %1, %2, %3, %4
+%else
+    punpckhwd  m0, m1, m2
+    punpcklwd  m1, m2
     movd       %5, m0
     movd       %1, m1
     psrldq     m1, 4
@@ -1118,7 +1149,7 @@
 %endif
     mova     m6, [pb_1]
     psubusb  m4, m6              ; alpha - 1
-    psubusb  m5, m6              ; alpha - 2
+    psubusb  m5, m6              ; beta - 1
 %if %0>2
     mova     %3, m4
 %endif
@@ -1361,19 +1392,18 @@
 ;-----------------------------------------------------------------------------
 ; void deblock_h_luma( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
 ;-----------------------------------------------------------------------------
-
 %if cpuflag(avx)
 INIT_XMM cpuname
 %else
 INIT_MMX cpuname
 %endif
-cglobal deblock_h_luma, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
-    mov    r0, r0mp
+cglobal deblock_h_luma, 1,5,8,0x60+12
     mov    r3, r1m
     lea    r4, [r3*3]
     sub    r0, 4
     lea    r1, [r0+r4]
-    %define pix_tmp esp+12*HAVE_ALIGNED_STACK
+    %define pix_tmp esp+12
+    ; esp is intentionally misaligned to make it aligned after pushing the arguments for deblock_%1_luma.
 
     ; transpose 6x16 -> tmp space
     TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
@@ -2098,17 +2128,14 @@
 ;-----------------------------------------------------------------------------
 %macro DEBLOCK_H_CHROMA_420_MBAFF 0
 cglobal deblock_h_chroma_mbaff, 5,7,8
-    sub    r0, 4
-    lea    t6, [r1*3]
-    mov    t5, r0
-    add    r0, t6
-    TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+    CHROMA_H_START
+    TRANSPOSE4x4W_LOAD PASS8ROWS(t5, r0, r1, t6)
     LOAD_MASK  r2d, r3d
     movd       m6, [r4] ; tc0
     punpcklbw  m6, m6
     pand       m7, m6
     DEBLOCK_P0_Q0
-    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+    TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
     RET
 %endmacro
 
@@ -2249,9 +2276,9 @@
 INIT_MMX mmx2
 cglobal deblock_h_chroma_intra_mbaff, 4,6,8
     CHROMA_H_START
-    TRANSPOSE4x8W_LOAD  PASS8ROWS(t5, r0, r1, t6)
+    TRANSPOSE4x4W_LOAD  PASS8ROWS(t5, r0, r1, t6)
     call chroma_intra_body
-    TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+    TRANSPOSE4x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
     RET
 %endif ; !HIGH_BIT_DEPTH
 
diff -Nru x264-0.142.2389+git956c8d8/common/x86/mc-a2.asm x264-0.142.2431+gita5831aa/common/x86/mc-a2.asm
--- x264-0.142.2389+git956c8d8/common/x86/mc-a2.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/mc-a2.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Mathieu Monnier <manao@melix.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
@@ -32,12 +32,14 @@
 
 SECTION_RODATA 32
 
+pw_1024: times 16 dw 1024
 filt_mul20: times 32 db 20
 filt_mul15: times 16 db 1, -5
 filt_mul51: times 16 db -5, 1
 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
 
+%if HIGH_BIT_DEPTH
 v210_mask: times 4 dq 0xc00ffc003ff003ff
 v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
 v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
@@ -45,18 +47,18 @@
 v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
            dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
 
-%if HIGH_BIT_DEPTH
 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
 %else
+deinterleave_rgb_shuf: db 0,3,6,9,1,4,7,10,2,5,8,11,-1,-1,-1,-1
+                       db 0,4,8,12,1,5,9,13,2,6,10,14,-1,-1,-1,-1
+
 deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
-%endif
-pw_1024: times 16 dw 1024
+%endif ; !HIGH_BIT_DEPTH
 
 pd_16: times 4 dd 16
 pd_0f: times 4 dd 0xffff
-pf_inv256: times 8 dd 0.00390625
 
 pad10: times 8 dw    10*PIXEL_MAX
 pad20: times 8 dw    20*PIXEL_MAX
@@ -67,16 +69,22 @@
 tap2: times 4 dw 20, 20
 tap3: times 4 dw -5,  1
 
+pw_0xc000: times 8 dw 0xc000
+pw_31: times 8 dw 31
+pd_4: times 4 dd 4
+
 SECTION .text
 
 cextern pb_0
 cextern pw_1
+cextern pw_8
 cextern pw_16
 cextern pw_32
 cextern pw_512
 cextern pw_00ff
 cextern pw_3fff
 cextern pw_pixel_max
+cextern pw_0to15
 cextern pd_ffff
 
 %macro LOAD_ADD 4
@@ -1202,6 +1210,105 @@
     RET
 %endmacro ; PLANE_DEINTERLEAVE
 
+%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
+%if cpuflag(ssse3)
+    mova        m3, [deinterleave_rgb_shuf+(%1-3)*16] ; 16-byte shuffle row: offset 0 for pw=3, offset 16 for pw=4
+%endif
+%%loopy:
+    mov         %8, r6        ; tmp1 = read position, restarted from r6 for every row
+    mov         %9, %6        ; tmp2 = w operand; stepped upward below while it stays negative
+%%loopx:
+    movu        m0, [%8]
+    movu        m1, [%8+%1*mmsize/4] ; second load begins pw*mmsize/4 bytes further on
+%if cpuflag(ssse3)
+    pshufb      m0, m3        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
+    pshufb      m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
+%elif %1 == 3
+    psrldq      m2, m0, 6
+    punpcklqdq  m0, m1        ; b0 g0 r0 b1 g1 r1 __ __ b4 g4 r4 b5 g5 r5
+    psrldq      m1, 6
+    punpcklqdq  m2, m1        ; b2 g2 r2 b3 g3 r3 __ __ b6 g6 r6 b7 g7 r7
+    psrlq       m3, m0, 24
+    psrlq       m4, m2, 24
+    punpckhbw   m1, m0, m3    ; b4 b5 g4 g5 r4 r5
+    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
+    punpckhbw   m3, m2, m4    ; b6 b7 g6 g7 r6 r7
+    punpcklbw   m2, m4        ; b2 b3 g2 g3 r2 r3
+    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
+    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
+%else
+    pshufd      m3, m0, q2301
+    pshufd      m4, m1, q2301
+    punpckhbw   m2, m0, m3    ; b2 b3 g2 g3 r2 r3
+    punpcklbw   m0, m3        ; b0 b1 g0 g1 r0 r1
+    punpckhbw   m3, m1, m4    ; b6 b7 g6 g7 r6 r7
+    punpcklbw   m1, m4        ; b4 b5 g4 g5 r4 r5
+    punpcklwd   m0, m2        ; b0 b1 b2 b3 g0 g1 g2 g3 r0 r1 r2 r3
+    punpcklwd   m1, m3        ; b4 b5 b6 b7 g4 g5 g6 g7 r4 r5 r6 r7
+%endif
+    punpckldq   m2, m0, m1    ; b0 b1 b2 b3 b4 b5 b6 b7 g0 g1 g2 g3 g4 g5 g6 g7
+    punpckhdq   m0, m1        ; r0 r1 r2 r3 r4 r5 r6 r7
+    movh   [r0+%9], m2        ; b0..b7 -> first plane (r0)
+    movhps [r2+%9], m2        ; g0..g7 -> second plane (r2)
+    movh   [r4+%9], m0        ; r0..r7 -> third plane (r4)
+    add         %8, %1*mmsize/2 ; read position += pw*mmsize/2 bytes
+    add         %9, mmsize/2  ; write offset += mmsize/2 bytes
+    jl %%loopx                ; repeat while the write offset is still negative
+    add         r0, %2        ; row finished: step every base pointer by its own stride
+    add         r2, %3
+    add         r4, %4
+    add         r6, %5
+    dec        %7d            ; h counts the remaining rows
+    jg %%loopy
+%endmacro
+
+%macro PLANE_DEINTERLEAVE_RGB 0
+;-----------------------------------------------------------------------------
+; void x264_plane_copy_deinterleave_rgb( pixel *dsta, intptr_t i_dsta,
+;                                        pixel *dstb, intptr_t i_dstb,
+;                                        pixel *dstc, intptr_t i_dstc,
+;                                        pixel *src,  intptr_t i_src, int pw, int w, int h )
+;-----------------------------------------------------------------------------
+%if ARCH_X86_64
+cglobal plane_copy_deinterleave_rgb, 8,12
+    %define %%args r1, r3, r5, r7, r8, r9, r10, r11
+    mov        r8d, r9m       ; w (argument index 9)
+    mov        r9d, r10m      ; h (argument index 10)
+    add         r0, r8        ; move the three destination pointers forward by w ...
+    add         r2, r8
+    add         r4, r8
+    neg         r8            ; ... and pass -w to the core as its w operand
+%else
+cglobal plane_copy_deinterleave_rgb, 1,7
+    %define %%args r1m, r3m, r5m, r7m, r9m, r1, r3, r5
+    mov         r1, r9m       ; w (argument index 9)
+    mov         r2, r2m       ; dstb
+    mov         r4, r4m       ; dstc
+    mov         r6, r6m       ; src
+    add         r0, r1        ; move the three destination pointers forward by w
+    add         r2, r1
+    add         r4, r1
+    neg         r1
+    mov        r9m, r1        ; the w argument slot now holds -w, which the core reads as its w operand
+    mov         r1, r10m      ; h (argument index 10)
+%endif
+    cmp  dword r8m, 4         ; pw (argument index 8): 4 selects the BGRA path, anything else BGR
+    je .pw4
+    PLANE_DEINTERLEAVE_RGB_CORE 3, %%args ; BGR
+    jmp .ret
+.pw4:
+    PLANE_DEINTERLEAVE_RGB_CORE 4, %%args ; BGRA
+.ret:
+    REP_RET
+%endmacro
+
+%if HIGH_BIT_DEPTH == 0
+INIT_XMM sse2
+PLANE_DEINTERLEAVE_RGB
+INIT_XMM ssse3
+PLANE_DEINTERLEAVE_RGB
+%endif ; !HIGH_BIT_DEPTH
+
 %macro PLANE_DEINTERLEAVE_V210 0
 ;-----------------------------------------------------------------------------
 ; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
@@ -1881,62 +1988,64 @@
 ;                             uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
 ;-----------------------------------------------------------------------------
 %macro MBTREE 0
-cglobal mbtree_propagate_cost, 7,7,7
-    add        r6d, r6d
-    lea         r0, [r0+r6*2]
-    add         r1, r6
-    add         r2, r6
-    add         r3, r6
-    add         r4, r6
-    neg         r6
-    pxor      xmm4, xmm4
-    movss     xmm6, [r5]
-    shufps    xmm6, xmm6, 0
-    mulps     xmm6, [pf_inv256]
-    movdqa    xmm5, [pw_3fff]
-.loop:
-    movq      xmm2, [r2+r6] ; intra
-    movq      xmm0, [r4+r6] ; invq
-    movq      xmm3, [r3+r6] ; inter
-    movq      xmm1, [r1+r6] ; prop
-    punpcklwd xmm2, xmm4
-    punpcklwd xmm0, xmm4
-    pmaddwd   xmm0, xmm2
-    pand      xmm3, xmm5
-    punpcklwd xmm1, xmm4
-    punpcklwd xmm3, xmm4
+cglobal mbtree_propagate_cost, 6,6,7
+    movss     m6, [r5]
+    mov      r5d, r6m
+    lea       r0, [r0+r5*2]
+    add      r5d, r5d
+    add       r1, r5
+    add       r2, r5
+    add       r3, r5
+    add       r4, r5
+    neg       r5
+    pxor      m4, m4
+    shufps    m6, m6, 0
+    mova      m5, [pw_3fff]
+.loop:
+    movq      m2, [r2+r5] ; intra
+    movq      m0, [r4+r5] ; invq
+    movq      m3, [r3+r5] ; inter
+    movq      m1, [r1+r5] ; prop
+    pand      m3, m5
+    pminsw    m3, m2
+    punpcklwd m2, m4
+    punpcklwd m0, m4
+    pmaddwd   m0, m2
+    punpcklwd m1, m4
+    punpcklwd m3, m4
 %if cpuflag(fma4)
-    cvtdq2ps  xmm0, xmm0
-    cvtdq2ps  xmm1, xmm1
-    fmaddps   xmm0, xmm0, xmm6, xmm1
-    cvtdq2ps  xmm1, xmm2
-    psubd     xmm2, xmm3
-    cvtdq2ps  xmm2, xmm2
-    rcpps     xmm3, xmm1
-    mulps     xmm1, xmm3
-    mulps     xmm0, xmm2
-    addps     xmm2, xmm3, xmm3
-    fnmaddps  xmm3, xmm1, xmm3, xmm2
-    mulps     xmm0, xmm3
-%else
-    cvtdq2ps  xmm0, xmm0
-    mulps     xmm0, xmm6    ; intra*invq*fps_factor>>8
-    cvtdq2ps  xmm1, xmm1    ; prop
-    addps     xmm0, xmm1    ; prop + (intra*invq*fps_factor>>8)
-    cvtdq2ps  xmm1, xmm2    ; intra
-    psubd     xmm2, xmm3    ; intra - inter
-    cvtdq2ps  xmm2, xmm2    ; intra - inter
-    rcpps     xmm3, xmm1    ; 1 / intra 1st approximation
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)
-    mulps     xmm1, xmm3    ; intra * (1/intra 1st approx)^2
-    mulps     xmm0, xmm2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
-    addps     xmm3, xmm3    ; 2 * (1/intra 1st approx)
-    subps     xmm3, xmm1    ; 2nd approximation for 1/intra
-    mulps     xmm0, xmm3    ; / intra
-%endif
-    cvtps2dq  xmm0, xmm0
-    movdqa [r0+r6*2], xmm0
-    add         r6, 8
+    cvtdq2ps  m0, m0
+    cvtdq2ps  m1, m1
+    fmaddps   m0, m0, m6, m1
+    cvtdq2ps  m1, m2
+    psubd     m2, m3
+    cvtdq2ps  m2, m2
+    rcpps     m3, m1
+    mulps     m1, m3
+    mulps     m0, m2
+    addps     m2, m3, m3
+    fnmaddps  m3, m1, m3, m2
+    mulps     m0, m3
+%else
+    cvtdq2ps  m0, m0
+    mulps     m0, m6    ; intra*invq*fps_factor>>8
+    cvtdq2ps  m1, m1    ; prop
+    addps     m0, m1    ; prop + (intra*invq*fps_factor>>8)
+    cvtdq2ps  m1, m2    ; intra
+    psubd     m2, m3    ; intra - inter
+    cvtdq2ps  m2, m2    ; intra - inter
+    rcpps     m3, m1    ; 1 / intra 1st approximation
+    mulps     m1, m3    ; intra * (1/intra 1st approx)
+    mulps     m1, m3    ; intra * (1/intra 1st approx)^2
+    mulps     m0, m2    ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+    addps     m3, m3    ; 2 * (1/intra 1st approx)
+    subps     m3, m1    ; 2nd approximation for 1/intra
+    mulps     m0, m3    ; / intra
+%endif
+    cvtps2dq  m0, m0
+    packssdw  m0, m0
+    movh [r0+r5], m0
+    add       r5, 8
     jl .loop
     RET
 %endmacro
@@ -1948,34 +2057,35 @@
 MBTREE
 
 %macro INT16_UNPACK 1
-    vpunpckhwd   xm4, xm%1, xm7
-    vpunpcklwd  xm%1, xm7
-    vinsertf128  m%1, m%1, xm4, 1
-%endmacro
-
-; FIXME: align loads/stores to 16 bytes
-%macro MBTREE_AVX 0
-cglobal mbtree_propagate_cost, 7,7,8
-    add          r6d, r6d
-    lea           r0, [r0+r6*2]
-    add           r1, r6
-    add           r2, r6
-    add           r3, r6
-    add           r4, r6
-    neg           r6
-    mova         xm5, [pw_3fff]
-    vbroadcastss  m6, [r5]
-    mulps         m6, [pf_inv256]
+    punpckhwd   xm4, xm%1, xm7
+    punpcklwd  xm%1, xm7
+    vinsertf128 m%1, m%1, xm4, 1
+%endmacro
+
+; FIXME: align loads to 16 bytes
+%macro MBTREE_AVX 1
+cglobal mbtree_propagate_cost, 6,6,%1
+    vbroadcastss m6, [r5]
+    mov         r5d, r6m
+    lea          r0, [r0+r5*2]
+    add         r5d, r5d
+    add          r1, r5
+    add          r2, r5
+    add          r3, r5
+    add          r4, r5
+    neg          r5
+    mova        xm5, [pw_3fff]
 %if notcpuflag(avx2)
-    pxor         xm7, xm7
+    pxor        xm7, xm7
 %endif
 .loop:
 %if cpuflag(avx2)
-    pmovzxwd     m0, [r2+r6]      ; intra
-    pmovzxwd     m1, [r4+r6]      ; invq
-    pmovzxwd     m2, [r1+r6]      ; prop
-    pand        xm3, xm5, [r3+r6] ; inter
+    pmovzxwd     m0, [r2+r5]      ; intra
+    pmovzxwd     m1, [r4+r5]      ; invq
+    pmovzxwd     m2, [r1+r5]      ; prop
+    pand        xm3, xm5, [r3+r5] ; inter
     pmovzxwd     m3, xm3
+    pminsd       m3, m0
     pmaddwd      m1, m0
     psubd        m4, m0, m3
     cvtdq2ps     m0, m0
@@ -1990,10 +2100,11 @@
     fnmaddps     m4, m2, m3, m4
     mulps        m1, m4
 %else
-    movu        xm0, [r2+r6]
-    movu        xm1, [r4+r6]
-    movu        xm2, [r1+r6]
-    pand        xm3, xm5, [r3+r6]
+    movu        xm0, [r2+r5]
+    movu        xm1, [r4+r5]
+    movu        xm2, [r1+r5]
+    pand        xm3, xm5, [r3+r5]
+    pminsw      xm3, xm0
     INT16_UNPACK 0
     INT16_UNPACK 1
     INT16_UNPACK 2
@@ -2015,13 +2126,107 @@
     mulps        m1, m3         ; / intra
 %endif
     vcvtps2dq    m1, m1
-    movu  [r0+r6*2], m1
-    add          r6, 16
+    vextractf128 xm2, m1, 1
+    packssdw    xm1, xm2
+    mova    [r0+r5], xm1
+    add          r5, 16
     jl .loop
     RET
 %endmacro
 
 INIT_YMM avx
-MBTREE_AVX
+MBTREE_AVX 8
 INIT_YMM avx2,fma3
-MBTREE_AVX
+MBTREE_AVX 7
+
+%macro MBTREE_PROPAGATE_LIST 0
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
+;                                      int16_t *output, int bipred_weight, int mb_y, int len )
+;-----------------------------------------------------------------------------
+cglobal mbtree_propagate_list_internal, 4,6,8
+    movh     m6, [pw_0to15] ; mb_x
+    movd     m7, r5m        ; mb_y
+    pshuflw  m7, m7, 0
+    punpcklwd m6, m7       ; 0 y 1 y 2 y 3 y
+    movd     m7, r4m
+    SPLATW   m7, m7        ; bipred_weight
+    psllw    m7, 9         ; bipred_weight << 9
+
+    mov     r5d, r6m       ; len
+    xor     r4d, r4d       ; element index, compared against len at the bottom of the loop
+.loop:
+    mova     m3, [r1+r4*2] ; propagate_amount (16-bit elements)
+    movu     m4, [r2+r4*2] ; lowres_costs (16-bit elements)
+    mova     m5, [pw_0xc000]
+    pand     m4, m5        ; keep only the top two bits of each lowres_cost
+    pcmpeqw  m4, m5        ; all-ones in the words where both of those bits are set
+    pmulhrsw m5, m3, m7    ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%if cpuflag(avx)
+    pblendvb m5, m3, m5, m4
+%else
+    pand     m5, m4
+    pandn    m4, m3
+    por      m5, m4        ; if( lists_used == 3 )
+                           ;     propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+%endif
+
+    movu     m0, [r0+r4*4] ; x,y
+    movu     m1, [r0+r4*4+mmsize] ; x,y of the following mvs
+
+    psraw    m2, m0, 5
+    psraw    m3, m1, 5
+    mova     m4, [pd_4]
+    paddw    m2, m6        ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+    paddw    m6, m4        ; {mbx, mby} += {4, 0}
+    paddw    m3, m6        ; {mbx, mby} = ({x,y}>>5)+{h->mb.i_mb_x,h->mb.i_mb_y}
+    paddw    m6, m4        ; {mbx, mby} += {4, 0}
+
+    mova [r3+mmsize*0], m2
+    mova [r3+mmsize*1], m3
+
+    mova     m3, [pw_31]
+    pand     m0, m3        ; {x,y} &= 31 (mvs held in m0)
+    pand     m1, m3        ; {x,y} &= 31 (mvs held in m1)
+    packuswb m0, m1
+    psrlw    m1, m0, 3
+    pand     m0, m3        ; x
+    SWAP      1, 3
+    pandn    m1, m3        ; y premultiplied by (1<<5) for later use of pmulhrsw
+
+    mova     m3, [pw_32]
+    psubw    m3, m0        ; 32 - x
+    mova     m4, [pw_1024]
+    psubw    m4, m1        ; (32 - y) << 5
+
+    pmullw   m2, m3, m4    ; idx0weight = (32-y)*(32-x) << 5
+    pmullw   m4, m0        ; idx1weight = (32-y)*x << 5
+    pmullw   m0, m1        ; idx3weight = y*x << 5
+    pmullw   m1, m3        ; idx2weight = y*(32-x) << 5
+
+    ; avoid overflow in the input to pmulhrsw
+    psrlw    m3, m2, 15
+    psubw    m2, m3        ; idx0weight -= (idx0weight == 32768)
+
+    pmulhrsw m2, m5        ; idx0weight * propagate_amount + 512 >> 10
+    pmulhrsw m4, m5        ; idx1weight * propagate_amount + 512 >> 10
+    pmulhrsw m1, m5        ; idx2weight * propagate_amount + 512 >> 10
+    pmulhrsw m0, m5        ; idx3weight * propagate_amount + 512 >> 10
+
+    SBUTTERFLY wd, 2, 4, 3
+    SBUTTERFLY wd, 1, 0, 3
+    mova [r3+mmsize*2], m2
+    mova [r3+mmsize*3], m4
+    mova [r3+mmsize*4], m1
+    mova [r3+mmsize*5], m0
+    add     r4d, mmsize/2  ; mmsize/2 elements handled per iteration
+    add      r3, mmsize*6  ; six vectors of output written per iteration
+    cmp     r4d, r5d
+    jl .loop
+    REP_RET
+%endmacro
+
+INIT_XMM ssse3
+MBTREE_PROPAGATE_LIST
+INIT_XMM avx
+MBTREE_PROPAGATE_LIST
diff -Nru x264-0.142.2389+git956c8d8/common/x86/mc-a.asm x264-0.142.2431+gita5831aa/common/x86/mc-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/mc-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/mc-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2003-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Dylan Yudaken <dyudaken@gmail.com>
 ;*          Holger Lubitz <holger@lubitz.org>
diff -Nru x264-0.142.2389+git956c8d8/common/x86/mc-c.c x264-0.142.2431+gita5831aa/common/x86/mc-c.c
--- x264-0.142.2389+git956c8d8/common/x86/mc-c.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/mc-c.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -116,6 +116,14 @@
 void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
                                        uint16_t *dstv, intptr_t i_dstv,
                                        uint16_t *src,  intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
+                                             pixel *dstb, intptr_t i_dstb,
+                                             pixel *dstc, intptr_t i_dstc,
+                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_deinterleave_rgb_ssse3( pixel *dsta, intptr_t i_dsta,
+                                             pixel *dstb, intptr_t i_dstb,
+                                             pixel *dstc, intptr_t i_dstc,
+                                             pixel *src,  intptr_t i_src, int pw, int w, int h );
 void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
                                               uint16_t *dstv, intptr_t i_dstv,
                                               uint32_t *src,  intptr_t i_src, int w, int h );
@@ -153,13 +161,13 @@
 void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
 void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
 void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
-void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx ( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_fma4( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                       uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
-void x264_mbtree_propagate_cost_avx2_fma3( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+void x264_mbtree_propagate_cost_avx2_fma3( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
                                            uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
 
 #define MC_CHROMA(cpu)\
@@ -525,6 +533,113 @@
 PLANE_INTERLEAVE(avx)
 #endif
 
+#if HAVE_X86_INLINE_ASM /* saturating adds implemented with paddsw */
+#define CLIP_ADD(s,x) /* s += x with the signed 16-bit saturation of paddsw */\
+do\
+{\
+    int temp;\
+    asm("movd       %0, %%xmm0     \n"\
+        "movd       %2, %%xmm1     \n"\
+        "paddsw %%xmm1, %%xmm0     \n"\
+        "movd   %%xmm0, %1         \n"\
+        :"+m"(s), "=&r"(temp)\
+        :"m"(x) /* NOTE(review): xmm0/xmm1 are written but no clobbers are declared */\
+    );\
+    s = temp;\
+} while(0)
+
+#define CLIP_ADD2(s,x) /* one paddsw over M32(s) and M32(x); the fallback below does elements [0] and [1] */\
+do\
+{\
+    asm("movd       %0, %%xmm0     \n"\
+        "movd       %1, %%xmm1     \n"\
+        "paddsw %%xmm1, %%xmm0     \n"\
+        "movd   %%xmm0, %0         \n"\
+        :"+m"(M32(s))\
+        :"m"(M32(x)) /* NOTE(review): xmm0/xmm1 are written but no clobbers are declared */\
+    );\
+} while(0)
+#else /* !HAVE_X86_INLINE_ASM: plain C fallback, clamps only at the upper bound (1<<15)-1 */
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+    CLIP_ADD((s)[0], (x)[0]);\
+    CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+#endif /* HAVE_X86_INLINE_ASM */
+
+#define PROPAGATE_LIST(cpu) /* declares the asm kernel and defines the C wrapper that applies its output */\
+void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
+                                                uint16_t *lowres_costs, int16_t *output,\
+                                                int bipred_weight, int mb_y, int len );\
+\
+static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
+                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
+                                              int bipred_weight, int mb_y, int len, int list )\
+{\
+    int16_t *current = h->scratch_buffer2; /* passed to the kernel as its output buffer */\
+\
+    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
+                                               current, bipred_weight, mb_y, len );\
+\
+    unsigned stride = h->mb.i_mb_stride;\
+    unsigned width = h->mb.i_mb_width;\
+    unsigned height = h->mb.i_mb_height;\
+\
+    for( unsigned i = 0; i < len; current += 32 ) /* +32: step over the entries read below at offsets 16..47 */\
+    {\
+        int end = X264_MIN( i+8, len ); /* entries are consumed in groups of up to 8 */\
+        for( ; i < end; i++, current += 2 )\
+        {\
+            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
+                continue;\
+\
+            unsigned mbx = current[0];\
+            unsigned mby = current[1];\
+            unsigned idx0 = mbx + mby * stride;\
+            unsigned idx2 = idx0 + stride; /* same column, next macroblock row */\
+\
+            /* Shortcut for the simple/common case of zero MV */\
+            if( !M32( mvs[i] ) )\
+            {\
+                CLIP_ADD( ref_costs[idx0], current[16] );\
+                continue;\
+            }\
+\
+            if( mbx < width-1 && mby < height-1 )\
+            {\
+                CLIP_ADD2( ref_costs+idx0, current+16 ); /* idx0 and idx0+1 */\
+                CLIP_ADD2( ref_costs+idx2, current+32 ); /* idx2 and idx2+1 */\
+            }\
+            else\
+            {\
+                /* Note: this takes advantage of unsigned representation to\
+                 * catch negative mbx/mby. */\
+                if( mby < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx0+0], current[16] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx0+1], current[17] );\
+                }\
+                if( mby+1 < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx2+0], current[32] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx2+1], current[33] );\
+                }\
+            }\
+        }\
+    }\
+}
+
+PROPAGATE_LIST(ssse3)
+PROPAGATE_LIST(avx)
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_MMX) )
@@ -637,6 +752,7 @@
 
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
     pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
 
     if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
@@ -688,6 +804,7 @@
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
 
     if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
     {
@@ -738,6 +855,8 @@
     pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_ssse3;
     pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_ssse3;
     pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_ssse3;
+    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_ssse3;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_ssse3;
 
     if( !(cpu&X264_CPU_SLOW_PSHUFB) )
     {
@@ -814,6 +933,7 @@
         return;
     pf->memzero_aligned = x264_memzero_aligned_avx;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
+    pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx;
 
     if( cpu&X264_CPU_FMA4 )
         pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_fma4;
diff -Nru x264-0.142.2389+git956c8d8/common/x86/pixel-a.asm x264-0.142.2431+gita5831aa/common/x86/pixel-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/pixel-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/pixel-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -7,7 +7,7 @@
 ;*          Holger Lubitz <holger@lubitz.org>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
@@ -205,7 +205,7 @@
     mov    r4d, %%n
 %endif
     pxor    m0, m0
-.loop
+.loop:
     mova    m1, [r0]
     mova    m2, [r0+offset0_1]
     mova    m3, [r0+offset0_2]
@@ -561,10 +561,15 @@
     pshufhw     m0, m0, q3120
     pshufhw     m1, m1, q3120
 %endif
+%if cpuflag(xop)
+    pmadcswd    m2, m0, m0, m2
+    pmadcswd    m3, m1, m1, m3
+%else
     pmaddwd     m0, m0
     pmaddwd     m1, m1
     paddd       m2, m0
     paddd       m3, m1
+%endif
     add         r6, 2*mmsize
     jl .loopx
 %if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
@@ -657,10 +662,15 @@
     por     m0, m1
     psrlw   m2, m0, 8
     pand    m0, m5
+%if cpuflag(xop)
+    pmadcswd m4, m2, m2, m4
+    pmadcswd m3, m0, m0, m3
+%else
     pmaddwd m2, m2
     pmaddwd m0, m0
-    paddd   m3, m0
     paddd   m4, m2
+    paddd   m3, m0
+%endif
     add     r6, mmsize
     jl .loopx
 %if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
@@ -695,6 +705,8 @@
 SSD_NV12
 INIT_XMM avx
 SSD_NV12
+INIT_XMM xop
+SSD_NV12
 INIT_YMM avx2
 SSD_NV12
 
@@ -1265,7 +1277,7 @@
 ; clobber: m3..m7
 ; out: %1 = satd
 %macro SATD_4x4_MMX 3
-    %xdefine %%n n%1
+    %xdefine %%n nn%1
     %assign offset %2*SIZEOF_PIXEL
     LOAD_DIFF m4, m3, none, [r0+     offset], [r2+     offset]
     LOAD_DIFF m5, m3, none, [r0+  r1+offset], [r2+  r3+offset]
diff -Nru x264-0.142.2389+git956c8d8/common/x86/pixel.h x264-0.142.2431+gita5831aa/common/x86/pixel.h
--- x264-0.142.2389+git956c8d8/common/x86/pixel.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/pixel.h	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -56,6 +56,7 @@
 DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
 DECL_X4( sad, ssse3 )
+DECL_X4( sad, xop )
 DECL_X4( sad, avx )
 DECL_X4( sad, avx2 )
 DECL_X1( ssd, mmx )
@@ -153,6 +154,9 @@
 void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
+                                    pixel *pixuv2, intptr_t stride2, int width,
+                                    int height, uint64_t *ssd_u, uint64_t *ssd_v );
 void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
                                     pixel *pixuv2, intptr_t stride2, int width,
                                     int height, uint64_t *ssd_u, uint64_t *ssd_v );
diff -Nru x264-0.142.2389+git956c8d8/common/x86/predict-a.asm x264-0.142.2431+gita5831aa/common/x86/predict-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/predict-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/predict-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Holger Lubitz <holger@lubitz.org>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
@@ -31,7 +31,6 @@
 
 SECTION_RODATA 32
 
-pw_0to15:    dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
 pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
 pw_m3:       times 16 dw -3
 pw_m7:       times 16 dw -7
@@ -56,6 +55,7 @@
 cextern pw_16
 cextern pw_00ff
 cextern pw_pixel_max
+cextern pw_0to15
 
 %macro STORE8 1
     mova [r0+0*FDEC_STRIDEB], %1
diff -Nru x264-0.142.2389+git956c8d8/common/x86/predict-c.c x264-0.142.2431+gita5831aa/common/x86/predict-c.c
--- x264-0.142.2389+git956c8d8/common/x86/predict-c.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/predict-c.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/common/x86/quant-a.asm x264-0.142.2431+gita5831aa/common/x86/quant-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/quant-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/quant-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2005-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Christian Heine <sennindemokrit@gmx.net>
 ;*          Oskar Arvidsson <oskar@irock.se>
 ;*          Henrik Gramner <henrik@gramner.com>
diff -Nru x264-0.142.2389+git956c8d8/common/x86/quant.h x264-0.142.2431+gita5831aa/common/x86/quant.h
--- x264-0.142.2389+git956c8d8/common/x86/quant.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/quant.h	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2005-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Christian Heine <sennindemokrit@gmx.net>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/x86/sad16-a.asm x264-0.142.2431+gita5831aa/common/x86/sad16-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/sad16-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/sad16-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -519,6 +519,19 @@
 SAD_X 4,  8, 16
 SAD_X 4,  8,  8
 SAD_X 4,  8,  4
+INIT_XMM xop
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16,  8
+SAD_X 3,  8, 16
+SAD_X 3,  8,  8
+SAD_X 3,  8,  4
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16,  8
+SAD_X 4,  8, 16
+SAD_X 4,  8,  8
+SAD_X 4,  8,  4
 INIT_YMM avx2
 %define XMM_REGS 7
 SAD_X 3, 16, 16
diff -Nru x264-0.142.2389+git956c8d8/common/x86/sad-a.asm x264-0.142.2431+gita5831aa/common/x86/sad-a.asm
--- x264-0.142.2389+git956c8d8/common/x86/sad-a.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/sad-a.asm	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
 ;* Copyright (C) 2003-2014 x264 project
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Laurent Aimar <fenrir@via.ecp.fr>
 ;*          Alex Izvorski <aizvorksi@gmail.com>
 ;*
diff -Nru x264-0.142.2389+git956c8d8/common/x86/util.h x264-0.142.2431+gita5831aa/common/x86/util.h
--- x264-0.142.2389+git956c8d8/common/x86/util.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/util.h	2014-07-11 01:16:23.000000000 +0000
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2008-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *          Loren Merritt <lorenm@u.washington.edu>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/common/x86/x86inc.asm x264-0.142.2431+gita5831aa/common/x86/x86inc.asm
--- x264-0.142.2389+git956c8d8/common/x86/x86inc.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/x86inc.asm	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
 ;*
 ;* Authors: Loren Merritt <lorenm@u.washington.edu>
 ;*          Anton Mitrofanov <BugMaster@narod.ru>
-;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Fiona Glaser <fiona@x264.com>
 ;*          Henrik Gramner <henrik@gramner.com>
 ;*
 ;* Permission to use, copy, modify, and/or distribute this software for any
@@ -42,6 +42,14 @@
     %define public_prefix private_prefix
 %endif
 
+%ifndef STACK_ALIGNMENT
+    %if ARCH_X86_64
+        %define STACK_ALIGNMENT 16
+    %else
+        %define STACK_ALIGNMENT 4
+    %endif
+%endif
+
 %define WIN64  0
 %define UNIX64 0
 %if ARCH_X86_64
@@ -94,8 +102,9 @@
 ; %1 = number of arguments. loads them from stack if needed.
 ; %2 = number of registers used. pushes callee-saved regs if needed.
 ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
-; %4 = (optional) stack size to be allocated. If not aligned (x86-32 ICC 10.x,
-;      MSVC or YMM), the stack will be manually aligned (to 16 or 32 bytes),
+; %4 = (optional) stack size to be allocated. The stack will be aligned before
+;      allocating the specified stack size. If the required stack alignment is
+;      larger than the known stack alignment, the stack will be manually aligned
 ;      and an extra register will be allocated to hold the original stack
 ;      pointer (to not invalidate r0m etc.). To prevent the use of an extra
 ;      register as stack pointer, request a negative stack size.
@@ -103,8 +112,10 @@
 ; PROLOGUE can also be invoked by adding the same options to cglobal
 
 ; e.g.
-; cglobal foo, 2,3,0, dst, src, tmp
-; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+; cglobal foo, 2,3,7,0x40, dst, src, tmp
+; declares a function (foo) that automatically loads two arguments (dst and
+; src) into registers, uses one additional register (tmp) plus 7 vector
+; registers (m0-m6) and allocates 0x40 bytes of stack space.
 
 ; TODO Some functions can use some args directly from the stack. If they're the
 ; last args then you can just not declare them, but if they're in the middle
@@ -304,26 +315,28 @@
     %assign n_arg_names %0
 %endmacro
 
+%define required_stack_alignment ((mmsize + 15) & ~15)
+
 %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
     %ifnum %1
         %if %1 != 0
-            %assign %%stack_alignment ((mmsize + 15) & ~15)
+            %assign %%pad 0
             %assign stack_size %1
             %if stack_size < 0
                 %assign stack_size -stack_size
             %endif
-            %assign stack_size_padded stack_size
             %if WIN64
-                %assign stack_size_padded stack_size_padded + 32 ; reserve 32 bytes for shadow space
+                %assign %%pad %%pad + 32 ; shadow space
                 %if mmsize != 8
                     %assign xmm_regs_used %2
                     %if xmm_regs_used > 8
-                        %assign stack_size_padded stack_size_padded + (xmm_regs_used-8)*16
+                        %assign %%pad %%pad + (xmm_regs_used-8)*16 ; callee-saved xmm registers
                     %endif
                 %endif
             %endif
-            %if mmsize <= 16 && HAVE_ALIGNED_STACK
-                %assign stack_size_padded stack_size_padded + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1))
+            %if required_stack_alignment <= STACK_ALIGNMENT
+                ; maintain the current stack alignment
+                %assign stack_size_padded stack_size + %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
                 SUB rsp, stack_size_padded
             %else
                 %assign %%reg_num (regs_used - 1)
@@ -332,17 +345,17 @@
                 ; it, i.e. in [rsp+stack_size_padded], so we can restore the
                 ; stack in a single instruction (i.e. mov rsp, rstk or mov
                 ; rsp, [rsp+stack_size_padded])
-                mov  rstk, rsp
                 %if %1 < 0 ; need to store rsp on stack
-                    sub  rsp, gprsize+stack_size_padded
-                    and  rsp, ~(%%stack_alignment-1)
-                    %xdefine rstkm [rsp+stack_size_padded]
-                    mov rstkm, rstk
+                    %xdefine rstkm [rsp + stack_size + %%pad]
+                    %assign %%pad %%pad + gprsize
                 %else ; can keep rsp in rstk during whole function
-                    sub  rsp, stack_size_padded
-                    and  rsp, ~(%%stack_alignment-1)
                     %xdefine rstkm rstk
                 %endif
+                %assign stack_size_padded stack_size + ((%%pad + required_stack_alignment-1) & ~(required_stack_alignment-1))
+                mov rstk, rsp
+                and rsp, ~(required_stack_alignment-1)
+                sub rsp, stack_size_padded
+                movifnidn rstkm, rstk
             %endif
             WIN64_PUSH_XMM
         %endif
@@ -351,7 +364,7 @@
 
 %macro SETUP_STACK_POINTER 1
     %ifnum %1
-        %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32)
+        %if %1 != 0 && required_stack_alignment > STACK_ALIGNMENT
             %if %1 > 0
                 %assign regs_used (regs_used + 1)
             %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2
@@ -425,7 +438,9 @@
     %assign xmm_regs_used %1
     ASSERT xmm_regs_used <= 16
     %if xmm_regs_used > 8
-        %assign stack_size_padded (xmm_regs_used-8)*16 + (~stack_offset&8) + 32
+        ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
+        %assign %%pad (xmm_regs_used-8)*16 + 32
+        %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
         SUB rsp, stack_size_padded
     %endif
     WIN64_PUSH_XMM
@@ -441,7 +456,7 @@
         %endrep
     %endif
     %if stack_size_padded > 0
-        %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0)
+        %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
             mov rsp, rstkm
         %else
             add %1, stack_size_padded
@@ -507,7 +522,7 @@
 
 %macro RET 0
 %if stack_size_padded > 0
-%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+%if required_stack_alignment > STACK_ALIGNMENT
     mov rsp, rstkm
 %else
     add rsp, stack_size_padded
@@ -563,7 +578,7 @@
 
 %macro RET 0
 %if stack_size_padded > 0
-%if mmsize == 32 || HAVE_ALIGNED_STACK == 0
+%if required_stack_alignment > STACK_ALIGNMENT
     mov rsp, rstkm
 %else
     add rsp, stack_size_padded
@@ -803,12 +818,12 @@
     %assign %%i 0
     %rep 8
     CAT_XDEFINE m, %%i, mm %+ %%i
-    CAT_XDEFINE nmm, %%i, %%i
+    CAT_XDEFINE nnmm, %%i, %%i
     %assign %%i %%i+1
     %endrep
     %rep 8
     CAT_UNDEF m, %%i
-    CAT_UNDEF nmm, %%i
+    CAT_UNDEF nnmm, %%i
     %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
@@ -829,7 +844,7 @@
     %assign %%i 0
     %rep num_mmregs
     CAT_XDEFINE m, %%i, xmm %+ %%i
-    CAT_XDEFINE nxmm, %%i, %%i
+    CAT_XDEFINE nnxmm, %%i, %%i
     %assign %%i %%i+1
     %endrep
     INIT_CPUFLAGS %1
@@ -899,7 +914,7 @@
 %endrep
 %rep %0/2
     %xdefine m%1 %%tmp%2
-    CAT_XDEFINE n, m%1, %1
+    CAT_XDEFINE nn, m%1, %1
     %rotate 2
 %endrep
 %endmacro
@@ -917,16 +932,16 @@
         %xdefine %%tmp m%1
         %xdefine m%1 m%2
         %xdefine m%2 %%tmp
-        CAT_XDEFINE n, m%1, %1
-        CAT_XDEFINE n, m%2, %2
+        CAT_XDEFINE nn, m%1, %1
+        CAT_XDEFINE nn, m%2, %2
     %rotate 1
     %endrep
 %endmacro
 
 %macro SWAP_INTERNAL_NAME 2-*
-    %xdefine %%args n %+ %1
+    %xdefine %%args nn %+ %1
     %rep %0-1
-        %xdefine %%args %%args, n %+ %2
+        %xdefine %%args %%args, nn %+ %2
     %rotate 1
     %endrep
     SWAP_INTERNAL_NUM %%args
@@ -953,7 +968,7 @@
         %assign %%i 0
         %rep num_mmregs
             CAT_XDEFINE m, %%i, %1_m %+ %%i
-            CAT_XDEFINE n, m %+ %%i, %%i
+            CAT_XDEFINE nn, m %+ %%i, %%i
         %assign %%i %%i+1
         %endrep
     %endif
@@ -1385,15 +1400,18 @@
     %macro %1 4-7 %1, %2, %3
         %if cpuflag(xop)
             v%5 %1, %2, %3, %4
-        %else
+        %elifnidn %1, %4
             %6 %1, %2, %3
             %7 %1, %4
+        %else
+            %error non-xop emulation of ``%5 %1, %2, %3, %4'' is not supported
         %endif
     %endmacro
 %endmacro
 
-FMA_INSTR  pmacsdd,  pmulld, paddd
 FMA_INSTR  pmacsww,  pmullw, paddw
+FMA_INSTR  pmacsdd,  pmulld, paddd ; sse4 emulation
+FMA_INSTR pmacsdql,  pmuldq, paddq ; sse4 emulation
 FMA_INSTR pmadcswd, pmaddwd, paddd
 
 ; convert FMA4 to FMA3 if possible
diff -Nru x264-0.142.2389+git956c8d8/common/x86/x86util.asm x264-0.142.2431+gita5831aa/common/x86/x86util.asm
--- x264-0.142.2389+git956c8d8/common/x86/x86util.asm	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/common/x86/x86util.asm	2014-07-11 01:16:23.000000000 +0000
@@ -298,11 +298,16 @@
     paddd   %1, %2
 %endif
 %if mmsize >= 16
+%if cpuflag(xop) && sizeof%1 == 16
+    vphadddq %1, %1
+%endif
     movhlps %2, %1
     paddd   %1, %2
 %endif
+%if notcpuflag(xop) || sizeof%1 != 16
     PSHUFLW %2, %1, q0032
     paddd   %1, %2
+%endif
 %undef %1
 %undef %2
 %endmacro
diff -Nru x264-0.142.2389+git956c8d8/config.guess x264-0.142.2431+gita5831aa/config.guess
--- x264-0.142.2389+git956c8d8/config.guess	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/config.guess	2014-07-11 01:16:23.000000000 +0000
@@ -1,14 +1,12 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011, 2012 Free Software Foundation, Inc.
+#   Copyright 1992-2013 Free Software Foundation, Inc.
 
-timestamp='2012-09-25'
+timestamp='2013-06-10'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+# the Free Software Foundation; either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
@@ -22,19 +20,17 @@
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-
-# Originally written by Per Bothner.  Please send patches (context
-# diff format) to <config-patches@gnu.org> and include a ChangeLog
-# entry.
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
 #
-# This script attempts to guess a canonical system name similar to
-# config.sub.  If it succeeds, it prints the system name on stdout, and
-# exits with 0.  Otherwise, it exits with 1.
+# Originally written by Per Bothner.
 #
 # You can get the latest version of this script from:
 # http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+#
+# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+
 
 me=`echo "$0" | sed -e 's,.*/,,'`
 
@@ -54,9 +50,7 @@
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-Free Software Foundation, Inc.
+Copyright 1992-2013 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -138,6 +132,27 @@
 UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
 UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 
+case "${UNAME_SYSTEM}" in
+Linux|GNU|GNU/*)
+	# If the system lacks a compiler, then just pick glibc.
+	# We could probably try harder.
+	LIBC=gnu
+
+	eval $set_cc_for_build
+	cat <<-EOF > $dummy.c
+	#include <features.h>
+	#if defined(__UCLIBC__)
+	LIBC=uclibc
+	#elif defined(__dietlibc__)
+	LIBC=dietlibc
+	#else
+	LIBC=gnu
+	#endif
+	EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+	;;
+esac
+
 # Note: order is significant - the case branches are not exclusive.
 
 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
@@ -859,21 +874,21 @@
 	exit ;;
     *:GNU:*:*)
 	# the GNU system
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
 	exit ;;
     *:GNU/*:*:*)
 	# other systems with GNU libc and userland
-	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
 	exit ;;
     i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
     aarch64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     aarch64_be:Linux:*:*)
 	UNAME_MACHINE=aarch64_be
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     alpha:Linux:*:*)
 	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
@@ -886,59 +901,54 @@
 	  EV68*) UNAME_MACHINE=alphaev68 ;;
 	esac
 	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	if test "$?" = 0 ; then LIBC="gnulibc1" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
+    arc:Linux:*:* | arceb:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     arm*:Linux:*:*)
 	eval $set_cc_for_build
 	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
 	    | grep -q __ARM_EABI__
 	then
-	    echo ${UNAME_MACHINE}-unknown-linux-gnu
+	    echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	else
 	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
 		| grep -q __ARM_PCS_VFP
 	    then
-		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
 	    else
-		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
+		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
 	    fi
 	fi
 	exit ;;
     avr32*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     cris:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
 	exit ;;
     crisv32:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
 	exit ;;
     frv:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     hexagon:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     i*86:Linux:*:*)
-	LIBC=gnu
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#ifdef __dietlibc__
-	LIBC=dietlibc
-	#endif
-EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
-	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+	echo ${UNAME_MACHINE}-pc-linux-${LIBC}
 	exit ;;
     ia64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     m32r*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     m68*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     mips:Linux:*:* | mips64:Linux:*:*)
 	eval $set_cc_for_build
@@ -957,54 +967,63 @@
 	#endif
 EOF
 	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
 	;;
+    or1k:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	exit ;;
     or32:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     padre:Linux:*:*)
-	echo sparc-unknown-linux-gnu
+	echo sparc-unknown-linux-${LIBC}
 	exit ;;
     parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-gnu
+	echo hppa64-unknown-linux-${LIBC}
 	exit ;;
     parisc:Linux:*:* | hppa:Linux:*:*)
 	# Look for CPU level
 	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
-	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
-	  *)    echo hppa-unknown-linux-gnu ;;
+	  PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
+	  PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
+	  *)    echo hppa-unknown-linux-${LIBC} ;;
 	esac
 	exit ;;
     ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-gnu
+	echo powerpc64-unknown-linux-${LIBC}
 	exit ;;
     ppc:Linux:*:*)
-	echo powerpc-unknown-linux-gnu
+	echo powerpc-unknown-linux-${LIBC}
+	exit ;;
+    ppc64le:Linux:*:*)
+	echo powerpc64le-unknown-linux-${LIBC}
+	exit ;;
+    ppcle:Linux:*:*)
+	echo powerpcle-unknown-linux-${LIBC}
 	exit ;;
     s390:Linux:*:* | s390x:Linux:*:*)
-	echo ${UNAME_MACHINE}-ibm-linux
+	echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
 	exit ;;
     sh64*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     sh*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     vax:Linux:*:*)
-	echo ${UNAME_MACHINE}-dec-linux-gnu
+	echo ${UNAME_MACHINE}-dec-linux-${LIBC}
 	exit ;;
     x86_64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     xtensa*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
 	exit ;;
     i*86:DYNIX/ptx:4*:*)
 	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
@@ -1237,19 +1256,21 @@
 	exit ;;
     *:Darwin:*:*)
 	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
-	case $UNAME_PROCESSOR in
-	    i386)
-		eval $set_cc_for_build
-		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
-		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
-		      grep IS_64BIT_ARCH >/dev/null
-		  then
-		      UNAME_PROCESSOR="x86_64"
-		  fi
-		fi ;;
-	    unknown) UNAME_PROCESSOR=powerpc ;;
-	esac
+	eval $set_cc_for_build
+	if test "$UNAME_PROCESSOR" = unknown ; then
+	    UNAME_PROCESSOR=powerpc
+	fi
+	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+	    if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		grep IS_64BIT_ARCH >/dev/null
+	    then
+		case $UNAME_PROCESSOR in
+		    i386) UNAME_PROCESSOR=x86_64 ;;
+		    powerpc) UNAME_PROCESSOR=powerpc64 ;;
+		esac
+	    fi
+	fi
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
 	exit ;;
     *:procnto*:*:* | *:QNX:[0123456789]*:*)
diff -Nru x264-0.142.2389+git956c8d8/config.sub x264-0.142.2431+gita5831aa/config.sub
--- x264-0.142.2389+git956c8d8/config.sub	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/config.sub	2014-07-11 01:16:23.000000000 +0000
@@ -1,24 +1,18 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
-#   2011, 2012 Free Software Foundation, Inc.
-
-timestamp='2012-12-06'
-
-# This file is (in principle) common to ALL GNU software.
-# The presence of a machine in this file suggests that SOME GNU software
-# can handle that machine.  It does not imply ALL GNU software can.
-#
-# This file is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2 of the License, or
+#   Copyright 1992-2013 Free Software Foundation, Inc.
+
+timestamp='2013-08-10'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
 # (at your option) any later version.
 #
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
@@ -26,11 +20,12 @@
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
 
 
-# Please send patches to <config-patches@gnu.org>.  Submit a context
-# diff and a properly formatted GNU ChangeLog entry.
+# Please send patches with a ChangeLog entry to config-patches@gnu.org.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
@@ -73,9 +68,7 @@
 version="\
 GNU config.sub ($timestamp)
 
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
-2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-Free Software Foundation, Inc.
+Copyright 1992-2013 Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -259,12 +252,12 @@
 	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
 	| am33_2.0 \
-	| arc \
+	| arc | arceb \
 	| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
 	| avr | avr32 \
 	| be32 | be64 \
 	| bfin \
-	| c4x | clipper \
+	| c4x | c8051 | clipper \
 	| d10v | d30v | dlx | dsp16xx \
 	| epiphany \
 	| fido | fr30 | frv \
@@ -293,16 +286,17 @@
 	| mipsisa64r2 | mipsisa64r2el \
 	| mipsisa64sb1 | mipsisa64sb1el \
 	| mipsisa64sr71k | mipsisa64sr71kel \
+	| mipsr5900 | mipsr5900el \
 	| mipstx39 | mipstx39el \
 	| mn10200 | mn10300 \
 	| moxie \
 	| mt \
 	| msp430 \
 	| nds32 | nds32le | nds32be \
-	| nios | nios2 \
+	| nios | nios2 | nios2eb | nios2el \
 	| ns16k | ns32k \
 	| open8 \
-	| or32 \
+	| or1k | or32 \
 	| pdp10 | pdp11 | pj | pjl \
 	| powerpc | powerpc64 | powerpc64le | powerpcle \
 	| pyramid \
@@ -372,13 +366,13 @@
 	| aarch64-* | aarch64_be-* \
 	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
 	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
 	| avr-* | avr32-* \
 	| be32-* | be64-* \
 	| bfin-* | bs2000-* \
 	| c[123]* | c30-* | [cjt]90-* | c4x-* \
-	| clipper-* | craynv-* | cydra-* \
+	| c8051-* | clipper-* | craynv-* | cydra-* \
 	| d10v-* | d30v-* | dlx-* \
 	| elxsi-* \
 	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
@@ -410,12 +404,13 @@
 	| mipsisa64r2-* | mipsisa64r2el-* \
 	| mipsisa64sb1-* | mipsisa64sb1el-* \
 	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
+	| mipsr5900-* | mipsr5900el-* \
 	| mipstx39-* | mipstx39el-* \
 	| mmix-* \
 	| mt-* \
 	| msp430-* \
 	| nds32-* | nds32le-* | nds32be-* \
-	| nios-* | nios2-* \
+	| nios-* | nios2-* | nios2eb-* | nios2el-* \
 	| none-* | np1-* | ns16k-* | ns32k-* \
 	| open8-* \
 	| orion-* \
@@ -799,7 +794,7 @@
 		os=-mingw64
 		;;
 	mingw32)
-		basic_machine=i386-pc
+		basic_machine=i686-pc
 		os=-mingw32
 		;;
 	mingw32ce)
@@ -835,7 +830,7 @@
 		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
 		;;
 	msys)
-		basic_machine=i386-pc
+		basic_machine=i686-pc
 		os=-msys
 		;;
 	mvs)
@@ -1357,7 +1352,7 @@
 	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
 	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
 	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-	      | -sym* | -kopensolaris* \
+	      | -sym* | -kopensolaris* | -plan9* \
 	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
 	      | -aos* | -aros* \
 	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
@@ -1503,9 +1498,6 @@
 	-aros*)
 		os=-aros
 		;;
-	-kaos*)
-		os=-kaos
-		;;
 	-zvmoe)
 		os=-zvmoe
 		;;
@@ -1554,6 +1546,9 @@
 	c4x-* | tic4x-*)
 		os=-coff
 		;;
+	c8051-*)
+		os=-elf
+		;;
 	hexagon-*)
 		os=-elf
 		;;
@@ -1597,6 +1592,9 @@
 	mips*-*)
 		os=-elf
 		;;
+	or1k-*)
+		os=-elf
+		;;
 	or32-*)
 		os=-coff
 		;;
diff -Nru x264-0.142.2389+git956c8d8/configure x264-0.142.2431+gita5831aa/configure
--- x264-0.142.2389+git956c8d8/configure	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/configure	2014-07-11 01:16:23.000000000 +0000
@@ -467,7 +467,6 @@
         ;;
     darwin*)
         SYS="MACOSX"
-        CFLAGS="$CFLAGS -falign-loops=16"
         libm="-lm"
         if [ "$pic" = "no" ]; then
             cc_check "" -mdynamic-no-pic && CFLAGS="$CFLAGS -mdynamic-no-pic"
@@ -557,7 +556,7 @@
 
 LDFLAGS="$LDFLAGS $libm"
 
-aligned_stack=1
+stack_alignment=16
 case $host_cpu in
     i*86)
         ARCH="X86"
@@ -577,8 +576,7 @@
             if [ $SYS = LINUX ]; then
                 # < 11 is completely incapable of keeping a mod16 stack
                 if cpp_check "" "" "__INTEL_COMPILER < 1100" ; then
-                    define BROKEN_STACK_ALIGNMENT
-                    aligned_stack=0
+                    stack_alignment=4
                 # 11 <= x < 12 is capable of keeping a mod16 stack, but defaults to not doing so.
                 elif cpp_check "" "" "__INTEL_COMPILER < 1200" ; then
                     CFLAGS="$CFLAGS -falign-stack=assume-16-byte"
@@ -586,7 +584,7 @@
                 # >= 12 defaults to a mod16 stack
             fi
             # icl on windows has no mod16 stack support
-            [ $SYS = WINDOWS ] && define BROKEN_STACK_ALIGNMENT && aligned_stack=0
+            [ $SYS = WINDOWS ] && stack_alignment=4
         fi
         if [ "$SYS" = MACOSX ]; then
             ASFLAGS="$ASFLAGS -f macho -DPREFIX"
@@ -681,7 +679,6 @@
         ARCH="$(echo $host_cpu | tr a-z A-Z)"
         ;;
 esac
-ASFLAGS="$ASFLAGS -DHAVE_ALIGNED_STACK=${aligned_stack}"
 
 if [ $SYS = WINDOWS ]; then
     if ! rc_check "0 RCDATA {0}" ; then
@@ -713,7 +710,7 @@
     fi
 fi
 
-if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then
+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then
     pic="yes"
 fi
 
@@ -733,10 +730,11 @@
         echo "If you really want to compile without asm, configure with --disable-asm."
         exit 1
     fi
+    ASFLAGS="$ASFLAGS -Worphan-labels"
     define HAVE_MMX
-    if cc_check '' -mpreferred-stack-boundary=5 ; then
+    if [ $compiler = GNU ] && cc_check '' -mpreferred-stack-boundary=5 ; then
         CFLAGS="$CFLAGS -mpreferred-stack-boundary=5"
-        define HAVE_32B_STACK_ALIGNMENT
+        stack_alignment=32
     fi
 fi
 
@@ -761,6 +759,9 @@
 define ARCH_$ARCH
 define SYS_$SYS
 
+define STACK_ALIGNMENT $stack_alignment
+ASFLAGS="$ASFLAGS -DSTACK_ALIGNMENT=$stack_alignment"
+
 # skip endianness check for Intel Compiler, as all supported platforms are little. the -ipo flag will also cause the check to fail
 if [ $compiler = GNU ]; then
     echo "int i[2] = {0x42494745,0}; double f[2] = {0x1.0656e6469616ep+102,0};" > conftest.c
@@ -806,10 +807,15 @@
             fi
             ;;
         QNX)
-            cc_check pthread.h -lc && thread="posix" && libpthread="-lc"
+            cc_check pthread.h -lc "pthread_create(0,0,0,0);" && thread="posix" && libpthread="-lc"
             ;;
         *)
-            cc_check pthread.h -lpthread && thread="posix" && libpthread="-lpthread"
+            if cc_check pthread.h -lpthread "pthread_create(0,0,0,0);" ; then
+               thread="posix"
+               libpthread="-lpthread"
+            else
+                cc_check pthread.h "" "pthread_create(0,0,0,0);" && thread="posix" && libpthread=""
+            fi
             ;;
     esac
 fi
@@ -998,7 +1004,6 @@
 fi
 
 if [ "$strip" = "yes" ]; then
-    CFLAGS="$CFLAGS -s"
     LDFLAGS="$LDFLAGS -s"
 fi
 
diff -Nru x264-0.142.2389+git956c8d8/debian/changelog x264-0.142.2431+gita5831aa/debian/changelog
--- x264-0.142.2389+git956c8d8/debian/changelog	2014-05-12 23:02:24.000000000 +0000
+++ x264-0.142.2431+gita5831aa/debian/changelog	2014-07-11 01:21:53.000000000 +0000
@@ -1,3 +1,19 @@
+x264 (2:0.142.2431+gita5831aa-1) unstable; urgency=low
+
+  * Update to new upstream snapshot
+  * Imported Upstream version 0.142.2431+gita5831aa
+  * Fixes for armel and armhf (Closes: #752168)
+
+ -- Reinhard Tartler <siretart@tauware.de>  Thu, 10 Jul 2014 21:21:51 -0400
+
+x264 (2:0.142.2412+gitd7e6896-1) unstable; urgency=medium
+
+  * Update to new upstream snapshot
+  * Imported Upstream version 0.142.2412+gitd7e6896
+  * Drop powerpcspe.patch which got upstreamed
+
+ -- Rico Tzschichholz <ricotz@ubuntu.com>  Sat, 07 Jun 2014 09:16:36 +0200
+
 x264 (2:0.142.2389+git956c8d8-5) unstable; urgency=medium
 
   [ Peter Michael Green ]
diff -Nru x264-0.142.2389+git956c8d8/debian/patches/aarch64-pic.patch x264-0.142.2431+gita5831aa/debian/patches/aarch64-pic.patch
--- x264-0.142.2389+git956c8d8/debian/patches/aarch64-pic.patch	2014-05-11 19:10:39.000000000 +0000
+++ x264-0.142.2431+gita5831aa/debian/patches/aarch64-pic.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,14 +0,0 @@
-Description: Build with -fPIC on aarch64 to fix link failure.
-Author: William Grant <wgrant@ubuntu.com>
-
---- a/configure
-+++ b/configure
-@@ -713,7 +713,7 @@ if [ $compiler != ICL ]; then
-     fi
- fi
- 
--if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" \) ] ; then
-+if [ $shared = yes -a \( $ARCH = "X86_64" -o $ARCH = "PPC" -o $ARCH = "ALPHA" -o $ARCH = "ARM" -o $ARCH = "IA64" -o $ARCH = "PARISC" -o $ARCH = "MIPS" -o $ARCH = "AARCH64" \) ] ; then
-     pic="yes"
- fi
- 
diff -Nru x264-0.142.2389+git956c8d8/debian/patches/link_gpac_dynamically.patch x264-0.142.2431+gita5831aa/debian/patches/link_gpac_dynamically.patch
--- x264-0.142.2389+git956c8d8/debian/patches/link_gpac_dynamically.patch	2014-05-12 23:00:26.000000000 +0000
+++ x264-0.142.2431+gita5831aa/debian/patches/link_gpac_dynamically.patch	2014-07-11 01:22:39.000000000 +0000
@@ -1,6 +1,6 @@
 --- a/configure
 +++ b/configure
-@@ -940,7 +940,7 @@ fi
+@@ -946,7 +946,7 @@ fi
  
  if [ "$gpac" = "auto" -a "$lsmash" != "yes" ] ; then
      gpac="no"
diff -Nru x264-0.142.2389+git956c8d8/debian/patches/powerpcspe.patch x264-0.142.2431+gita5831aa/debian/patches/powerpcspe.patch
--- x264-0.142.2389+git956c8d8/debian/patches/powerpcspe.patch	2014-05-11 19:10:39.000000000 +0000
+++ x264-0.142.2431+gita5831aa/debian/patches/powerpcspe.patch	1970-01-01 00:00:00.000000000 +0000
@@ -1,20 +0,0 @@
---- a/common/cpu.c
-+++ b/common/cpu.c
-@@ -338,6 +338,9 @@ uint32_t x264_cpu_detect( void )
- 
- uint32_t x264_cpu_detect( void )
- {
-+#ifdef __NO_FPRS__
-+    return 0;
-+#else
-     static void (*oldsig)( int );
- 
-     oldsig = signal( SIGILL, sigill_handler );
-@@ -357,6 +360,7 @@ uint32_t x264_cpu_detect( void )
-     signal( SIGILL, oldsig );
- 
-     return X264_CPU_ALTIVEC;
-+#endif
- }
- #endif
- 
diff -Nru x264-0.142.2389+git956c8d8/debian/patches/series x264-0.142.2431+gita5831aa/debian/patches/series
--- x264-0.142.2389+git956c8d8/debian/patches/series	2014-05-12 22:58:28.000000000 +0000
+++ x264-0.142.2431+gita5831aa/debian/patches/series	2014-07-11 01:22:34.000000000 +0000
@@ -1,3 +1 @@
-aarch64-pic.patch
-powerpcspe.patch
 link_gpac_dynamically.patch
diff -Nru x264-0.142.2389+git956c8d8/debian/x264-get-orig-source x264-0.142.2431+gita5831aa/debian/x264-get-orig-source
--- x264-0.142.2389+git956c8d8/debian/x264-get-orig-source	2014-05-11 19:10:39.000000000 +0000
+++ x264-0.142.2431+gita5831aa/debian/x264-get-orig-source	2014-07-11 01:15:14.000000000 +0000
@@ -3,8 +3,8 @@
 # Script used to generate the orig source tarball for x264.
 
 X264_GIT_URL="git://git.videolan.org/x264.git"
-X264_GIT_COMMIT="956c8d8c2a3c2fb1f2f17807532321e492c75efc"
-DATE_RETRIEVED="20140116"
+X264_GIT_COMMIT="a5831aa256b3161f898d2577d2eb8daa838d88d2"
+DATE_RETRIEVED="20140422"
 COMMIT_SHORT_FORM="$(echo $X264_GIT_COMMIT | \
                      sed -e 's/^\([[:xdigit:]]\{,7\}\).*/\1/')"
 
diff -Nru x264-0.142.2389+git956c8d8/encoder/analyse.c x264-0.142.2431+gita5831aa/encoder/analyse.c
--- x264-0.142.2389+git956c8d8/encoder/analyse.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/analyse.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/encoder/cabac.c x264-0.142.2431+gita5831aa/encoder/cabac.c
--- x264-0.142.2389+git956c8d8/encoder/cabac.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/cabac.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/encoder/cavlc.c x264-0.142.2431+gita5831aa/encoder/cavlc.c
--- x264-0.142.2389+git956c8d8/encoder/cavlc.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/cavlc.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -500,6 +500,9 @@
         && (!(h->mb.i_mb_y & 1) || IS_SKIP(h->mb.type[h->mb.i_mb_xy - h->mb.i_mb_stride])) )
     {
         bs_write1( s, MB_INTERLACED );
+#if !RDO_SKIP_BS
+        h->mb.field_decoding_flag = MB_INTERLACED;
+#endif
     }
 
 #if !RDO_SKIP_BS
diff -Nru x264-0.142.2389+git956c8d8/encoder/encoder.c x264-0.142.2431+gita5831aa/encoder/encoder.c
--- x264-0.142.2389+git956c8d8/encoder/encoder.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/encoder.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -2557,7 +2557,7 @@
     }
 }
 
-static int x264_slice_write( x264_t *h )
+static intptr_t x264_slice_write( x264_t *h )
 {
     int i_skip;
     int mb_xy, i_mb_x, i_mb_y;
@@ -2567,7 +2567,8 @@
      * other inaccuracies. */
     int overhead_guess = (NALU_OVERHEAD - (h->param.b_annexb && h->out.i_nal)) + 1 + h->param.b_cabac + 5;
     int slice_max_size = h->param.i_slice_max_size > 0 ? (h->param.i_slice_max_size-overhead_guess)*8 : 0;
-    int back_up_bitstream = slice_max_size || (!h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH);
+    int back_up_bitstream_cavlc = !h->param.b_cabac && h->sps->i_profile_idc < PROFILE_HIGH;
+    int back_up_bitstream = slice_max_size || back_up_bitstream_cavlc;
     int starting_bits = bs_pos(&h->out.bs);
     int b_deblock = h->sh.i_disable_deblocking_filter_idc != 1;
     int b_hpel = h->fdec->b_kept_as_ref;
@@ -2575,9 +2576,10 @@
     int thread_last_mb = h->i_threadslice_end * h->mb.i_mb_width - 1;
     uint8_t *last_emu_check;
 #define BS_BAK_SLICE_MAX_SIZE 0
-#define BS_BAK_SLICE_MIN_MBS  1
-#define BS_BAK_ROW_VBV        2
-    x264_bs_bak_t bs_bak[3];
+#define BS_BAK_CAVLC_OVERFLOW 1
+#define BS_BAK_SLICE_MIN_MBS  2
+#define BS_BAK_ROW_VBV        3
+    x264_bs_bak_t bs_bak[4];
     b_deblock &= b_hpel || h->param.b_full_recon || h->param.psz_dump_yuv;
     bs_realign( &h->out.bs );
 
@@ -2630,11 +2632,16 @@
                 x264_fdec_filter_row( h, i_mb_y, 0 );
         }
 
-        if( !(i_mb_y & SLICE_MBAFF) && back_up_bitstream )
+        if( back_up_bitstream )
         {
-            x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 );
-            if( slice_max_size && (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs )
-                x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 );
+            if( back_up_bitstream_cavlc )
+                x264_bitstream_backup( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], i_skip, 0 );
+            if( slice_max_size && !(i_mb_y & SLICE_MBAFF) )
+            {
+                x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], i_skip, 0 );
+                if( (thread_last_mb+1-mb_xy) == h->param.i_slice_min_mbs )
+                    x264_bitstream_backup( h, &bs_bak[BS_BAK_SLICE_MIN_MBS], i_skip, 0 );
+            }
         }
 
         if( PARAM_INTERLACED )
@@ -2698,7 +2705,7 @@
                     h->mb.i_skip_intra = 0;
                     h->mb.b_skip_mc = 0;
                     h->mb.b_overflow = 0;
-                    x264_bitstream_restore( h, &bs_bak[BS_BAK_SLICE_MAX_SIZE], &i_skip, 0 );
+                    x264_bitstream_restore( h, &bs_bak[BS_BAK_CAVLC_OVERFLOW], &i_skip, 0 );
                     goto reencode;
                 }
             }
diff -Nru x264-0.142.2389+git956c8d8/encoder/macroblock.c x264-0.142.2431+gita5831aa/encoder/macroblock.c
--- x264-0.142.2389+git956c8d8/encoder/macroblock.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/macroblock.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/encoder/me.c x264-0.142.2431+gita5831aa/encoder/me.c
--- x264-0.142.2389+git956c8d8/encoder/me.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/me.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/encoder/ratecontrol.c x264-0.142.2431+gita5831aa/encoder/ratecontrol.c
--- x264-0.142.2389+git956c8d8/encoder/ratecontrol.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/ratecontrol.c	2014-07-11 01:16:23.000000000 +0000
@@ -6,7 +6,7 @@
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Michael Niedermayer <michaelni@gmx.at>
  *          Gabriel Bouvigne <gabriel.bouvigne@joost.com>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Måns Rullgård <mru@mru.ath.cx>
  *
  * This program is free software; you can redistribute it and/or modify
diff -Nru x264-0.142.2389+git956c8d8/encoder/rdo.c x264-0.142.2431+gita5831aa/encoder/rdo.c
--- x264-0.142.2389+git956c8d8/encoder/rdo.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/rdo.c	2014-07-11 01:16:23.000000000 +0000
@@ -4,7 +4,7 @@
  * Copyright (C) 2005-2014 x264 project
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
diff -Nru x264-0.142.2389+git956c8d8/encoder/set.c x264-0.142.2431+gita5831aa/encoder/set.c
--- x264-0.142.2389+git956c8d8/encoder/set.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/set.c	2014-07-11 01:16:23.000000000 +0000
@@ -228,7 +228,8 @@
     }
 
     /* FIXME: not sufficient for interlaced video */
-    sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5;
+    sps->vui.b_chroma_loc_info_present = param->vui.i_chroma_loc > 0 && param->vui.i_chroma_loc <= 5 &&
+                                         sps->i_chroma_format_idc == CHROMA_420;
     if( sps->vui.b_chroma_loc_info_present )
     {
         sps->vui.i_chroma_loc_top = param->vui.i_chroma_loc;
diff -Nru x264-0.142.2389+git956c8d8/encoder/slicetype.c x264-0.142.2431+gita5831aa/encoder/slicetype.c
--- x264-0.142.2389+git956c8d8/encoder/slicetype.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/encoder/slicetype.c	2014-07-11 01:16:23.000000000 +0000
@@ -3,7 +3,7 @@
  *****************************************************************************
  * Copyright (C) 2005-2014 x264 project
  *
- * Authors: Jason Garrett-Glaser <darkshikari@gmail.com>
+ * Authors: Fiona Glaser <fiona@x264.com>
  *          Loren Merritt <lorenm@u.washington.edu>
  *          Dylan Yudaken <dyudaken@gmail.com>
  *
@@ -1022,9 +1022,12 @@
     return i_score;
 }
 
+/* Trade off precision in mbtree for increased range */
+#define MBTREE_PRECISION 0.5f
+
 static void x264_macroblock_tree_finish( x264_t *h, x264_frame_t *frame, float average_duration, int ref0_distance )
 {
-    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 );
+    int fps_factor = round( CLIP_DURATION(average_duration) / CLIP_DURATION(frame->f_duration) * 256 / MBTREE_PRECISION );
     float weightdelta = 0.0;
     if( ref0_distance && frame->f_weighted_cost_delta[ref0_distance-1] > 0 )
         weightdelta = (1.0 - frame->f_weighted_cost_delta[ref0_distance-1]);
@@ -1051,11 +1054,12 @@
     int i_bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor>>2) : 32;
     int16_t (*mvs[2])[2] = { frames[b]->lowres_mvs[0][b-p0-1], frames[b]->lowres_mvs[1][p1-b-1] };
     int bipred_weights[2] = {i_bipred_weight, 64 - i_bipred_weight};
-    int *buf = h->scratch_buffer;
+    int16_t *buf = h->scratch_buffer;
     uint16_t *propagate_cost = frames[b]->i_propagate_cost;
+    uint16_t *lowres_costs = frames[b]->lowres_costs[b-p0][p1-b];
 
     x264_emms();
-    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / CLIP_DURATION(average_duration);
+    float fps_factor = CLIP_DURATION(frames[b]->f_duration) / (CLIP_DURATION(average_duration) * 256.0f) * MBTREE_PRECISION;
 
     /* For non-reffed frames the source costs are always zero, so just memset one row and re-use it. */
     if( !referenced )
@@ -1065,72 +1069,17 @@
     {
         int mb_index = h->mb.i_mb_y*h->mb.i_mb_stride;
         h->mc.mbtree_propagate_cost( buf, propagate_cost,
-            frames[b]->i_intra_cost+mb_index, frames[b]->lowres_costs[b-p0][p1-b]+mb_index,
+            frames[b]->i_intra_cost+mb_index, lowres_costs+mb_index,
             frames[b]->i_inv_qscale_factor+mb_index, &fps_factor, h->mb.i_mb_width );
         if( referenced )
             propagate_cost += h->mb.i_mb_width;
-        for( h->mb.i_mb_x = 0; h->mb.i_mb_x < h->mb.i_mb_width; h->mb.i_mb_x++, mb_index++ )
+
+        h->mc.mbtree_propagate_list( h, ref_costs[0], &mvs[0][mb_index], buf, &lowres_costs[mb_index],
+                                     bipred_weights[0], h->mb.i_mb_y, h->mb.i_mb_width, 0 );
+        if( b != p1 )
         {
-            int propagate_amount = buf[h->mb.i_mb_x];
-            /* Don't propagate for an intra block. */
-            if( propagate_amount > 0 )
-            {
-                /* Access width-2 bitfield. */
-                int lists_used = frames[b]->lowres_costs[b-p0][p1-b][mb_index] >> LOWRES_COST_SHIFT;
-                /* Follow the MVs to the previous frame(s). */
-                for( int list = 0; list < 2; list++ )
-                    if( (lists_used >> list)&1 )
-                    {
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<16)-1)
-                        int listamount = propagate_amount;
-                        /* Apply bipred weighting. */
-                        if( lists_used == 3 )
-                            listamount = (listamount * bipred_weights[list] + 32) >> 6;
-
-                        /* Early termination for simple case of mv0. */
-                        if( !M32( mvs[list][mb_index] ) )
-                        {
-                            CLIP_ADD( ref_costs[list][mb_index], listamount );
-                            continue;
-                        }
-
-                        int x = mvs[list][mb_index][0];
-                        int y = mvs[list][mb_index][1];
-                        int mbx = (x>>5)+h->mb.i_mb_x;
-                        int mby = (y>>5)+h->mb.i_mb_y;
-                        int idx0 = mbx + mby * h->mb.i_mb_stride;
-                        int idx1 = idx0 + 1;
-                        int idx2 = idx0 + h->mb.i_mb_stride;
-                        int idx3 = idx0 + h->mb.i_mb_stride + 1;
-                        x &= 31;
-                        y &= 31;
-                        int idx0weight = (32-y)*(32-x);
-                        int idx1weight = (32-y)*x;
-                        int idx2weight = y*(32-x);
-                        int idx3weight = y*x;
-
-                        /* We could just clip the MVs, but pixels that lie outside the frame probably shouldn't
-                         * be counted. */
-                        if( mbx < h->mb.i_mb_width-1 && mby < h->mb.i_mb_height-1 && mbx >= 0 && mby >= 0 )
-                        {
-                            CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
-                            CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
-                        }
-                        else /* Check offsets individually */
-                        {
-                            if( mbx < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx >= 0 && mby >= 0 )
-                                CLIP_ADD( ref_costs[list][idx0], (listamount*idx0weight+512)>>10 );
-                            if( mbx+1 < h->mb.i_mb_width && mby < h->mb.i_mb_height && mbx+1 >= 0 && mby >= 0 )
-                                CLIP_ADD( ref_costs[list][idx1], (listamount*idx1weight+512)>>10 );
-                            if( mbx < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx >= 0 && mby+1 >= 0 )
-                                CLIP_ADD( ref_costs[list][idx2], (listamount*idx2weight+512)>>10 );
-                            if( mbx+1 < h->mb.i_mb_width && mby+1 < h->mb.i_mb_height && mbx+1 >= 0 && mby+1 >= 0 )
-                                CLIP_ADD( ref_costs[list][idx3], (listamount*idx3weight+512)>>10 );
-                        }
-                    }
-            }
+            h->mc.mbtree_propagate_list( h, ref_costs[1], &mvs[1][mb_index], buf, &lowres_costs[mb_index],
+                                         bipred_weights[1], h->mb.i_mb_y, h->mb.i_mb_width, 1 );
         }
     }
 
diff -Nru x264-0.142.2389+git956c8d8/output/matroska.c x264-0.142.2431+gita5831aa/output/matroska.c
--- x264-0.142.2389+git956c8d8/output/matroska.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/output/matroska.c	2014-07-11 01:16:23.000000000 +0000
@@ -33,6 +33,7 @@
     int width, height, d_width, d_height;
 
     int display_size_units;
+    int stereo_mode;
 
     int64_t frame_duration;
 
@@ -79,6 +80,7 @@
     p_mkv->width = p_mkv->d_width = p_param->i_width;
     p_mkv->height = p_mkv->d_height = p_param->i_height;
     p_mkv->display_size_units = DS_PIXELS;
+    p_mkv->stereo_mode = p_param->i_frame_packing;
 
     if( p_param->vui.i_sar_width && p_param->vui.i_sar_height
         && p_param->vui.i_sar_width != p_param->vui.i_sar_height )
@@ -147,7 +149,7 @@
     ret = mk_write_header( p_mkv->w, "x264" X264_VERSION, "V_MPEG4/ISO/AVC",
                            avcC, avcC_len, p_mkv->frame_duration, 50000,
                            p_mkv->width, p_mkv->height,
-                           p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units );
+                           p_mkv->d_width, p_mkv->d_height, p_mkv->display_size_units, p_mkv->stereo_mode );
     if( ret < 0 )
         return ret;
 
diff -Nru x264-0.142.2389+git956c8d8/output/matroska_ebml.c x264-0.142.2431+gita5831aa/output/matroska_ebml.c
--- x264-0.142.2389+git956c8d8/output/matroska_ebml.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/output/matroska_ebml.c	2014-07-11 01:16:23.000000000 +0000
@@ -317,13 +317,15 @@
     return w;
 }
 
+static const uint8_t mk_stereo_modes[6] = {5,9,7,1,3,13};
+
 int mk_write_header( mk_writer *w, const char *writing_app,
                      const char *codec_id,
                      const void *codec_private, unsigned codec_private_size,
                      int64_t default_frame_duration,
                      int64_t timescale,
                      unsigned width, unsigned height,
-                     unsigned d_width, unsigned d_height, int display_size_units )
+                     unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode )
 {
     mk_context  *c, *ti, *v;
 
@@ -379,6 +381,8 @@
     CHECK( mk_write_uint( v, 0x54b2, display_size_units ) );
     CHECK( mk_write_uint( v, 0x54b0, d_width ) );
     CHECK( mk_write_uint( v, 0x54ba, d_height ) );
+    if( stereo_mode >= 0 && stereo_mode <= 5 )
+        CHECK( mk_write_uint( v, 0x53b8, mk_stereo_modes[stereo_mode] ) );
     CHECK( mk_close_context( v, 0 ) );
 
     CHECK( mk_close_context( ti, 0 ) );
diff -Nru x264-0.142.2389+git956c8d8/output/matroska_ebml.h x264-0.142.2431+gita5831aa/output/matroska_ebml.h
--- x264-0.142.2389+git956c8d8/output/matroska_ebml.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/output/matroska_ebml.h	2014-07-11 01:16:23.000000000 +0000
@@ -42,7 +42,7 @@
                      int64_t default_frame_duration,
                      int64_t timescale,
                      unsigned width, unsigned height,
-                     unsigned d_width, unsigned d_height, int display_size_units );
+                     unsigned d_width, unsigned d_height, int display_size_units, int stereo_mode );
 
 int mk_start_frame( mk_writer *w );
 int mk_add_frame_data( mk_writer *w, const void *data, unsigned size );
diff -Nru x264-0.142.2389+git956c8d8/tools/checkasm.c x264-0.142.2431+gita5831aa/tools/checkasm.c
--- x264-0.142.2389+git956c8d8/tools/checkasm.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/tools/checkasm.c	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -200,7 +200,7 @@
                     b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
 #endif
                     "",
-                    ((int64_t)10*b->cycles/b->den - nop_time)/4 );
+                    (int64_t)(10*b->cycles/b->den - nop_time)/4 );
         }
 }
 
@@ -1451,6 +1451,37 @@
                 }
         }
     }
+
+    if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb )
+    {
+        set_func_name( "plane_copy_deinterleave_rgb" );
+        used_asm = 1;
+        for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+        {
+            int w = (plane_specs[i].w + 2) >> 2;
+            int h = plane_specs[i].h;
+            intptr_t src_stride = plane_specs[i].src_stride;
+            intptr_t dst_stride = ALIGN( w, 16 );
+            intptr_t offv = dst_stride*h + 16;
+
+            for( int pw = 3; pw <= 4; pw++ )
+            {
+                memset( pbuf3, 0, 0x1000 );
+                memset( pbuf4, 0, 0x1000 );
+                call_c( mc_c.plane_copy_deinterleave_rgb, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf3+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
+                call_a( mc_a.plane_copy_deinterleave_rgb, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf4+2*offv, dst_stride, pbuf1, src_stride, pw, w, h );
+                for( int y = 0; y < h; y++ )
+                    if( memcmp( pbuf3+y*dst_stride+0*offv, pbuf4+y*dst_stride+0*offv, w ) ||
+                        memcmp( pbuf3+y*dst_stride+1*offv, pbuf4+y*dst_stride+1*offv, w ) ||
+                        memcmp( pbuf3+y*dst_stride+2*offv, pbuf4+y*dst_stride+2*offv, w ) )
+                    {
+                        ok = 0;
+                        fprintf( stderr, "plane_copy_deinterleave_rgb FAILED: w=%d h=%d stride=%d pw=%d\n", w, h, (int)src_stride, pw );
+                        break;
+                    }
+            }
+        }
+    }
     report( "plane_copy :" );
 
     if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
@@ -1567,16 +1598,17 @@
     INTEGRAL_INIT( integral_init8v, 9, sum, stride );
     report( "integral init :" );
 
+    ok = 1; used_asm = 0;
     if( mc_a.mbtree_propagate_cost != mc_ref.mbtree_propagate_cost )
     {
-        ok = 1; used_asm = 1;
+        used_asm = 1;
         x264_emms();
         for( int i = 0; i < 10; i++ )
         {
-            float fps_factor = (rand()&65535) / 256.;
-            set_func_name( "mbtree_propagate" );
-            int *dsta = (int*)buf3;
-            int *dstc = dsta+400;
+            float fps_factor = (rand()&65535) / 65535.0f;
+            set_func_name( "mbtree_propagate_cost" );
+            int16_t *dsta = (int16_t*)buf3;
+            int16_t *dstc = dsta+400;
             uint16_t *prop = (uint16_t*)buf1;
             uint16_t *intra = (uint16_t*)buf4;
             uint16_t *inter = intra+128;
@@ -1598,12 +1630,60 @@
             {
                 ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
                 if( !ok )
-                    fprintf( stderr, "mbtree_propagate FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
+                    fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
             }
         }
-        report( "mbtree propagate :" );
     }
 
+    if( mc_a.mbtree_propagate_list != mc_ref.mbtree_propagate_list )
+    {
+        used_asm = 1;
+        for( int i = 0; i < 8; i++ )
+        {
+            set_func_name( "mbtree_propagate_list" );
+            x264_t h;
+            int height = 4;
+            int width = 128;
+            int size = width*height;
+            h.mb.i_mb_stride = width;
+            h.mb.i_mb_width = width;
+            h.mb.i_mb_height = height;
+
+            uint16_t *ref_costsc = (uint16_t*)buf3;
+            uint16_t *ref_costsa = (uint16_t*)buf4;
+            int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
+            int16_t *propagate_amount = (int16_t*)(mvs + width);
+            uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
+            h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
+            int bipred_weight = (rand()%63)+1;
+            int list = i&1;
+            for( int j = 0; j < size; j++ )
+                ref_costsc[j] = ref_costsa[j] = rand()&32767;
+            for( int j = 0; j < width; j++ )
+            {
+                static const uint8_t list_dist[2][8] = {{0,1,1,1,1,1,1,1},{1,1,3,3,3,3,3,2}};
+                for( int k = 0; k < 2; k++ )
+                    mvs[j][k] = (rand()&127) - 64;
+                propagate_amount[j] = rand()&32767;
+                lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
+            }
+
+            call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+
+            for( int j = 0; j < size && ok; j++ )
+            {
+                ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
+                if( !ok )
+                    fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
+            }
+
+            call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+            call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
+        }
+    }
+    report( "mbtree :" );
+
     if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
     {
         set_func_name( "memcpy_aligned" );
@@ -2530,7 +2610,7 @@
 {
     *cpu_ref = *cpu_new;
     *cpu_new |= flags;
-#if BROKEN_STACK_ALIGNMENT
+#if STACK_ALIGNMENT < 16
     *cpu_new |= X264_CPU_STACK_MOD4;
 #endif
     if( *cpu_new & X264_CPU_SSE2_IS_FAST )
diff -Nru x264-0.142.2389+git956c8d8/version.sh x264-0.142.2431+gita5831aa/version.sh
--- x264-0.142.2389+git956c8d8/version.sh	2014-02-13 23:26:08.000000000 +0000
+++ x264-0.142.2431+gita5831aa/version.sh	2014-07-11 01:16:23.000000000 +0000
@@ -1,5 +1,5 @@
 #!/bin/sh
 # Script modified from upstream source for Debian packaging since packaging
 # won't include .git repository.
-echo '#define X264_VERSION " r2389 956c8d8"'
-echo '#define X264_POINTVER "0.142.2389 956c8d8"'
+echo '#define X264_VERSION " r2431 a5831aa"'
+echo '#define X264_POINTVER "0.142.2431 a5831aa"'
diff -Nru x264-0.142.2389+git956c8d8/x264.c x264-0.142.2431+gita5831aa/x264.c
--- x264-0.142.2389+git956c8d8/x264.c	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/x264.c	2014-07-11 01:16:23.000000000 +0000
@@ -6,7 +6,7 @@
  * Authors: Loren Merritt <lorenm@u.washington.edu>
  *          Laurent Aimar <fenrir@via.ecp.fr>
  *          Steven Walters <kemuri9@gmail.com>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *          Kieran Kunhya <kieran@kunhya.com>
  *          Henrik Gramner <henrik@gramner.com>
  *
diff -Nru x264-0.142.2389+git956c8d8/x264.h x264-0.142.2431+gita5831aa/x264.h
--- x264-0.142.2389+git956c8d8/x264.h	2014-02-13 23:26:07.000000000 +0000
+++ x264-0.142.2431+gita5831aa/x264.h	2014-07-11 01:16:23.000000000 +0000
@@ -5,7 +5,7 @@
  *
  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
  *          Loren Merritt <lorenm@u.washington.edu>
- *          Jason Garrett-Glaser <darkshikari@gmail.com>
+ *          Fiona Glaser <fiona@x264.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -884,13 +884,15 @@
 /* x264_encoder_headers:
  *      return the SPS and PPS that will be used for the whole stream.
  *      *pi_nal is the number of NAL units outputted in pp_nal.
+ *      returns the number of bytes in the returned NALs.
  *      returns negative on error.
  *      the payloads of all output NALs are guaranteed to be sequential in memory. */
 int     x264_encoder_headers( x264_t *, x264_nal_t **pp_nal, int *pi_nal );
 /* x264_encoder_encode:
  *      encode one picture.
  *      *pi_nal is the number of NAL units outputted in pp_nal.
- *      returns negative on error, zero if no NAL units returned.
+ *      returns the number of bytes in the returned NALs.
+ *      returns negative on error and zero if no NAL units returned.
  *      the payloads of all output NALs are guaranteed to be sequential in memory. */
 int     x264_encoder_encode( x264_t *, x264_nal_t **pp_nal, int *pi_nal, x264_picture_t *pic_in, x264_picture_t *pic_out );
 /* x264_encoder_close: