diff --git a/library/src/main/cpp/libx264/Makefile b/library/src/main/cpp/libx264/Makefile index d0b1633..81bce65 100644 --- a/library/src/main/cpp/libx264/Makefile +++ b/library/src/main/cpp/libx264/Makefile @@ -278,7 +278,8 @@ clean: rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc distclean: clean - rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest* + rm -f config.mak x264_config.h config.h config.log x264.pc x264.def + rm -rf conftest* install-cli: cli $(INSTALL) -d $(DESTDIR)$(bindir) diff --git a/library/src/main/cpp/libx264/common/aarch64/pixel-a.S b/library/src/main/cpp/libx264/common/aarch64/pixel-a.S index 48209b2..047d3db 100644 --- a/library/src/main/cpp/libx264/common/aarch64/pixel-a.S +++ b/library/src/main/cpp/libx264/common/aarch64/pixel-a.S @@ -569,57 +569,65 @@ endfunc .macro pixel_var2_8 h function x264_pixel_var2_8x\h\()_neon, export=1 - ld1 {v16.8b}, [x0], x1 - ld1 {v18.8b}, [x2], x3 - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x2], x3 - mov x5, \h - 4 - usubl v6.8h, v16.8b, v18.8b - usubl v7.8h, v17.8b, v19.8b - ld1 {v16.8b}, [x0], x1 - ld1 {v18.8b}, [x2], x3 - smull v2.4s, v6.4h, v6.4h - smull2 v3.4s, v6.8h, v6.8h - add v0.8h, v6.8h, v7.8h - smlal v2.4s, v7.4h, v7.4h - smlal2 v3.4s, v7.8h, v7.8h + mov x3, #16 + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 + mov x5, \h - 2 + usubl v0.8h, v16.8b, v18.8b + usubl v1.8h, v17.8b, v19.8b + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + smull v2.4s, v0.4h, v0.4h + smull2 v3.4s, v0.8h, v0.8h + smull v4.4s, v1.4h, v1.4h + smull2 v5.4s, v1.8h, v1.8h usubl v6.8h, v16.8b, v18.8b -1: subs x5, x5, #2 - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x2], x3 +1: subs x5, x5, #1 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h usubl v7.8h, v17.8b, v19.8b add v0.8h, v0.8h, v6.8h - ld1 {v16.8b}, [x0], x1 - ld1 {v18.8b}, [x2], x3 - smlal v2.4s, v7.4h, v7.4h - smlal2 v3.4s, v7.8h, v7.8h + ld1 {v16.8b}, [x0], #8 + ld1 {v18.8b}, [x1], x3 + smlal v4.4s, v7.4h, v7.4h + smlal2 v5.4s, v7.8h, v7.8h usubl v6.8h, v16.8b, v18.8b - add v0.8h, v0.8h, v7.8h + add v1.8h, v1.8h, v7.8h b.gt 1b - ld1 {v17.8b}, [x0], x1 - ld1 {v19.8b}, [x2], x3 + ld1 {v17.8b}, [x0], #8 + ld1 {v19.8b}, [x1], x3 smlal v2.4s, v6.4h, v6.4h smlal2 v3.4s, v6.8h, v6.8h usubl v7.8h, v17.8b, v19.8b add v0.8h, v0.8h, v6.8h - smlal v2.4s, v7.4h, v7.4h - add v0.8h, v0.8h, v7.8h - smlal2 v3.4s, v7.8h, v7.8h + smlal v4.4s, v7.4h, v7.4h + add v1.8h, v1.8h, v7.8h + smlal2 v5.4s, v7.8h, v7.8h saddlv s0, v0.8h + saddlv s1, v1.8h add v2.4s, v2.4s, v3.4s + add v4.4s, v4.4s, v5.4s mov w0, v0.s[0] - addv s1, v2.4s - sxtw x0, w0 mov w1, v1.s[0] - mul x0, x0, x0 - str w1, [x4] - sub x0, x1, x0, lsr # 6 + (\h >> 4) + addv s2, v2.4s + addv s4, v4.4s + mul w0, w0, w0 + mul w1, w1, w1 + mov w3, v2.s[0] + mov w4, v4.s[0] + sub w0, w3, w0, lsr # 6 + (\h >> 4) + sub w1, w4, w1, lsr # 6 + (\h >> 4) + str w3, [x2] + add w0, w0, w1 + str w4, [x2, #4] ret endfunc diff --git a/library/src/main/cpp/libx264/common/aarch64/pixel.h b/library/src/main/cpp/libx264/common/aarch64/pixel.h index 8a7b83e..5206a0c 100644 --- a/library/src/main/cpp/libx264/common/aarch64/pixel.h +++ b/library/src/main/cpp/libx264/common/aarch64/pixel.h @@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); uint64_t 
x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); -int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); diff --git a/library/src/main/cpp/libx264/common/arm/asm.S b/library/src/main/cpp/libx264/common/arm/asm.S index 62a5e57..0472d11 100644 --- a/library/src/main/cpp/libx264/common/arm/asm.S +++ b/library/src/main/cpp/libx264/common/arm/asm.S @@ -28,15 +28,10 @@ .syntax unified -#if HAVE_NEON - .arch armv7-a -#elif HAVE_ARMV6T2 - .arch armv6t2 -#elif HAVE_ARMV6 - .arch armv6 -#endif - +#ifndef __APPLE__ +.arch armv7-a .fpu neon +#endif #ifdef PREFIX # define EXTERN_ASM _ @@ -50,6 +45,14 @@ # define ELF @ #endif +#ifdef __MACH__ +# define MACH +# define NONMACH @ +#else +# define MACH @ +# define NONMACH +#endif + #if HAVE_AS_FUNC # define FUNC #else @@ -76,6 +79,7 @@ ELF .size \name, . - \name FUNC .endfunc .purgem endfunc .endm + .text .align 2 .if \export == 1 .global EXTERN_ASM\name @@ -99,7 +103,8 @@ ELF .size \name, . - \name .if HAVE_SECTION_DATA_REL_RO && \relocate .section .data.rel.ro .else - .section .rodata +NONMACH .section .rodata +MACH .const_data .endif .align \align \name: diff --git a/library/src/main/cpp/libx264/common/arm/dct-a.S b/library/src/main/cpp/libx264/common/arm/dct-a.S index 13d5061..48a3498 100644 --- a/library/src/main/cpp/libx264/common/arm/dct-a.S +++ b/library/src/main/cpp/libx264/common/arm/dct-a.S @@ -26,14 +26,12 @@ #include "asm.S" -.section .rodata -.align 4 - -scan4x4_frame: +const scan4x4_frame, align=4 .byte 0,1, 8,9, 2,3, 4,5 .byte 2,3, 8,9, 16,17, 10,11 .byte 12,13, 6,7, 14,15, 20,21 .byte 10,11, 12,13, 6,7, 14,15 +endconst .text diff --git a/library/src/main/cpp/libx264/common/arm/mc-a.S b/library/src/main/cpp/libx264/common/arm/mc-a.S index e8d3d03..2d6dc2e 100644 --- a/library/src/main/cpp/libx264/common/arm/mc-a.S +++ b/library/src/main/cpp/libx264/common/arm/mc-a.S @@ -28,10 +28,9 @@ #include "asm.S" -.section .rodata -.align 4 -pw_0to15: +const pw_0to15, align=4 .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +endconst .text @@ -140,7 +139,7 @@ MEMCPY_ALIGNED 16, 8 MEMCPY_ALIGNED 8, 16 MEMCPY_ALIGNED 8, 8 -const memcpy_table align=2, relocate=1 +const memcpy_table, align=2, relocate=1 .word memcpy_aligned_16_16_neon .word memcpy_aligned_16_8_neon .word memcpy_aligned_8_16_neon diff --git a/library/src/main/cpp/libx264/common/arm/pixel-a.S b/library/src/main/cpp/libx264/common/arm/pixel-a.S index a1a0673..155e1cf 100644 --- a/library/src/main/cpp/libx264/common/arm/pixel-a.S +++ b/library/src/main/cpp/libx264/common/arm/pixel-a.S @@ -26,9 +26,7 @@ #include "asm.S" -.section .rodata -.align 4 - +const mask_array, align=4 .rept 16 .byte 0xff .endr @@ -36,11 +34,14 @@ mask_ff: .rept 16 .byte 0 .endr +endconst -mask_ac4: +const mask_ac4, align=4 .short 0, -1, -1, -1, 0, -1, -1, -1 -mask_ac8: +endconst +const mask_ac8, align=4 .short 0, -1, -1, -1, -1, -1, -1, -1 +endconst .text @@ -718,13 +719,24 @@ function x264_var_end, export=0 bx lr endfunc -.macro DIFF_SUM diff da db lastdiff - vld1.64 {\da}, [r0,:64], r1 - vld1.64 {\db}, [r2,:64], r3 -.ifnb \lastdiff - vadd.s16 q0, q0, \lastdiff +.macro DIFF_SUM diff1 diff2 da1 db1 da2 db2 lastdiff1 lastdiff2 acc1 acc2 + vld1.64 
{\da1}, [r0,:64]! + vld1.64 {\db1}, [r1,:64], r3 +.ifnb \lastdiff1 + vadd.s16 \acc1, \acc1, \lastdiff1 + vadd.s16 \acc2, \acc2, \lastdiff2 .endif - vsubl.u8 \diff, \da, \db + vld1.64 {\da2}, [r0,:64]! + vld1.64 {\db2}, [r1,:64], r3 + vsubl.u8 \diff1, \da1, \db1 + vsubl.u8 \diff2, \da2, \db2 +.endm + +.macro SQR_ACC_DOUBLE acc1 acc2 d0 d1 d2 d3 vmlal=vmlal.s16 + \vmlal \acc1, \d0, \d0 + vmlal.s16 \acc1, \d1, \d1 + \vmlal \acc2, \d2, \d2 + vmlal.s16 \acc2, \d3, \d3 .endm .macro SQR_ACC acc d0 d1 vmlal=vmlal.s16 @@ -733,77 +745,89 @@ endfunc .endm function x264_pixel_var2_8x8_neon - DIFF_SUM q0, d0, d1 - DIFF_SUM q8, d16, d17 - SQR_ACC q1, d0, d1, vmull.s16 - DIFF_SUM q9, d18, d19, q8 - SQR_ACC q2, d16, d17, vmull.s16 + mov r3, #16 + DIFF_SUM q0, q10, d0, d1, d20, d21 + DIFF_SUM q8, q11, d16, d17, d22, d23 + SQR_ACC_DOUBLE q1, q13, d0, d1, d20, d21, vmull.s16 + DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10 + SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23, vmull.s16 .rept 2 - DIFF_SUM q8, d16, d17, q9 - SQR_ACC q1, d18, d19 - DIFF_SUM q9, d18, d19, q8 - SQR_ACC q2, d16, d17 + DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10 + SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25 + DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10 + SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23 .endr - DIFF_SUM q8, d16, d17, q9 - SQR_ACC q1, d18, d19 + DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10 + SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25 vadd.s16 q0, q0, q8 - SQR_ACC q2, d16, d17 + vadd.s16 q10, q10, q11 + SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23 - ldr ip, [sp] vadd.s16 d0, d0, d1 + vadd.s16 d20, d20, d21 vadd.s32 q1, q1, q2 + vadd.s32 q13, q13, q14 vpaddl.s16 d0, d0 + vpaddl.s16 d20, d20 vadd.s32 d1, d2, d3 - vpadd.s32 d0, d0, d1 + vadd.s32 d26, d26, d27 + vpadd.s32 d0, d0, d20 @ sum + vpadd.s32 d1, d1, d26 @ sqr + vmul.s32 d0, d0, d0 @ sum*sum + vshr.s32 d0, d0, #6 + vsub.s32 d0, d1, d0 + vpadd.s32 d0, d0, d0 vmov r0, r1, d0 - vst1.32 {d0[1]}, [ip,:32] - mul r0, r0, r0 - sub r0, r1, r0, lsr #6 + vst1.32 {d1}, [r2,:64] bx lr endfunc function x264_pixel_var2_8x16_neon - vld1.64 {d16}, [r0,:64], r1 - vld1.64 {d17}, [r2,:64], r3 - vld1.64 {d18}, [r0,:64], r1 - vld1.64 {d19}, [r2,:64], r3 - vsubl.u8 q10, d16, d17 - vsubl.u8 q11, d18, d19 - SQR_ACC q1, d20, d21, vmull.s16 - vld1.64 {d16}, [r0,:64], r1 - vadd.s16 q0, q10, q11 - vld1.64 {d17}, [r2,:64], r3 - SQR_ACC q2, d22, d23, vmull.s16 - mov ip, #14 -1: subs ip, ip, #2 - vld1.64 {d18}, [r0,:64], r1 + mov r3, #16 + vld1.64 {d16}, [r0,:64]! + vld1.64 {d17}, [r1,:64], r3 + vld1.64 {d18}, [r0,:64]! + vld1.64 {d19}, [r1,:64], r3 + vsubl.u8 q0, d16, d17 + vsubl.u8 q3, d18, d19 + SQR_ACC q1, d0, d1, vmull.s16 + vld1.64 {d16}, [r0,:64]! + mov ip, #15 + vld1.64 {d17}, [r1,:64], r3 + SQR_ACC q2, d6, d7, vmull.s16 +1: subs ip, ip, #1 + vld1.64 {d18}, [r0,:64]! vsubl.u8 q10, d16, d17 - vld1.64 {d19}, [r2,:64], r3 + vld1.64 {d19}, [r1,:64], r3 vadd.s16 q0, q0, q10 SQR_ACC q1, d20, d21 vsubl.u8 q11, d18, d19 beq 2f - vld1.64 {d16}, [r0,:64], r1 - vadd.s16 q0, q0, q11 - vld1.64 {d17}, [r2,:64], r3 + vld1.64 {d16}, [r0,:64]! 
+ vadd.s16 q3, q3, q11 + vld1.64 {d17}, [r1,:64], r3 SQR_ACC q2, d22, d23 b 1b 2: - vadd.s16 q0, q0, q11 + vadd.s16 q3, q3, q11 SQR_ACC q2, d22, d23 - ldr ip, [sp] vadd.s16 d0, d0, d1 - vadd.s32 q1, q1, q2 + vadd.s16 d6, d6, d7 vpaddl.s16 d0, d0 - vadd.s32 d1, d2, d3 - vpadd.s32 d0, d0, d1 + vpaddl.s16 d6, d6 + vadd.s32 d2, d2, d3 + vadd.s32 d4, d4, d5 + vpadd.s32 d0, d0, d6 @ sum + vpadd.s32 d2, d2, d4 @ sqr + vmul.s32 d0, d0, d0 @ sum*sum + vshr.s32 d0, d0, #7 + vsub.s32 d0, d2, d0 + vpadd.s32 d0, d0, d0 vmov r0, r1, d0 - vst1.32 {d0[1]}, [ip,:32] - mul r0, r0, r0 - sub r0, r1, r0, lsr #7 + vst1.32 {d2}, [r2,:64] bx lr endfunc diff --git a/library/src/main/cpp/libx264/common/arm/pixel.h b/library/src/main/cpp/libx264/common/arm/pixel.h index 8a6751b..d9b02c4 100644 --- a/library/src/main/cpp/libx264/common/arm/pixel.h +++ b/library/src/main/cpp/libx264/common/arm/pixel.h @@ -63,8 +63,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t ); -int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * ); +int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * ); uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t ); uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t ); diff --git a/library/src/main/cpp/libx264/common/arm/predict-a.S b/library/src/main/cpp/libx264/common/arm/predict-a.S index a7d9f10..06366cc 100644 --- a/library/src/main/cpp/libx264/common/arm/predict-a.S +++ b/library/src/main/cpp/libx264/common/arm/predict-a.S @@ -27,10 +27,9 @@ #include "asm.S" -.section .rodata -.align 4 - -p16weight: .short 1,2,3,4,5,6,7,8 +const p16weight, align=4 +.short 1,2,3,4,5,6,7,8 +endconst .text diff --git a/library/src/main/cpp/libx264/common/arm/quant-a.S b/library/src/main/cpp/libx264/common/arm/quant-a.S index eb3fd36..a7f6cd2 100644 --- a/library/src/main/cpp/libx264/common/arm/quant-a.S +++ b/library/src/main/cpp/libx264/common/arm/quant-a.S @@ -26,19 +26,20 @@ #include "asm.S" -.section .rodata -.align 4 -pmovmskb_byte: +const pmovmskb_byte, align=4 .byte 1,2,4,8,16,32,64,128 .byte 1,2,4,8,16,32,64,128 +endconst -mask_2bit: +const mask_2bit, align=4 .byte 3,12,48,192,3,12,48,192 .byte 3,12,48,192,3,12,48,192 +endconst -mask_1bit: +const mask_1bit, align=4 .byte 128,64,32,16,8,4,2,1 .byte 128,64,32,16,8,4,2,1 +endconst .text diff --git a/library/src/main/cpp/libx264/common/bitstream.c b/library/src/main/cpp/libx264/common/bitstream.c index d6c1c2c..cc76300 100644 --- a/library/src/main/cpp/libx264/common/bitstream.c +++ b/library/src/main/cpp/libx264/common/bitstream.c @@ -43,16 +43,19 @@ uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end ); uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end ); void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void 
x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); -void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); +void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb ); uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end ); @@ -116,7 +119,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) pf->nal_escape = x264_nal_escape_c; #if HAVE_MMX -#if ARCH_X86_64 +#if ARCH_X86_64 && !defined( __MACH__ ) pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2; pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2; pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2; @@ -126,18 +129,17 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) pf->nal_escape = x264_nal_escape_mmx2; if( cpu&X264_CPU_SSE2 ) { -#if ARCH_X86_64 - if( cpu&X264_CPU_LZCNT ) - { - pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt; - pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt; - pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt; - } -#endif if( cpu&X264_CPU_SSE2_IS_FAST ) pf->nal_escape = x264_nal_escape_sse2; } -#if ARCH_X86_64 +#if ARCH_X86_64 && !defined( __MACH__ ) + if( cpu&X264_CPU_LZCNT ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt; + } + if( cpu&X264_CPU_SSSE3 ) { pf->cabac_block_residual_rd_internal = 
x264_cabac_block_residual_rd_internal_ssse3; @@ -152,8 +154,14 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf ) if( cpu&X264_CPU_AVX2 ) { pf->nal_escape = x264_nal_escape_avx2; - if( cpu&X264_CPU_BMI2 ) - pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2; + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2; + } + + if( cpu&X264_CPU_AVX512 ) + { + pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512; + pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512; + pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512; } #endif #endif diff --git a/library/src/main/cpp/libx264/common/cabac.h b/library/src/main/cpp/libx264/common/cabac.h index 5af856a..1378834 100644 --- a/library/src/main/cpp/libx264/common/cabac.h +++ b/library/src/main/cpp/libx264/common/cabac.h @@ -42,7 +42,7 @@ typedef struct uint8_t *p_end; /* aligned for memcpy_aligned starting here */ - ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision() + ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision() /* context */ uint8_t state[1024]; diff --git a/library/src/main/cpp/libx264/common/common.c b/library/src/main/cpp/libx264/common/common.c index 14d4670..561212d 100644 --- a/library/src/main/cpp/libx264/common/common.c +++ b/library/src/main/cpp/libx264/common/common.c @@ -669,7 +669,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) { if( !strcmp(value, "1b") ) p->i_level_idc = 9; - else if( atof(value) < 6 ) + else if( atof(value) < 7 ) p->i_level_idc = (int)(10*atof(value)+.5); else p->i_level_idc = atoi(value); @@ -1143,6 +1143,8 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, }, + [X264_CSP_YUYV] = { 1, { 256*2 }, { 256*1 }, }, + [X264_CSP_UYVY] = { 1, { 256*2 }, { 256*1 }, }, [X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } }, [X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, }, diff --git a/library/src/main/cpp/libx264/common/common.h b/library/src/main/cpp/libx264/common/common.h index c7850ca..867b207 100644 --- a/library/src/main/cpp/libx264/common/common.h +++ b/library/src/main/cpp/libx264/common/common.h @@ -635,11 +635,11 @@ struct x264_t /* Current MB DCT coeffs */ struct { - ALIGNED_N( dctcoef luma16x16_dc[3][16] ); + ALIGNED_64( dctcoef luma16x16_dc[3][16] ); ALIGNED_16( dctcoef chroma_dc[2][8] ); // FIXME share memory? - ALIGNED_N( dctcoef luma8x8[12][64] ); - ALIGNED_N( dctcoef luma4x4[16*3][16] ); + ALIGNED_64( dctcoef luma8x8[12][64] ); + ALIGNED_64( dctcoef luma4x4[16*3][16] ); } dct; /* MB table and cache for current frame/mb */ @@ -729,7 +729,7 @@ struct x264_t int8_t *type; /* mb type */ uint8_t *partition; /* mb partition */ int8_t *qp; /* mb qp */ - int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/ + int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */ int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. 
for non I4x4 set to I_PRED_4x4_DC(2) */ /* actually has only 7 entries; set to 8 for write-combining optimizations */ uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */ @@ -740,8 +740,7 @@ struct x264_t int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */ int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */ int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */ - uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of - * NOTE: this will fail on resolutions above 2^16 MBs... */ + uint32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */ uint8_t *field; /* buffer for weighted versions of the reference frames */ @@ -778,26 +777,27 @@ struct x264_t /* space for p_fenc and p_fdec */ #define FENC_STRIDE 16 #define FDEC_STRIDE 32 - ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] ); - ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] ); + ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] ); + ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] ); /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */ ALIGNED_16( pixel i4x4_fdec_buf[16*16] ); ALIGNED_16( pixel i8x8_fdec_buf[16*16] ); - ALIGNED_16( dctcoef i8x8_dct_buf[3][64] ); - ALIGNED_16( dctcoef i4x4_dct_buf[15][16] ); + ALIGNED_64( dctcoef i8x8_dct_buf[3][64] ); + ALIGNED_64( dctcoef i4x4_dct_buf[15][16] ); uint32_t i4x4_nnz_buf[4]; uint32_t i8x8_nnz_buf[4]; - int i4x4_cbp; - int i8x8_cbp; /* Psy trellis DCT data */ ALIGNED_16( dctcoef fenc_dct8[4][64] ); ALIGNED_16( dctcoef fenc_dct4[16][16] ); /* Psy RD SATD/SA8D scores cache */ - ALIGNED_N( uint64_t fenc_hadamard_cache[9] ); - ALIGNED_N( uint32_t fenc_satd_cache[32] ); + ALIGNED_64( uint32_t fenc_satd_cache[32] ); + ALIGNED_16( uint64_t fenc_hadamard_cache[9] ); + + int i4x4_cbp; + int i8x8_cbp; /* pointer over mb of the frame to be compressed */ pixel *p_fenc[3]; /* y,u,v */ @@ -822,10 +822,10 @@ struct x264_t struct { /* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */ - ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] ); + ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] ); - /* i_non_zero_count if available else 0x80 */ - ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] ); + /* i_non_zero_count if available else 0x80. 
intentionally misaligned by 8 for asm */ + ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] ); /* -1 if unused, -2 if unavailable */ ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); @@ -930,8 +930,8 @@ struct x264_t uint32_t (*nr_residual_sum)[64]; uint32_t *nr_count; - ALIGNED_N( udctcoef nr_offset_denoise[4][64] ); - ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] ); + ALIGNED_32( udctcoef nr_offset_denoise[4][64] ); + ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] ); uint32_t nr_count_buf[2][4]; uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */ diff --git a/library/src/main/cpp/libx264/common/cpu.c b/library/src/main/cpp/libx264/common/cpu.c index 636a40c..8638186 100644 --- a/library/src/main/cpp/libx264/common/cpu.c +++ b/library/src/main/cpp/libx264/common/cpu.c @@ -47,8 +47,7 @@ const x264_cpu_name_t x264_cpu_names[] = { #if HAVE_MMX // {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore -// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it -#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV +#define MMX2 X264_CPU_MMX|X264_CPU_MMX2 {"MMX2", MMX2}, {"MMXEXT", MMX2}, {"SSE", MMX2|X264_CPU_SSE}, @@ -56,6 +55,7 @@ const x264_cpu_name_t x264_cpu_names[] = {"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW}, {"SSE2", SSE2}, {"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST}, + {"LZCNT", SSE2|X264_CPU_LZCNT}, {"SSE3", SSE2|X264_CPU_SSE3}, {"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3}, {"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4}, @@ -66,16 +66,17 @@ const x264_cpu_name_t x264_cpu_names[] = {"XOP", AVX|X264_CPU_XOP}, {"FMA4", AVX|X264_CPU_FMA4}, {"FMA3", AVX|X264_CPU_FMA3}, - {"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2}, + {"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1}, + {"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2}, +#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2 + {"AVX2", AVX2}, + {"AVX512", AVX2|X264_CPU_AVX512}, +#undef AVX2 #undef AVX #undef SSE2 #undef MMX2 {"Cache32", X264_CPU_CACHELINE_32}, {"Cache64", X264_CPU_CACHELINE_64}, - {"LZCNT", X264_CPU_LZCNT}, - {"BMI1", X264_CPU_BMI1}, - {"BMI2", X264_CPU_BMI1|X264_CPU_BMI2}, - {"SlowCTZ", X264_CPU_SLOW_CTZ}, {"SlowAtom", X264_CPU_SLOW_ATOM}, {"SlowPshufb", X264_CPU_SLOW_PSHUFB}, {"SlowPalignr", X264_CPU_SLOW_PALIGNR}, @@ -118,7 +119,7 @@ static void sigill_handler( int sig ) #if HAVE_MMX int x264_cpu_cpuid_test( void ); void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx ); -void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx ); +uint64_t x264_cpu_xgetbv( int xcr ); uint32_t x264_cpu_detect( void ) { @@ -126,15 +127,14 @@ uint32_t x264_cpu_detect( void ) uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = {0}; uint32_t max_extended_cap, max_basic_cap; - int cache; + uint64_t xcr0 = 0; #if !ARCH_X86_64 if( !x264_cpu_cpuid_test() ) return 0; #endif - x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 ); - max_basic_cap = eax; + x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 ); if( max_basic_cap == 0 ) return 0; @@ -145,28 +145,24 @@ uint32_t x264_cpu_detect( void ) return cpu; if( edx&0x02000000 ) cpu |= X264_CPU_MMX2|X264_CPU_SSE; - if( edx&0x00008000 ) - cpu |= X264_CPU_CMOV; - else - return cpu; if( edx&0x04000000 ) cpu |= X264_CPU_SSE2; if( ecx&0x00000001 ) cpu |= X264_CPU_SSE3; if( ecx&0x00000200 ) - cpu |= X264_CPU_SSSE3; + cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST; if( ecx&0x00080000 ) cpu |= X264_CPU_SSE4; if( ecx&0x00100000 ) cpu |= X264_CPU_SSE42; - /* 
Check OXSAVE and AVX bits */ - if( (ecx&0x18000000) == 0x18000000 ) + + if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */ { - /* Check for OS support */ - x264_cpu_xgetbv( 0, &eax, &edx ); - if( (eax&0x6) == 0x6 ) + xcr0 = x264_cpu_xgetbv( 0 ); + if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */ { - cpu |= X264_CPU_AVX; + if( ecx&0x10000000 ) + cpu |= X264_CPU_AVX; if( ecx&0x00001000 ) cpu |= X264_CPU_FMA3; } @@ -175,20 +171,25 @@ uint32_t x264_cpu_detect( void ) if( max_basic_cap >= 7 ) { x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx ); - /* AVX2 requires OS support, but BMI1/2 don't. */ - if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) ) - cpu |= X264_CPU_AVX2; + if( ebx&0x00000008 ) - { cpu |= X264_CPU_BMI1; - if( ebx&0x00000100 ) - cpu |= X264_CPU_BMI2; + if( ebx&0x00000100 ) + cpu |= X264_CPU_BMI2; + + if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */ + { + if( ebx&0x00000020 ) + cpu |= X264_CPU_AVX2; + + if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */ + { + if( (ebx&0xD0030000) == 0xD0030000 ) + cpu |= X264_CPU_AVX512; + } } } - if( cpu & X264_CPU_SSSE3 ) - cpu |= X264_CPU_SSE2_IS_FAST; - x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx ); max_extended_cap = eax; @@ -228,8 +229,6 @@ uint32_t x264_cpu_detect( void ) { if( edx&0x00400000 ) cpu |= X264_CPU_MMX2; - if( !(cpu&X264_CPU_LZCNT) ) - cpu |= X264_CPU_SLOW_CTZ; if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) ) cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } @@ -254,7 +253,6 @@ uint32_t x264_cpu_detect( void ) else if( model == 28 ) { cpu |= X264_CPU_SLOW_ATOM; - cpu |= X264_CPU_SLOW_CTZ; cpu |= X264_CPU_SLOW_PSHUFB; } /* Conroe has a slow shuffle unit. Check the model number to make sure not @@ -268,7 +266,7 @@ uint32_t x264_cpu_detect( void ) { /* cacheline size is specified in 3 places, any of which may be missing */ x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx ); - cache = (ebx&0xff00)>>5; // cflush size + int cache = (ebx&0xff00)>>5; // cflush size if( !cache && max_extended_cap >= 0x80000006 ) { x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx ); diff --git a/library/src/main/cpp/libx264/common/cpu.h b/library/src/main/cpp/libx264/common/cpu.h index eec1be2..845034c 100644 --- a/library/src/main/cpp/libx264/common/cpu.h +++ b/library/src/main/cpp/libx264/common/cpu.h @@ -56,7 +56,7 @@ void x264_cpu_sfence( void ); * alignment between functions (osdep.h handles manual alignment of arrays * if it doesn't). */ -#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX +#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4)) intptr_t x264_stack_align( void (*func)(), ... ); #define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__) #else @@ -65,7 +65,7 @@ intptr_t x264_stack_align( void (*func)(), ... 
); typedef struct { - const char name[16]; + const char *name; uint32_t flags; } x264_cpu_name_t; extern const x264_cpu_name_t x264_cpu_names[]; diff --git a/library/src/main/cpp/libx264/common/dct.c b/library/src/main/cpp/libx264/common/dct.c index a270c4c..70853bf 100644 --- a/library/src/main/cpp/libx264/common/dct.c +++ b/library/src/main/cpp/libx264/common/dct.c @@ -711,6 +711,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf ) dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2; #endif } + + if( cpu&X264_CPU_AVX512 ) + { + dctf->sub4x4_dct = x264_sub4x4_dct_avx512; + dctf->sub8x8_dct = x264_sub8x8_dct_avx512; + dctf->sub16x16_dct = x264_sub16x16_dct_avx512; + dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512; + dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512; + dctf->add8x8_idct = x264_add8x8_idct_avx512; + } #endif //HAVE_MMX #if HAVE_ALTIVEC @@ -986,6 +996,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx; } #endif // ARCH_X86_64 + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; + } #endif // HAVE_MMX #else #if HAVE_MMX @@ -1026,6 +1043,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop; pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop; } + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512; + pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512; + pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512; + pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512; + } #endif // HAVE_MMX #if HAVE_ALTIVEC if( cpu&X264_CPU_ALTIVEC ) @@ -1068,6 +1092,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx; } + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; + } #else if( cpu&X264_CPU_MMX ) { @@ -1091,6 +1120,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig pf_interlaced->interleave_8x8_cavlc = pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2; } + if( cpu&X264_CPU_AVX512 ) + { + pf_interlaced->interleave_8x8_cavlc = + pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512; + } #endif // HIGH_BIT_DEPTH #endif #if !HIGH_BIT_DEPTH diff --git a/library/src/main/cpp/libx264/common/dct.h b/library/src/main/cpp/libx264/common/dct.h index fc8434b..d443e22 100644 --- a/library/src/main/cpp/libx264/common/dct.h +++ b/library/src/main/cpp/libx264/common/dct.h @@ -75,7 +75,6 @@ typedef struct } x264_zigzag_function_t; void x264_dct_init( int cpu, x264_dct_function_t *dctf ); -void x264_dct_init_weights( void ); void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced ); #endif diff --git a/library/src/main/cpp/libx264/common/deblock.c b/library/src/main/cpp/libx264/common/deblock.c index 659fb35..0c7f128 100644 --- a/library/src/main/cpp/libx264/common/deblock.c +++ b/library/src/main/cpp/libx264/common/deblock.c @@ -676,21 
+676,21 @@ void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, i void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta ); -void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); -void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], - int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], - int mvy_limit, int bframe ); +void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); +void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE], + int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], + int mvy_limit, int bframe ); void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta ); void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta ); @@ -803,7 +803,6 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) #if !HIGH_BIT_DEPTH pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2; #endif - pf->deblock_strength = x264_deblock_strength_mmx2; if( cpu&X264_CPU_SSE2 ) { pf->deblock_strength = x264_deblock_strength_sse2; @@ -852,6 +851,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff ) { pf->deblock_strength = x264_deblock_strength_avx2; } + if( cpu&X264_CPU_AVX512 ) + { + pf->deblock_strength = x264_deblock_strength_avx512; + } } #endif diff --git a/library/src/main/cpp/libx264/common/frame.c b/library/src/main/cpp/libx264/common/frame.c index e15c1a3..4d80cbb 100644 --- a/library/src/main/cpp/libx264/common/frame.c +++ b/library/src/main/cpp/libx264/common/frame.c @@ -54,6 +54,8 @@ static int x264_frame_internal_csp( int external_csp ) case X264_CSP_NV16: case X264_CSP_I422: case X264_CSP_YV16: + case X264_CSP_YUYV: + case X264_CSP_UYVY: case X264_CSP_V210: return 
X264_CSP_NV16; case X264_CSP_I444: @@ -76,7 +78,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) int i_padv = PADV << PARAM_INTERLACED; int align = 16; #if ARCH_X86 || ARCH_X86_64 - if( h->param.cpu&X264_CPU_CACHELINE_64 ) + if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 ) align = 64; else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX ) align = 32; @@ -221,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec ) PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) ); PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) ); } - PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) ); + PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) ); for( int j = 0; j <= h->param.i_bframe+1; j++ ) for( int i = 0; i <= h->param.i_bframe+1; i++ ) - PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) ); + PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) ); + /* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */ + prealloc_size += NATIVE_ALIGN; } if( h->param.rc.i_aq_mode ) { @@ -408,7 +412,13 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src ) uint8_t *pix[3]; int stride[3]; - if( i_csp == X264_CSP_V210 ) + if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY ) + { + int p = i_csp == X264_CSP_UYVY; + h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1], + (pixel*)src->img.plane[0], src->img.i_stride[0], h->param.i_width, h->param.i_height ); + } + else if( i_csp == X264_CSP_V210 ) { stride[0] = src->img.i_stride[0]; pix[0] = src->img.plane[0]; diff --git a/library/src/main/cpp/libx264/common/macroblock.c b/library/src/main/cpp/libx264/common/macroblock.c index 661e678..6168671 100644 --- a/library/src/main/cpp/libx264/common/macroblock.c +++ b/library/src/main/cpp/libx264/common/macroblock.c @@ -121,8 +121,8 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y; int i_mode = x264_size2pixel[height][width]; intptr_t i_stride0 = 16, i_stride1 = 16; - ALIGNED_ARRAY_N( pixel, tmp0,[16*16] ); - ALIGNED_ARRAY_N( pixel, tmp1,[16*16] ); + ALIGNED_ARRAY_32( pixel, tmp0,[16*16] ); + ALIGNED_ARRAY_32( pixel, tmp1,[16*16] ); pixel *src0, *src1; MC_LUMA_BI( 0 ); @@ -260,7 +260,7 @@ int x264_macroblock_cache_allocate( x264_t *h ) PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) ); PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) ); PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) ); - PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) ); + PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint32_t) ); /* 0 -> 3 top(4), 4 -> 6 : left(3) */ PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) ); @@ -326,7 +326,7 @@ int x264_macroblock_cache_allocate( x264_t *h ) PREALLOC_END( h->mb.base ); - memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) ); + memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint32_t) ); for( int i = 0; i < 2; i++ ) { @@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead ) ((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t)); scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa ); } - int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * 
sizeof(int16_t); + int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t); scratch_size = X264_MAX( scratch_size, buf_mbtree ); if( scratch_size ) CHECKED_MALLOC( h->scratch_buffer, scratch_size ); @@ -532,16 +532,16 @@ void x264_macroblock_thread_init( x264_t *h ) h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf; h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE; h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE; - h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE; + h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE; if( CHROMA444 ) { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE; - h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE; + h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE; } else { h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8; - h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16; + h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16; } } @@ -1738,7 +1738,7 @@ void x264_macroblock_cache_save( x264_t *h ) h->mb.i_last_dqp = 0; h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2; h->mb.i_cbp_luma = 0xf; - h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700; + h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700; h->mb.b_transform_8x8 = 0; for( int i = 0; i < 48; i++ ) h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16; diff --git a/library/src/main/cpp/libx264/common/mc.c b/library/src/main/cpp/libx264/common/mc.c index 543a05c..65af5b9 100644 --- a/library/src/main/cpp/libx264/common/mc.c +++ b/library/src/main/cpp/libx264/common/mc.c @@ -325,15 +325,14 @@ void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, } } -static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu, - pixel *dstv, intptr_t i_dstv, - pixel *src, intptr_t i_src, int w, int h ) +void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, + pixel *src, intptr_t i_src, int w, int h ) { - for( int y=0; yplane_copy_swap = x264_plane_copy_swap_c; pf->plane_copy_interleave = x264_plane_copy_interleave_c; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c; diff --git a/library/src/main/cpp/libx264/common/mc.h b/library/src/main/cpp/libx264/common/mc.h index 8f9a772..f3e7079 100644 --- a/library/src/main/cpp/libx264/common/mc.h +++ b/library/src/main/cpp/libx264/common/mc.h @@ -160,6 +160,39 @@ static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src, x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\ } +void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, + pixel *src, intptr_t i_src, int w, int h ); + +/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYUV + * input with the additional constraint that we cannot overread src. 
*/ +#define PLANE_COPY_YUYV(align, cpu)\ +static void x264_plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\ + pixel *src, intptr_t i_src, int w, int h )\ +{\ + int c_w = (align>>1) / sizeof(pixel) - 1;\ + if( !(w&c_w) )\ + x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ + else if( w > c_w )\ + {\ + if( --h > 0 )\ + {\ + if( i_src > 0 )\ + {\ + x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ + dsta += i_dsta * h;\ + dstb += i_dstb * h;\ + src += i_src * h;\ + }\ + else\ + x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\ + src+i_src, i_src, w, h );\ + }\ + x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\ + }\ + else\ + x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\ +} + void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); @@ -260,6 +293,8 @@ typedef struct /* may write up to 15 pixels off the end of each plane */ void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv, pixel *src, intptr_t i_src, int w, int h ); + void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, + pixel *src, intptr_t i_src, int w, int h ); void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty, diff --git a/library/src/main/cpp/libx264/common/osdep.h b/library/src/main/cpp/libx264/common/osdep.h index ed3ed59..ca2455d 100644 --- a/library/src/main/cpp/libx264/common/osdep.h +++ b/library/src/main/cpp/libx264/common/osdep.h @@ -108,10 +108,10 @@ int x264_is_pipe( const char *path ); #else #define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n))) #endif -#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) -#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) -#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) + #define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 ) +#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 ) +#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 ) // ARM compiliers don't reliably align stack variables // - EABI requires only 8 byte stack alignment to be maintained @@ -125,39 +125,39 @@ int x264_is_pipe( const char *path ); type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask) #if ARCH_ARM && SYS_MACOSX -#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) +#define ALIGNED_ARRAY_8( ... ) EXPAND( ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) ) #else -#define ALIGNED_ARRAY_8( type, name, sub1, ... )\ - ALIGNED_8( type name sub1 __VA_ARGS__ ) +#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ ) #endif #if ARCH_ARM -#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) +#define ALIGNED_ARRAY_16( ... ) EXPAND( ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) ) #else -#define ALIGNED_ARRAY_16( type, name, sub1, ... )\ - ALIGNED_16( type name sub1 __VA_ARGS__ ) +#define ALIGNED_ARRAY_16( type, name, sub1, ... 
) ALIGNED_16( type name sub1 __VA_ARGS__ ) #endif #define EXPAND(x) x +#if ARCH_X86 || ARCH_X86_64 +#define NATIVE_ALIGN 64 +#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 ) +#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 ) #if STACK_ALIGNMENT >= 32 -#define ALIGNED_ARRAY_32( type, name, sub1, ... )\ - ALIGNED_32( type name sub1 __VA_ARGS__ ) +#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ ) #else #define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) ) #endif - +#if STACK_ALIGNMENT >= 64 +#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ ) +#else #define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) ) - -/* For AVX2 */ -#if ARCH_X86 || ARCH_X86_64 -#define NATIVE_ALIGN 32 -#define ALIGNED_N ALIGNED_32 -#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32 +#endif #else #define NATIVE_ALIGN 16 -#define ALIGNED_N ALIGNED_16 -#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16 +#define ALIGNED_32 ALIGNED_16 +#define ALIGNED_64 ALIGNED_16 +#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16 +#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16 #endif #if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) diff --git a/library/src/main/cpp/libx264/common/pixel.c b/library/src/main/cpp/libx264/common/pixel.c index c5edc9e..00c1412 100644 --- a/library/src/main/cpp/libx264/common/pixel.c +++ b/library/src/main/cpp/libx264/common/pixel.c @@ -201,28 +201,32 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 ) /**************************************************************************** * pixel_var2_wxh ****************************************************************************/ -#define PIXEL_VAR2_C( name, w, h, shift ) \ -static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \ +#define PIXEL_VAR2_C( name, h, shift ) \ +static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \ { \ - int var = 0, sum = 0, sqr = 0; \ + int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \ for( int y = 0; y < h; y++ ) \ { \ - for( int x = 0; x < w; x++ ) \ + for( int x = 0; x < 8; x++ ) \ { \ - int diff = pix1[x] - pix2[x]; \ - sum += diff; \ - sqr += diff * diff; \ + int diff_u = fenc[x] - fdec[x]; \ + int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \ + sum_u += diff_u; \ + sum_v += diff_v; \ + sqr_u += diff_u * diff_u; \ + sqr_v += diff_v * diff_v; \ } \ - pix1 += i_stride1; \ - pix2 += i_stride2; \ + fenc += FENC_STRIDE; \ + fdec += FDEC_STRIDE; \ } \ - var = sqr - ((int64_t)sum * sum >> shift); \ - *ssd = sqr; \ - return var; \ + ssd[0] = sqr_u; \ + ssd[1] = sqr_v; \ + return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \ + sqr_v - ((int64_t)sum_v * sum_v >> shift); \ } -PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 ) -PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8, 6 ) +PIXEL_VAR2_C( x264_pixel_var2_8x16, 16, 7 ) +PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 6 ) #if BIT_DEPTH > 8 typedef uint32_t sum_t; @@ -885,13 +889,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT8( ssd, _mmx2 ); INIT_ADS( _mmx2 ); - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2; -#if ARCH_X86 - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2; -#endif - pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2; pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2; @@ -962,7 +959,9 @@ void x264_pixel_init( int cpu, 
x264_pixel_function_t *pixf ) INIT7( sad, _ssse3 ); INIT7( sad_x3, _ssse3 ); INIT7( sad_x4, _ssse3 ); +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _ssse3 ); +#endif INIT6( satd, _ssse3 ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3; @@ -1003,7 +1002,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) if( cpu&X264_CPU_AVX ) { INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */ +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _avx ); +#endif INIT6( satd, _avx ); pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx; if( !(cpu&X264_CPU_STACK_MOD4) ) @@ -1028,8 +1029,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT5( sad_x3, _xop ); INIT5( sad_x4, _xop ); pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; pixf->vsad = x264_pixel_vsad_xop; pixf->asd8 = x264_pixel_asd8_xop; #if ARCH_X86_64 @@ -1044,10 +1043,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x3, _avx2 ); INIT2( sad_x4, _avx2 ); pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; pixf->vsad = x264_pixel_vsad_avx2; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2; pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2; } + if( cpu&X264_CPU_AVX512 ) + { + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; + } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH #if HAVE_MMX @@ -1067,16 +1075,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT7( satd_x4, _mmx2 ); INIT4( hadamard_ac, _mmx2 ); INIT_ADS( _mmx2 ); - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2; - pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2; #if ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2; pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2; pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2; - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2; pixf->vsad = x264_pixel_vsad_mmx2; if( cpu&X264_CPU_CACHELINE_32 ) @@ -1197,7 +1200,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3; #endif } +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _ssse3 ); +#endif if( cpu&X264_CPU_SLOW_ATOM ) { pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom; @@ -1280,7 +1285,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT8( satd, _avx ); INIT7( satd_x3, _avx ); INIT7( satd_x4, _avx ); +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _avx ); +#endif INIT4( hadamard_ac, _avx ); if( !(cpu&X264_CPU_STACK_MOD4) ) { @@ -1321,11 +1328,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop; pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop; pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop; - pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop; - pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop; - pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop; - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop; #if ARCH_X86_64 
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop; #endif @@ -1338,7 +1340,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x4, _avx2 ); INIT4( satd, _avx2 ); INIT2( hadamard_ac, _avx2 ); +#if ARCH_X86 || !defined( __MACH__ ) INIT_ADS( _avx2 ); +#endif pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2; pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2; pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2; @@ -1351,6 +1355,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2; #endif } + + if( cpu&X264_CPU_AVX512 ) + { + INIT8( sad, _avx512 ); + INIT8_NAME( sad_aligned, sad, _avx512 ); + INIT7( sad_x3, _avx512 ); + INIT7( sad_x4, _avx512 ); + INIT8( satd, _avx512 ); + pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512; + pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512; + pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512; + pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512; + pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512; + pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512; + } #endif //HAVE_MMX #if HAVE_ARMV6 @@ -1480,8 +1499,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa; pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa; pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa; - pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; - pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; + //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa; + //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8; } diff --git a/library/src/main/cpp/libx264/common/pixel.h b/library/src/main/cpp/libx264/common/pixel.h index f634312..d4dbfaf 100644 --- a/library/src/main/cpp/libx264/common/pixel.h +++ b/library/src/main/cpp/libx264/common/pixel.h @@ -93,8 +93,7 @@ typedef struct uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 ); uint64_t (*var[4])( pixel *pix, intptr_t stride ); - int (*var2[4])( pixel *pix1, intptr_t stride1, - pixel *pix2, intptr_t stride2, int *ssd ); + int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] ); uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride ); void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1, diff --git a/library/src/main/cpp/libx264/common/ppc/dct.c b/library/src/main/cpp/libx264/common/ppc/dct.c index 768f390..2858bd0 100644 --- a/library/src/main/cpp/libx264/common/ppc/dct.c +++ b/library/src/main/cpp/libx264/common/ppc/dct.c @@ -293,12 +293,8 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix vec_vsx_st( dcvsum8, 0, dest ); \ } -static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 ) +static void idct8_dc_altivec( uint8_t *dst, vec_s16_t dcv ) { - dc1 = (dc1 + 32) >> 6; - dc2 = (dc2 + 32) >> 6; - vec_s16_t dcv = { dc1, dc1, dc1, dc1, dc2, dc2, dc2, dc2 }; - LOAD_ZERO; ALTIVEC_STORE8_DC_SUM_CLIP( &dst[0*FDEC_STRIDE], dcv ); ALTIVEC_STORE8_DC_SUM_CLIP( &dst[1*FDEC_STRIDE], dcv ); @@ -308,8 +304,18 @@ static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 ) void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] ) { - idct8_dc_altivec( &p_dst[0], dct[0], dct[1] ); - idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2], dct[3] ); + vec_s16_t dcv; + vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) ); + vec_u16_t v6 = vec_splat_u16( 6 ); + vec_s16_t dctv = vec_vsx_ld( 0, dct ); + + 
dctv = vec_sra( vec_add( dctv, v32 ), v6 ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv ); + idct8_dc_altivec( &p_dst[0], dcv ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) ); + dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv ); + idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv ); } #define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \ diff --git a/library/src/main/cpp/libx264/common/ppc/mc.c b/library/src/main/cpp/libx264/common/ppc/mc.c index 7fec750..6b24873 100644 --- a/library/src/main/cpp/libx264/common/ppc/mc.c +++ b/library/src/main/cpp/libx264/common/ppc/mc.c @@ -32,19 +32,6 @@ typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src, uint8_t *dst, intptr_t i_dst, int i_height ); -static inline int x264_tapfilter( uint8_t *pix, int i_pix_next ) -{ - return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] + - pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] + - pix[ 3*i_pix_next]; -} - -static inline int x264_tapfilter1( uint8_t *pix ) -{ - return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] + - pix[ 3]; -} - static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src1, intptr_t i_src1, uint8_t *src2, int i_height ) diff --git a/library/src/main/cpp/libx264/common/quant.c b/library/src/main/cpp/libx264/common/quant.c index 7eef140..ae96222 100644 --- a/library/src/main/cpp/libx264/common/quant.c +++ b/library/src/main/cpp/libx264/common/quant.c @@ -460,9 +460,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) { #if ARCH_X86 pf->denoise_dct = x264_denoise_dct_mmx; - pf->decimate_score15 = x264_decimate_score15_mmx2; - pf->decimate_score16 = x264_decimate_score16_mmx2; - pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; @@ -473,8 +470,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) #endif pf->coeff_last4 = x264_coeff_last4_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; } if( cpu&X264_CPU_SSE2 ) { @@ -499,17 +494,18 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_level_run8 = x264_coeff_level_run8_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt; - pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; - pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; - } + } + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last4 = x264_coeff_last4_lzcnt; + pf->coeff_last8 = x264_coeff_last8_lzcnt; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; + pf->coeff_level_run8 = 
x264_coeff_level_run8_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) { @@ -557,8 +553,20 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->dequant_8x8 = x264_dequant_8x8_avx2; pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2; pf->denoise_dct = x264_denoise_dct_avx2; - if( cpu&X264_CPU_LZCNT ) - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; + } + if( cpu&X264_CPU_AVX512 ) + { + pf->dequant_4x4 = x264_dequant_4x4_avx512; + pf->dequant_8x8 = x264_dequant_8x8_avx512; + pf->decimate_score15 = x264_decimate_score15_avx512; + pf->decimate_score16 = x264_decimate_score16_avx512; + pf->decimate_score64 = x264_decimate_score64_avx512; + pf->coeff_last4 = x264_coeff_last4_avx512; + pf->coeff_last8 = x264_coeff_last8_avx512; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX #else // !HIGH_BIT_DEPTH @@ -586,9 +594,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->quant_4x4 = x264_quant_4x4_mmx2; pf->quant_8x8 = x264_quant_8x8_mmx2; pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2; - pf->decimate_score15 = x264_decimate_score15_mmx2; - pf->decimate_score16 = x264_decimate_score16_mmx2; - pf->decimate_score64 = x264_decimate_score64_mmx2; pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2; pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2; pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2; @@ -599,13 +604,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_last8 = x264_coeff_last8_mmx2; pf->coeff_level_run4 = x264_coeff_level_run4_mmx2; pf->coeff_level_run8 = x264_coeff_level_run8_mmx2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt; - pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt; - pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt; - pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt; - } } if( cpu&X264_CPU_SSE2 ) @@ -634,14 +632,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2; - if( cpu&X264_CPU_LZCNT ) - { - pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt; - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt; - } + } + + if( cpu&X264_CPU_LZCNT ) + { + pf->coeff_last4 = x264_coeff_last4_lzcnt; + pf->coeff_last8 = x264_coeff_last8_lzcnt; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt; + pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt; + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt; } if( cpu&X264_CPU_SSSE3 ) @@ -657,17 +660,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) pf->decimate_score16 = 
x264_decimate_score16_ssse3; pf->decimate_score64 = x264_decimate_score64_ssse3; INIT_TRELLIS( ssse3 ); +#if ARCH_X86 || !defined( __MACH__ ) pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3; if( cpu&X264_CPU_LZCNT ) { - pf->coeff_level_run4 = x264_coeff_level_run4_ssse3; - pf->coeff_level_run8 = x264_coeff_level_run8_ssse3; + pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt; + pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt; pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt; pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt; } +#endif } if( cpu&X264_CPU_SSE4 ) @@ -717,12 +722,28 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf ) } pf->decimate_score64 = x264_decimate_score64_avx2; pf->denoise_dct = x264_denoise_dct_avx2; - if( cpu&X264_CPU_LZCNT ) + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2; +#if ARCH_X86 || !defined( __MACH__ ) + pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2; + pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2; +#endif + } + if( cpu&X264_CPU_AVX512 ) + { + if( h->param.i_cqm_preset == X264_CQM_FLAT ) + pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512; + else { - pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt; - pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt; - pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt; + pf->dequant_4x4 = x264_dequant_4x4_avx512; + pf->dequant_8x8 = x264_dequant_8x8_avx512; } + pf->decimate_score15 = x264_decimate_score15_avx512; + pf->decimate_score16 = x264_decimate_score16_avx512; + pf->decimate_score64 = x264_decimate_score64_avx512; + pf->coeff_last8 = x264_coeff_last8_avx512; + pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512; + pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512; + pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512; } #endif // HAVE_MMX diff --git a/library/src/main/cpp/libx264/common/x86/cabac-a.asm b/library/src/main/cpp/libx264/common/x86/cabac-a.asm index d7870a3..49bca50 100644 --- a/library/src/main/cpp/libx264/common/x86/cabac-a.asm +++ b/library/src/main/cpp/libx264/common/x86/cabac-a.asm @@ -53,21 +53,32 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7 %endmacro cextern coeff_last4_mmx2 -cextern coeff_last4_mmx2_lzcnt +cextern coeff_last4_lzcnt +%if HIGH_BIT_DEPTH +cextern coeff_last4_avx512 +%endif cextern coeff_last15_sse2 -cextern coeff_last15_sse2_lzcnt +cextern coeff_last15_lzcnt +cextern coeff_last15_avx512 cextern coeff_last16_sse2 -cextern coeff_last16_sse2_lzcnt +cextern coeff_last16_lzcnt +cextern coeff_last16_avx512 cextern coeff_last64_sse2 -cextern coeff_last64_sse2_lzcnt -cextern coeff_last64_avx2_lzcnt +cextern coeff_last64_lzcnt +cextern coeff_last64_avx2 +cextern coeff_last64_avx512 %ifdef PIC SECTION .data %endif -coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 -coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_lzcnt: 
COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%if HIGH_BIT_DEPTH +coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%else +coeff_last_avx512: COEFF_LAST_TABLE lzcnt, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64 +%endif %endif SECTION .text @@ -100,7 +111,7 @@ struc cb .start: pointer 1 .p: pointer 1 .end: pointer 1 - align 16, resb 1 + align 64, resb 1 .bits_encoded: resd 1 .state: resb 1024 endstruc @@ -352,25 +363,33 @@ CABAC bmi2 %endmacro %macro ABS_DCTCOEFS 2 -%assign i 0 -%rep %2/16 %if HIGH_BIT_DEPTH - ABSD m0, [%1+ 0+i*64], m4 - ABSD m1, [%1+16+i*64], m5 - ABSD m2, [%1+32+i*64], m4 - ABSD m3, [%1+48+i*64], m5 - mova [rsp+ 0+i*64], m0 - mova [rsp+16+i*64], m1 - mova [rsp+32+i*64], m2 - mova [rsp+48+i*64], m3 + %define %%abs ABSD %else - ABSW m0, [%1+ 0+i*32], m2 - ABSW m1, [%1+16+i*32], m3 - mova [rsp+ 0+i*32], m0 - mova [rsp+16+i*32], m1 -%endif + %define %%abs ABSW +%endif +%if mmsize == %2*SIZEOF_DCTCOEF + %%abs m0, [%1], m1 + mova [rsp], m0 +%elif mmsize == %2*SIZEOF_DCTCOEF/2 + %%abs m0, [%1+0*mmsize], m2 + %%abs m1, [%1+1*mmsize], m3 + mova [rsp+0*mmsize], m0 + mova [rsp+1*mmsize], m1 +%else +%assign i 0 +%rep %2*SIZEOF_DCTCOEF/(4*mmsize) + %%abs m0, [%1+(4*i+0)*mmsize], m4 + %%abs m1, [%1+(4*i+1)*mmsize], m5 + %%abs m2, [%1+(4*i+2)*mmsize], m4 + %%abs m3, [%1+(4*i+3)*mmsize], m5 + mova [rsp+(4*i+0)*mmsize], m0 + mova [rsp+(4*i+1)*mmsize], m1 + mova [rsp+(4*i+2)*mmsize], m2 + mova [rsp+(4*i+3)*mmsize], m3 %assign i i+1 %endrep +%endif %endmacro %macro SIG_OFFSET 1 @@ -403,16 +422,14 @@ CABAC bmi2 %endif %ifdef PIC - cglobal func, 4,13 + cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF lea r12, [$$] %define GLOBAL +r12-$$ %else - cglobal func, 4,12 + cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF %define GLOBAL %endif -%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15) - SUB rsp, pad shl r1d, 4 ; MB_INTERLACED*16 %if %1 lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8 @@ -429,15 +446,13 @@ CABAC bmi2 ABS_DCTCOEFS r0, 64 %else mov r4, r0 ; r4 = dct - mov r6, ~SIZEOF_DCTCOEF - and r6, r4 ; handle AC coefficient case - ABS_DCTCOEFS r6, 16 - sub r4, r6 ; calculate our new dct pointer + and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case + ABS_DCTCOEFS r4, 16 + xor r4, r0 ; calculate our new dct pointer add r4, rsp ; restore AC coefficient offset %endif - mov r1, [%2+gprsize*r2 GLOBAL] ; for improved OOE performance, run coeff_last on the original coefficients. - call r1 ; coeff_last[ctx_block_cat]( dct ) + call [%2+gprsize*r2 GLOBAL] ; coeff_last[ctx_block_cat]( dct ) ; we know on 64-bit that the SSE2 versions of this function only ; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we ; don't need r2 in 8x8 mode. 
@@ -521,7 +536,6 @@ CABAC bmi2 jge .coeff_loop .end: mov [r3+cb.bits_encoded-cb.state], r0d - ADD rsp, pad RET %endmacro @@ -529,15 +543,23 @@ CABAC bmi2 INIT_XMM sse2 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 -INIT_XMM sse2,lzcnt -CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt -CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +INIT_XMM lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_lzcnt INIT_XMM ssse3 CABAC_RESIDUAL_RD 0, coeff_last_sse2 CABAC_RESIDUAL_RD 1, coeff_last_sse2 INIT_XMM ssse3,lzcnt -CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt -CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt +CABAC_RESIDUAL_RD 0, coeff_last_lzcnt +CABAC_RESIDUAL_RD 1, coeff_last_lzcnt +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +%else +INIT_YMM avx512 +%endif +CABAC_RESIDUAL_RD 0, coeff_last_avx512 +INIT_ZMM avx512 +CABAC_RESIDUAL_RD 1, coeff_last_avx512 %endif ;----------------------------------------------------------------------------- @@ -615,7 +637,7 @@ CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt %endmacro %macro CABAC_RESIDUAL 1 -cglobal cabac_block_residual_internal, 4,15 +cglobal cabac_block_residual_internal, 4,15,0,-4*64 %ifdef PIC ; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register. lea r7, [$$] @@ -625,8 +647,6 @@ cglobal cabac_block_residual_internal, 4,15 %define lastm r7d %define GLOBAL %endif -%assign pad gprsize+4*2+4*64-(stack_offset&15) - SUB rsp, pad shl r1d, 4 %define sigoffq r8 @@ -653,8 +673,7 @@ cglobal cabac_block_residual_internal, 4,15 mov dct, r0 mov leveloffm, leveloffd - mov r1, [%1+gprsize*r2 GLOBAL] - call r1 + call [%1+gprsize*r2 GLOBAL] mov lastm, eax ; put cabac in r0; needed for cabac_encode_decision mov r0, r3 @@ -742,15 +761,16 @@ cglobal cabac_block_residual_internal, 4,15 %endif dec coeffidxd jge .level_loop - ADD rsp, pad RET %endmacro %if ARCH_X86_64 INIT_XMM sse2 CABAC_RESIDUAL coeff_last_sse2 -INIT_XMM sse2,lzcnt -CABAC_RESIDUAL coeff_last_sse2_lzcnt -INIT_XMM avx2,bmi2 -CABAC_RESIDUAL coeff_last_avx2_lzcnt +INIT_XMM lzcnt +CABAC_RESIDUAL coeff_last_lzcnt +INIT_XMM avx2 +CABAC_RESIDUAL coeff_last_avx2 +INIT_XMM avx512 +CABAC_RESIDUAL coeff_last_avx512 %endif diff --git a/library/src/main/cpp/libx264/common/x86/cpu-a.asm b/library/src/main/cpp/libx264/common/x86/cpu-a.asm index c961903..4692f65 100644 --- a/library/src/main/cpp/libx264/common/x86/cpu-a.asm +++ b/library/src/main/cpp/libx264/common/x86/cpu-a.asm @@ -53,18 +53,16 @@ cglobal cpu_cpuid, 5,7 RET ;----------------------------------------------------------------------------- -; void cpu_xgetbv( int op, int *eax, int *edx ) +; uint64_t cpu_xgetbv( int xcr ) ;----------------------------------------------------------------------------- -cglobal cpu_xgetbv, 3,7 - push r2 - push r1 - mov ecx, r0d +cglobal cpu_xgetbv + movifnidn ecx, r0m xgetbv - pop r4 - mov [r4], eax - pop r4 - mov [r4], edx - RET +%if ARCH_X86_64 + shl rdx, 32 + or rax, rdx +%endif + ret %if ARCH_X86_64 @@ -77,7 +75,7 @@ cglobal stack_align %if WIN64 sub rsp, 32 ; shadow space %endif - and rsp, ~31 + and rsp, ~(STACK_ALIGNMENT-1) mov rax, r0 mov r0, r1 mov r1, r2 @@ -118,7 +116,7 @@ cglobal stack_align push ebp mov ebp, esp sub esp, 12 - and esp, ~31 + and esp, ~(STACK_ALIGNMENT-1) mov ecx, [ebp+8] mov edx, [ebp+12] mov [esp], edx diff --git a/library/src/main/cpp/libx264/common/x86/dct-a.asm b/library/src/main/cpp/libx264/common/x86/dct-a.asm index c2f8973..33ed061 100644 --- a/library/src/main/cpp/libx264/common/x86/dct-a.asm +++ 
b/library/src/main/cpp/libx264/common/x86/dct-a.asm @@ -30,7 +30,41 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 +; AVX-512 permutation indices are bit-packed to save cache +%if HIGH_BIT_DEPTH +scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame + dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1 + dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2 + dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3 + ; bits 19-23: 8x8_frame4 +scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1 + dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2 + dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3 + dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4 +cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1 + dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2 + dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3 + dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4 +%else +dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec + dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec + dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2 + dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather +scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame + dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1 + dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2 + dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30 +scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1 + dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2 + dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a + dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2 +cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1 + dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2 + dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd + dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd +%endif + pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1 pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15 pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15 @@ -580,6 +614,217 @@ cglobal sub16x16_dct, 3,3,6 DCT4_1D 0, 1, 2, 3, 4 STORE16_DCT_AVX2 0, 1, 2, 3, 4 ret + +%macro DCT4x4_AVX512 0 + psubw m0, m2 ; 0 1 + psubw m1, m3 ; 3 2 + SUMSUB_BA w, 1, 0, 2 + SBUTTERFLY wd, 1, 0, 2 + paddw m2, m1, m0 + psubw m3, m1, m0 + paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1 + psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3 + shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2 + punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1 + SUMSUB_BA w, 1, 2, 3 + shufps m3, m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 b1-b2 d1-d2 + shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3 + paddw m2, m1, m3 + psubw m0, m1, m3 + paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1 + 
psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3' +%endmacro + +INIT_XMM avx512 +cglobal sub4x4_dct + mov eax, 0xf0aa + kmovw k1, eax + PROLOGUE 3,3 + movd m0, [r1+0*FENC_STRIDE] + movd m2, [r2+0*FDEC_STRIDE] + vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE] + vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE] + movd m1, [r1+3*FENC_STRIDE] + movd m3, [r2+3*FDEC_STRIDE] + vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE] + vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE] + kshiftrw k2, k1, 8 + pxor m4, m4 + punpcklbw m0, m4 + punpcklbw m2, m4 + punpcklbw m1, m4 + punpcklbw m3, m4 + DCT4x4_AVX512 + mova [r0], m2 + mova [r0+16], m0 + RET + +INIT_ZMM avx512 +cglobal dct4x4x4_internal + punpcklbw m0, m1, m4 + punpcklbw m2, m3, m4 + punpckhbw m1, m4 + punpckhbw m3, m4 + DCT4x4_AVX512 + mova m1, m2 + vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0 + vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1 + ret + +%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2 + movu %1, [r1+%3*FENC_STRIDE] + vpermt2d %1, %2, [r1+%4*FENC_STRIDE] +%endmacro + +%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2 + movu %1, [r2+(%4 )*FDEC_STRIDE] + vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE] + movu %3, [r2+(%5 )*FDEC_STRIDE] + vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE] + vpermt2d %1, %2, %3 +%endmacro + +cglobal sub8x8_dct, 3,3 + mova m0, [dct_avx512] + DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3 + mov r1d, 0xaaaaaaaa + kmovd k1, r1d + psrld m0, 5 + DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4 + mov r1d, 0xf0f0f0f0 + kmovd k2, r1d + pxor xm4, xm4 + knotw k3, k2 + call dct4x4x4_internal_avx512 + mova [r0], m0 + mova [r0+64], m1 + RET + +%macro SUB4x16_DCT_AVX512 2 ; dst, src + vpermd m1, m5, [r1+1*%2*64] + mova m3, [r2+2*%2*64] + vpermt2d m3, m6, [r2+2*%2*64+64] + call dct4x4x4_internal_avx512 + mova [r0+%1*64 ], m0 + mova [r0+%1*64+128], m1 +%endmacro + +cglobal sub16x16_dct + psrld m5, [dct_avx512], 10 + mov eax, 0xaaaaaaaa + kmovd k1, eax + mov eax, 0xf0f0f0f0 + kmovd k2, eax + PROLOGUE 3,3 + pxor xm4, xm4 + knotw k3, k2 + psrld m6, m5, 4 + SUB4x16_DCT_AVX512 0, 0 + SUB4x16_DCT_AVX512 1, 1 + SUB4x16_DCT_AVX512 4, 2 + SUB4x16_DCT_AVX512 5, 3 + RET + +cglobal sub8x8_dct_dc, 3,3 + mova m3, [dct_avx512] + DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3 + mov r1d, 0xaa + kmovb k1, r1d + psrld m3, 5 + DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4 + pxor xm3, xm3 + psadbw m0, m3 + psadbw m1, m3 + psubw m0, m1 + vpmovqw xmm0, m0 + vprold xmm1, xmm0, 16 + paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3 + punpckhqdq xmm2, xmm0, xmm0 + psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3 + paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3 + punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3 + punpcklqdq xmm1, xmm0, xmm0 + psubw xmm0 {k1}, xm3, xmm0 + paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3 + movhps [r0], xmm0 + RET + +cglobal sub8x16_dct_dc, 3,3 + mova m5, [dct_avx512] + DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5 + DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7 + mov r1d, 0xaa + kmovb k1, r1d + psrld m5, 5 + DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8 + DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12 + pxor xm4, xm4 + psadbw m0, m4 + psadbw m1, m4 + psadbw m2, m4 + psadbw m3, m4 + psubw m0, m2 + psubw m1, m3 + SBUTTERFLY qdq, 0, 1, 2 + paddw m0, m1 + vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7 + psrlq xmm2, xmm0, 32 + psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7 + paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7 + punpckhdq xmm2, xmm0, xmm1 + punpckldq xmm0, xmm1 + psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7 + paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7 + punpcklwd xmm0, xmm1 + psrlq xmm2, xmm0, 32 + psubw 
xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7 + paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7 + shufps xmm0, xmm1, q0220 + mova [r0], xmm0 + RET + +%macro SARSUMSUB 3 ; a, b, tmp + mova m%3, m%1 + vpsraw m%1 {k1}, 1 + psubw m%1, m%2 ; 0-2 1>>1-3 + vpsraw m%2 {k1}, 1 + paddw m%2, m%3 ; 0+2 1+3>>1 +%endmacro + +cglobal add8x8_idct, 2,2 + mova m1, [r1] + mova m2, [r1+64] + mova m3, [dct_avx512] + vbroadcasti32x4 m4, [pw_32] + mov r1d, 0xf0f0f0f0 + kxnorb k2, k2, k2 + kmovd k1, r1d + kmovb k3, k2 + vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d + vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f + psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE + vpgatherqq m6 {k2}, [r0+m5] + SARSUMSUB 0, 1, 2 + SBUTTERFLY wd, 1, 0, 2 + psrlq m7, m3, 28 + SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3 + vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1 + SBUTTERFLY dq, 0, 1, 2 + psrlq m3, 24 + SARSUMSUB 0, 1, 2 + vpermi2q m3, m1, m0 + vpermt2q m1, m7, m0 + paddw m3, m4 ; += 32 + SUMSUB_BA w, 1, 3, 0 + psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3' + psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3' + pxor xm0, xm0 + SBUTTERFLY bw, 6, 0, 2 + paddsw m1, m6 + paddsw m3, m0 + packuswb m1, m3 + vpscatterqq [r0+m5] {k3}, m1 + RET %endif ; HIGH_BIT_DEPTH INIT_MMX @@ -1883,3 +2128,161 @@ cglobal zigzag_interleave_8x8_cavlc, 3,3,6 mov [r2+8], r0w RET %endif ; !HIGH_BIT_DEPTH + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal zigzag_scan_4x4_frame, 2,2 + mova m0, [scan_frame_avx512] + vpermd m0, m0, [r1] + mova [r0], m0 + RET + +cglobal zigzag_scan_4x4_field, 2,2 + mova m0, [r1] + pshufd xmm1, [r1+8], q3102 + mova [r0], m0 + movu [r0+8], xmm1 + RET + +cglobal zigzag_scan_8x8_frame, 2,2 + psrld m0, [scan_frame_avx512], 4 + mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + mov r1d, 0x01fe7f80 + kmovd k1, r1d + kshiftrd k2, k1, 16 + vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40 + psrld m6, m0, 5 + vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __ + vmovdqa64 m0 {k1}, m5 + mova [r0+0*64], m0 + mova m5, m1 + vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __ + psrld m0, m6, 5 + vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35 + vmovdqa32 m6 {k2}, m1 + mova [r0+1*64], m6 + vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30 + psrld m1, m0, 5 + vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __ + vmovdqa32 m5 {k1}, m0 + mova [r0+2*64], m5 + vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63 + vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __ + vmovdqa64 m2 {k2}, m3 + mova [r0+3*64], m2 + RET + +cglobal zigzag_scan_8x8_field, 2,2 + mova m0, [scan_field_avx512] + mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + mov r1d, 0x3f + kmovb k1, r1d + psrld m5, m0, 5 + vpermi2d m0, m1, m2 + vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15 + vpermt2d m1, m5, m2 + psrld m5, 5 + vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31 + vpermt2d m2, m5, m3 + psrld m5, 5 + vpermt2d m3, m5, m4 + mova [r0+0*64], m0 + mova [r0+1*64], m1 + mova [r0+2*64], m2 + mova [r0+3*64], m3 + RET + +cglobal zigzag_interleave_8x8_cavlc, 3,3 + mova m0, [cavlc_shuf_avx512] + mova m1, [r1+0*64] + mova m2, [r1+1*64] + mova m3, [r1+2*64] + mova m4, [r1+3*64] + kxnorb k1, k1, k1 + por m7, m1, m2 + psrld m5, m0, 5 + vpermi2d m0, m1, m2 ; a0 a1 b0 b1 + vpternlogd m7, 
m3, m4, 0xfe ; m1|m2|m3|m4 + psrld m6, m5, 5 + vpermi2d m5, m3, m4 ; b2 b3 a2 a3 + vptestmd k0, m7, m7 + vpermt2d m1, m6, m2 ; c0 c1 d0 d1 + psrld m6, 5 + vpermt2d m3, m6, m4 ; d2 d3 c2 c3 + vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3 + vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3 + vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3 + vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3 + mova [r0+0*64], m5 + mova [r0+1*64], m2 + mova [r0+2*64], m3 + mova [r0+3*64], m4 + kmovw r1d, k0 + test r1d, 0x1111 + setnz [r2] + test r1d, 0x2222 + setnz [r2+1] + test r1d, 0x4444 + setnz [r2+8] + test r1d, 0x8888 + setnz [r2+9] + RET + +%else ; !HIGH_BIT_DEPTH +INIT_YMM avx512 +cglobal zigzag_scan_4x4_frame, 2,2 + mova m0, [scan_frame_avx512] + vpermw m0, m0, [r1] + mova [r0], m0 + RET + +cglobal zigzag_scan_4x4_field, 2,2 + mova m0, [r1] + pshuflw xmm1, [r1+4], q3102 + mova [r0], m0 + movq [r0+4], xmm1 + RET + +INIT_ZMM avx512 +cglobal zigzag_scan_8x8_frame, 2,2 + psrlw m0, [scan_frame_avx512], 4 +scan8_avx512: + mova m1, [r1] + mova m2, [r1+64] + psrlw m3, m0, 6 + vpermi2w m0, m1, m2 + vpermt2w m1, m3, m2 + mova [r0], m0 + mova [r0+64], m1 + RET + +cglobal zigzag_scan_8x8_field, 2,2 + mova m0, [scan_field_avx512] + jmp scan8_avx512 + +cglobal zigzag_interleave_8x8_cavlc, 3,3 + mova m0, [cavlc_shuf_avx512] + mova m1, [r1] + mova m2, [r1+64] + psrlw m3, m0, 6 + vpermi2w m0, m1, m2 + vpermt2w m1, m3, m2 + kxnorb k2, k2, k2 + vptestmd k0, m0, m0 + vptestmd k1, m1, m1 + mova [r0], m0 + mova [r0+64], m1 + ktestw k2, k0 + setnz [r2] + setnc [r2+1] + ktestw k2, k1 + setnz [r2+8] + setnc [r2+9] + RET +%endif ; !HIGH_BIT_DEPTH diff --git a/library/src/main/cpp/libx264/common/x86/dct.h b/library/src/main/cpp/libx264/common/x86/dct.h index 67221c3..20a65c5 100644 --- a/library/src/main/cpp/libx264/common/x86/dct.h +++ b/library/src/main/cpp/libx264/common/x86/dct.h @@ -34,6 +34,7 @@ void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); @@ -41,12 +42,16 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 ); void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); -void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); -void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); -void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t 
*pix1, uint8_t *pix2 ); +void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 ); +void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 ); +void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 ); +void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 ); void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] ); void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] ); @@ -59,6 +64,7 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] ); void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] ); void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] ); +void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] ); void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] ); void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] ); @@ -101,22 +107,26 @@ void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] ); void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] ); void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] ); -void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); -void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] ); -void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); -void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); -void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); -void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] ); -void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); -void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); -void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); -void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] ); +void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] ); +void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] ); +void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] ); +void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_frame_avx512( 
dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] ); +void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] ); +void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] ); +void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] ); +void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] ); +void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] ); +void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] ); int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); @@ -125,9 +135,10 @@ int x264_zigzag_sub_4x4_field_avx ( int16_t level[16], const uint8_t *src, u int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst ); int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc ); -void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); -void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz ); -void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); -void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz ); +void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz ); #endif diff --git a/library/src/main/cpp/libx264/common/x86/deblock-a.asm b/library/src/main/cpp/libx264/common/x86/deblock-a.asm index 9790fd2..917119b 100644 --- a/library/src/main/cpp/libx264/common/x86/deblock-a.asm +++ b/library/src/main/cpp/libx264/common/x86/deblock-a.asm @@ -28,10 +28,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 - -load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15 -insert_top_shuf: dd 0,1,4,5,7,2,3,6 +SECTION_RODATA 64 + +load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5 + dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9 + dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5 + dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9 +load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c + dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15 SECTION .text @@ -906,9 +910,8 @@ DEBLOCK_LUMA_INTRA movq m3, %4 punpcklwd m0, m2 punpcklwd m1, m3 - mova m2, m0 + punpckhdq m2, m0, m1 punpckldq m0, m1 - punpckhdq m2, m1 movq m4, %5 movq m6, %6 @@ -916,9 +919,8 @@ DEBLOCK_LUMA_INTRA movq m7, %8 punpcklwd m4, m6 punpcklwd m5, m7 - mova m6, m4 + punpckhdq m6, m4, m5 punpckldq m4, m5 - punpckhdq m6, m5 punpckhqdq m1, m0, m4 punpckhqdq m3, m2, m6 @@ -2278,13 +2280,10 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8 RET %endif ; 
!HIGH_BIT_DEPTH - - ;----------------------------------------------------------------------------- ; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2], ; uint8_t bs[2][4][4], int mvy_limit, int bframe ) ;----------------------------------------------------------------------------- - %define scan8start (4+1*8) %define nnz r0+scan8start %define ref r1+scan8start @@ -2292,145 +2291,54 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8 %define bs0 r3 %define bs1 r3+32 -%macro LOAD_BYTES_MMX 1 - movd m2, [%1+8*0-1] - movd m0, [%1+8*0] - movd m3, [%1+8*2-1] - movd m1, [%1+8*2] - punpckldq m2, [%1+8*1-1] - punpckldq m0, [%1+8*1] - punpckldq m3, [%1+8*3-1] - punpckldq m1, [%1+8*3] -%endmacro - -%macro DEBLOCK_STRENGTH_REFS_MMX 0 - LOAD_BYTES_MMX ref - pxor m2, m0 - pxor m3, m1 - por m2, [bs0+0] - por m3, [bs0+8] - movq [bs0+0], m2 - movq [bs0+8], m3 - - movd m2, [ref-8*1] - movd m3, [ref+8*1] - punpckldq m2, m0 ; row -1, row 0 - punpckldq m3, m1 ; row 1, row 2 - pxor m0, m2 - pxor m1, m3 - por m0, [bs1+0] - por m1, [bs1+8] - movq [bs1+0], m0 - movq [bs1+8], m1 -%endmacro - -%macro DEBLOCK_STRENGTH_MVS_MMX 2 - mova m0, [mv-%2] - mova m1, [mv-%2+8] - psubw m0, [mv] - psubw m1, [mv+8] - packsswb m0, m1 - ABSB m0, m1 - psubusb m0, m7 - packsswb m0, m0 - por m0, [%1] - movd [%1], m0 -%endmacro - -%macro DEBLOCK_STRENGTH_NNZ_MMX 1 - por m2, m0 - por m3, m1 - mova m4, [%1] - mova m5, [%1+8] - pminub m2, m6 - pminub m3, m6 - pminub m4, m6 ; mv ? 1 : 0 - pminub m5, m6 - paddb m2, m2 ; nnz ? 2 : 0 - paddb m3, m3 - pmaxub m2, m4 - pmaxub m3, m5 -%endmacro - -%macro LOAD_BYTES_XMM 1 - movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation +%macro LOAD_BYTES_XMM 2 ; src, aligned +%if %2 + mova m2, [%1-4] + mova m1, [%1+12] +%else + movu m2, [%1-4] movu m1, [%1+12] - pslldq m0, m2, 1 +%endif + psllq m0, m2, 8 shufps m2, m1, q3131 ; cur nnz, all rows - pslldq m1, 1 + psllq m1, 8 shufps m0, m1, q3131 ; left neighbors +%if cpuflag(avx) || (%2 && cpuflag(ssse3)) + palignr m1, m2, [%1-20], 12 +%else pslldq m1, m2, 4 - movd m3, [%1-8] ; could be palignr if nnz was aligned + movd m3, [%1-8] por m1, m3 ; top neighbors +%endif %endmacro -INIT_MMX mmx2 -cglobal deblock_strength, 6,6 - ; Prepare mv comparison register - shl r4d, 8 - add r4d, 3 - (1<<8) - movd m7, r4d - SPLATW m7, m7 - mova m6, [pb_1] - pxor m0, m0 - mova [bs0+0], m0 - mova [bs0+8], m0 - mova [bs1+0], m0 - mova [bs1+8], m0 - -.lists: - DEBLOCK_STRENGTH_REFS_MMX - mov r4d, 4 -.mvs: - DEBLOCK_STRENGTH_MVS_MMX bs0, 4 - DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8 - add r2, 4*8 - add r3, 4 - dec r4d - jg .mvs - add r1, 40 - add r2, 4*8 - sub r3, 16 - dec r5d - jge .lists - - ; Check nnz - LOAD_BYTES_MMX nnz - DEBLOCK_STRENGTH_NNZ_MMX bs0 - ; Transpose column output - SBUTTERFLY bw, 2, 3, 4 - SBUTTERFLY bw, 2, 3, 4 - mova [bs0+0], m2 - mova [bs0+8], m3 - movd m2, [nnz-8*1] - movd m3, [nnz+8*1] - punpckldq m2, m0 ; row -1, row 0 - punpckldq m3, m1 ; row 1, row 2 - DEBLOCK_STRENGTH_NNZ_MMX bs1 - mova [bs1+0], m2 - mova [bs1+8], m3 - RET +%if UNIX64 + DECLARE_REG_TMP 5 +%else + DECLARE_REG_TMP 4 +%endif %macro DEBLOCK_STRENGTH_XMM 0 -cglobal deblock_strength, 6,6,7 +cglobal deblock_strength, 5,5,7 ; Prepare mv comparison register shl r4d, 8 add r4d, 3 - (1<<8) movd m6, r4d + movifnidn t0d, r5m SPLATW m6, m6 pxor m4, m4 ; bs0 pxor m5, m5 ; bs1 .lists: ; Check refs - LOAD_BYTES_XMM ref + LOAD_BYTES_XMM ref, 0 pxor m0, m2 pxor m1, m2 por m4, m0 por m5, m1 ; Check mvs -%if cpuflag(ssse3) +%if cpuflag(ssse3) && notcpuflag(avx) 
mova m0, [mv+4*8*0] mova m1, [mv+4*8*1] palignr m3, m0, [mv+4*8*0-16], 12 @@ -2483,11 +2391,11 @@ cglobal deblock_strength, 6,6,7 por m5, m0 add r1, 40 add r2, 4*8*5 - dec r5d + dec t0d jge .lists ; Check nnz - LOAD_BYTES_XMM nnz + LOAD_BYTES_XMM nnz, 1 por m0, m2 por m1, m2 mova m6, [pb_1] @@ -2520,68 +2428,121 @@ INIT_XMM avx DEBLOCK_STRENGTH_XMM %macro LOAD_BYTES_YMM 1 - movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX - pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX - mova m2, [insert_top_shuf] - vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 - vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS - vpbroadcastd m2, [%1-8] ; ABCD .... - vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS + movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX + pshufb m0, m6 ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX + vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2 + vpbroadcastd m2, [%1-8] ; ABCD .... + vpblendd m0, m0, m2, 0x80 + vpermd m0, m7, m0 ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS %endmacro INIT_YMM avx2 -cglobal deblock_strength, 6,6,7 +cglobal deblock_strength, 5,5,8 + mova m6, [load_bytes_ymm_shuf] ; Prepare mv comparison register - shl r4d, 8 - add r4d, 3 - (1<<8) - movd xm6, r4d - vpbroadcastw m6, xm6 - pxor m5, m5 ; bs0,bs1 + shl r4d, 8 + add r4d, 3 - (1<<8) + movd xm5, r4d + movifnidn t0d, r5m + vpbroadcastw m5, xm5 + psrld m7, m6, 4 + pxor m4, m4 ; bs0,bs1 .lists: ; Check refs LOAD_BYTES_YMM ref - pxor m0, m1 - por m5, m0 + pxor m0, m1 + por m4, m0 ; Check mvs - movu xm0, [mv-4+4*8*0] - vinserti128 m0, m0, [mv+4*8*-1], 1 - vbroadcasti128 m2, [mv+4*8* 0] - vinserti128 m1, m2, [mv-4+4*8*1], 0 - vbroadcasti128 m3, [mv+4*8* 1] - psubw m0, m2 - psubw m1, m3 - - vinserti128 m2, m3, [mv-4+4*8*2], 0 - vbroadcasti128 m4, [mv+4*8* 2] - vinserti128 m3, m4, [mv-4+4*8*3], 0 - psubw m2, m4 - vbroadcasti128 m4, [mv+4*8* 3] - psubw m3, m4 - packsswb m0, m1 - packsswb m2, m3 - pabsb m0, m0 - pabsb m2, m2 - psubusb m0, m6 - psubusb m2, m6 - packsswb m0, m2 - por m5, m0 - - add r1, 40 - add r2, 4*8*5 - dec r5d + movu xm0, [mv+0*4*8-4] + vinserti128 m0, m0, [mv-1*4*8 ], 1 + vbroadcasti128 m2, [mv+0*4*8 ] + vinserti128 m1, m2, [mv+1*4*8-4], 0 + psubw m0, m2 + vbroadcasti128 m2, [mv+1*4*8 ] + psubw m1, m2 + packsswb m0, m1 + vinserti128 m1, m2, [mv+2*4*8-4], 0 + vbroadcasti128 m3, [mv+2*4*8 ] + vinserti128 m2, m3, [mv+3*4*8-4], 0 + psubw m1, m3 + vbroadcasti128 m3, [mv+3*4*8 ] + psubw m2, m3 + packsswb m1, m2 + pabsb m0, m0 + pabsb m1, m1 + psubusb m0, m5 + psubusb m1, m5 + packsswb m0, m1 + por m4, m0 + add r1, 40 + add r2, 4*8*5 + dec t0d jge .lists ; Check nnz LOAD_BYTES_YMM nnz - por m0, m1 - mova m6, [pb_1] - pminub m0, m6 - pminub m5, m6 ; mv ? 1 : 0 - paddb m0, m0 ; nnz ? 2 : 0 - pmaxub m5, m0 - vextracti128 [bs1], m5, 1 - pshufb xm5, [transpose_shuf] - mova [bs0], xm5 + mova m2, [pb_1] + por m0, m1 + pminub m0, m2 + pminub m4, m2 ; mv ? 1 : 0 + paddb m0, m0 ; nnz ? 
2 : 0 + pmaxub m0, m4 + vextracti128 [bs1], m0, 1 + pshufb xm0, [transpose_shuf] + mova [bs0], xm0 + RET + +%macro LOAD_BYTES_ZMM 1 + vpermd m1, m6, [%1-12] + pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX +%endmacro ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX + +INIT_ZMM avx512 +cglobal deblock_strength, 5,5 + mova m6, [load_bytes_zmm_shuf] + shl r4d, 8 + add r4d, 3 - (1<<8) + vpbroadcastw m5, r4d + mov r4d, 0x34cc34cc ; {1,-1} * 11001100b + kmovb k1, r4d + vpbroadcastd m4, r4d + movifnidn t0d, r5m + psrld m7, m6, 4 + pxor xm3, xm3 + +.lists: + vbroadcasti64x2 m2, [mv+32] + vinserti64x2 m0, m2, [mv-32], 2 + vbroadcasti64x2 m1, [mv+ 0] + vinserti64x2 m0, m0, [mv- 4], 0 + vbroadcasti64x2 m1 {k1}, [mv+64] + vinserti64x2 m0, m0, [mv+60], 1 + psubw m0, m1 + vinserti64x2 m1, m1, [mv+28], 0 + vbroadcasti64x2 m2 {k1}, [mv+96] + vinserti64x2 m1, m1, [mv+92], 1 + psubw m1, m2 + packsswb m0, m1 + pabsb m0, m0 + psubusb m0, m5 + + LOAD_BYTES_ZMM ref + pmaddubsw m1, m4 ; E-F F-G G-H H-I ... + vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1 + add r1, 40 + add r2, 4*8*5 + dec t0d + jge .lists + + LOAD_BYTES_ZMM nnz + mova ym2, [pb_1] + vptestmw k1, m1, m1 + vptestmw k2, m3, m3 + vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 2 : 0 + vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0 + vextracti128 [bs1], ym0, 1 + pshufb xm0, [transpose_shuf] + mova [bs0], xm0 RET diff --git a/library/src/main/cpp/libx264/common/x86/mc-a.asm b/library/src/main/cpp/libx264/common/x86/mc-a.asm index f16f958..3c1d214 100644 --- a/library/src/main/cpp/libx264/common/x86/mc-a.asm +++ b/library/src/main/cpp/libx264/common/x86/mc-a.asm @@ -83,11 +83,11 @@ cextern deinterleave_shufd %endmacro %endif -%macro AVG_END 0 - lea t4, [t4+t5*2*SIZEOF_PIXEL] +%macro AVG_END 0-1 2 ; rows lea t2, [t2+t3*2*SIZEOF_PIXEL] + lea t4, [t4+t5*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] - sub eax, 2 + sub eax, %1 jg .height_loop RET %endmacro @@ -147,17 +147,24 @@ cextern deinterleave_shufd %endmacro %macro BIWEIGHT_START_SSSE3 0 - movzx t6d, byte r6m ; FIXME x86_64 - mov t7d, 64 - sub t7d, t6d - shl t7d, 8 - add t6d, t7d - mova m4, [pw_512] - movd xm3, t6d + movzx t6d, byte r6m ; FIXME x86_64 +%if mmsize > 16 + vbroadcasti128 m4, [pw_512] +%else + mova m4, [pw_512] +%endif + lea t7d, [t6+(64<<8)] + shl t6d, 8 + sub t7d, t6d +%if cpuflag(avx512) + vpbroadcastw m3, t7d +%else + movd xm3, t7d %if cpuflag(avx2) - vpbroadcastw m3, xm3 + vpbroadcastw m3, xm3 %else - SPLATW m3, m3 ; weight_dst,src + SPLATW m3, m3 ; weight_dst,src +%endif %endif %endmacro @@ -268,6 +275,66 @@ cglobal pixel_avg_weight_w16 mova [t0], xm0 vextracti128 [t0+t1], m0, 1 AVG_END + +INIT_YMM avx512 +cglobal pixel_avg_weight_w8 + BIWEIGHT_START + kxnorb k1, k1, k1 + kaddb k1, k1, k1 + AVG_START 5 +.height_loop: + movq xm0, [t2] + movq xm2, [t4] + movq xm1, [t2+t3] + movq xm5, [t4+t5] + lea t2, [t2+t3*2] + lea t4, [t4+t5*2] + vpbroadcastq m0 {k1}, [t2] + vpbroadcastq m2 {k1}, [t4] + vpbroadcastq m1 {k1}, [t2+t3] + vpbroadcastq m5 {k1}, [t4+t5] + punpcklbw m0, m2 + punpcklbw m1, m5 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + vextracti128 xmm1, m0, 1 + movq [t0], xm0 + movhps [t0+t1], xm0 + lea t0, [t0+t1*2] + movq [t0], xmm1 + movhps [t0+t1], xmm1 + AVG_END 4 + +INIT_ZMM avx512 +cglobal pixel_avg_weight_w16 + BIWEIGHT_START + AVG_START 5 +.height_loop: + movu xm0, [t2] + movu xm1, [t4] + vinserti128 ym0, [t2+t3], 1 + vinserti128 ym1, [t4+t5], 1 + lea t2, [t2+t3*2] + lea t4, [t4+t5*2] + vinserti32x4 m0, [t2], 2 + vinserti32x4 m1, [t4], 2 + 
vinserti32x4 m0, [t2+t3], 3 + vinserti32x4 m1, [t4+t5], 3 + SBUTTERFLY bw, 0, 1, 2 + pmaddubsw m0, m3 + pmaddubsw m1, m3 + pmulhrsw m0, m4 + pmulhrsw m1, m4 + packuswb m0, m1 + mova [t0], xm0 + vextracti128 [t0+t1], ym0, 1 + lea t0, [t0+t1*2] + vextracti32x4 [t0], m0, 2 + vextracti32x4 [t0+t1], m0, 3 + AVG_END 4 %endif ;HIGH_BIT_DEPTH ;============================================================================= @@ -738,6 +805,12 @@ INIT_XMM avx2 AVG_FUNC 16, movdqu, movdqa AVGH 16, 16 AVGH 16, 8 +INIT_XMM avx512 +AVGH 16, 16 +AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 %endif ;HIGH_BIT_DEPTH @@ -2125,7 +2198,7 @@ INIT_XMM sse2 MC_CHROMA INIT_XMM ssse3 MC_CHROMA_SSSE3 -INIT_XMM ssse3, cache64 +INIT_XMM cache64, ssse3 MC_CHROMA_SSSE3 INIT_XMM avx MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64 diff --git a/library/src/main/cpp/libx264/common/x86/mc-a2.asm b/library/src/main/cpp/libx264/common/x86/mc-a2.asm index 2e72b61..e93cfcc 100644 --- a/library/src/main/cpp/libx264/common/x86/mc-a2.asm +++ b/library/src/main/cpp/libx264/common/x86/mc-a2.asm @@ -30,18 +30,15 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 - -pw_1024: times 16 dw 1024 -filt_mul20: times 32 db 20 -filt_mul15: times 16 db 1, -5 -filt_mul51: times 16 db -5, 1 -hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 +SECTION_RODATA 64 %if HIGH_BIT_DEPTH -v210_mask: times 4 dq 0xc00ffc003ff003ff -v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 -v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 +v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma + db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20, + db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62 +v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00 +v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15 +v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14 ; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800 @@ -58,6 +55,13 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30 deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31 %endif ; !HIGH_BIT_DEPTH +pw_1024: times 16 dw 1024 +filt_mul20: times 32 db 20 +filt_mul15: times 16 db 1, -5 +filt_mul51: times 16 db -5, 1 +hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15 + +mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6 db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14 mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14 @@ -1044,8 +1048,8 @@ PLANE_COPY_CORE 1 %endif ; HIGH_BIT_DEPTH %endmacro -%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned - mova m0, [%3] +%macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned + mov%6 m0, [%3] %if mmsize == 32 pshufb m0, %5 vpermq m0, m0, q3120 @@ -1056,7 +1060,7 @@ PLANE_COPY_CORE 1 vextracti128 [%2], m0, 1 %endif %elif HIGH_BIT_DEPTH - mova m1, [%3+mmsize] + mov%6 m1, [%3+mmsize] psrld m2, m0, 16 psrld m3, m1, 16 pand m0, %5 @@ -1181,8 +1185,8 @@ cglobal store_interleave_chroma, 5,5 %macro PLANE_DEINTERLEAVE 0 ;----------------------------------------------------------------------------- -; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu, 
-; pixel *dstv, intptr_t i_dstv, +; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta, +; pixel *dstb, intptr_t i_dstb, ; pixel *src, intptr_t i_src, int w, int h ) ;----------------------------------------------------------------------------- %if ARCH_X86_64 @@ -1400,43 +1404,64 @@ cglobal plane_copy_deinterleave_v210, 7,7,7 %define org_w r6m %define h dword r7m %endif - FIX_STRIDES r1, r3, r6d - shl r5, 2 - add r0, r6 - add r2, r6 - neg r6 - mov src, r4 - mov org_w, r6 - mova m2, [v210_mask] - mova m3, [v210_luma_shuf] - mova m4, [v210_chroma_shuf] - mova m5, [v210_mult] ; also functions as vpermd index for avx2 - pshufd m6, m5, q1102 - + FIX_STRIDES r1, r3, r6d + shl r5, 2 + add r0, r6 + add r2, r6 + neg r6 + mov src, r4 + mov org_w, r6 +%if cpuflag(avx512) + vpbroadcastd m2, [v210_mask] + vpbroadcastd m3, [v210_shuf_avx512] + psrlw m3, 6 ; dw 0, 4 + mova m4, [v210_shuf_avx512] ; luma + psrlw m5, m4, 8 ; chroma +%else +%if mmsize == 32 + vbroadcasti128 m2, [v210_mask] + vbroadcasti128 m3, [v210_luma_shuf] + vbroadcasti128 m4, [v210_chroma_shuf] +%else + mova m2, [v210_mask] + mova m3, [v210_luma_shuf] + mova m4, [v210_chroma_shuf] +%endif + mova m5, [v210_mult] ; also functions as vpermd index for avx2 + pshufd m6, m5, q1102 +%endif ALIGN 16 .loop: - movu m1, [r4] - pandn m0, m2, m1 - pand m1, m2 - pshufb m0, m3 - pshufb m1, m4 - pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ - pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ + movu m1, [r4] + pandn m0, m2, m1 + pand m1, m2 +%if cpuflag(avx512) + psrld m0, 10 + vpsrlvw m1, m3 + mova m6, m0 + vpermt2w m0, m4, m1 + vpermt2w m1, m5, m6 +%else + pshufb m0, m3 + pshufb m1, m4 + pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __ + pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __ %if mmsize == 32 - vpermd m0, m5, m0 - vpermd m1, m5, m1 + vpermd m0, m5, m0 + vpermd m1, m5, m1 +%endif %endif - movu [r0+r6], m0 - movu [r2+r6], m1 - add r4, mmsize - add r6, 3*mmsize/4 + movu [r0+r6], m0 + movu [r2+r6], m1 + add r4, mmsize + add r6, mmsize*3/4 jl .loop - add r0, r1 - add r2, r3 - add src, r5 - mov r4, src - mov r6, org_w - dec h + add r0, r1 + add r2, r3 + add src, r5 + mov r4, src + mov r6, org_w + dec h jg .loop RET %endmacro ; PLANE_DEINTERLEAVE_V210 @@ -1461,6 +1486,8 @@ PLANE_DEINTERLEAVE_V210 INIT_YMM avx2 LOAD_DEINTERLEAVE_CHROMA PLANE_DEINTERLEAVE_V210 +INIT_ZMM avx512 +PLANE_DEINTERLEAVE_V210 %else INIT_XMM sse2 PLANE_DEINTERLEAVE_RGB @@ -1473,82 +1500,85 @@ LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 PLANE_DEINTERLEAVE_RGB %endif -; These functions are not general-use; not only do the SSE ones require aligned input, -; but they also will fail if given a non-mod16 size. -; memzero SSE will fail for non-mod128. +; These functions are not general-use; not only do they require aligned input, but memcpy +; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128. 
;----------------------------------------------------------------------------- ; void *memcpy_aligned( void *dst, const void *src, size_t n ); ;----------------------------------------------------------------------------- %macro MEMCPY 0 cglobal memcpy_aligned, 3,3 -%if mmsize == 16 +%if mmsize == 32 test r2d, 16 - jz .copy2 - mova m0, [r1+r2-16] - mova [r0+r2-16], m0 + jz .copy32 + mova xm0, [r1+r2-16] + mova [r0+r2-16], xm0 sub r2d, 16 -.copy2: -%endif - test r2d, 2*mmsize - jz .copy4start + jle .ret +.copy32: +%endif + test r2d, mmsize + jz .loop + mova m0, [r1+r2-mmsize] + mova [r0+r2-mmsize], m0 + sub r2d, mmsize + jle .ret +.loop: mova m0, [r1+r2-1*mmsize] mova m1, [r1+r2-2*mmsize] mova [r0+r2-1*mmsize], m0 mova [r0+r2-2*mmsize], m1 sub r2d, 2*mmsize -.copy4start: - test r2d, r2d - jz .ret -.copy4: - mova m0, [r1+r2-1*mmsize] - mova m1, [r1+r2-2*mmsize] - mova m2, [r1+r2-3*mmsize] - mova m3, [r1+r2-4*mmsize] - mova [r0+r2-1*mmsize], m0 - mova [r0+r2-2*mmsize], m1 - mova [r0+r2-3*mmsize], m2 - mova [r0+r2-4*mmsize], m3 - sub r2d, 4*mmsize - jg .copy4 + jg .loop .ret: - REP_RET + RET %endmacro -INIT_MMX mmx -MEMCPY -INIT_XMM sse -MEMCPY - ;----------------------------------------------------------------------------- ; void *memzero_aligned( void *dst, size_t n ); ;----------------------------------------------------------------------------- -%macro MEMZERO 1 +%macro MEMZERO 0 cglobal memzero_aligned, 2,2 - add r0, r1 - neg r1 -%if mmsize == 8 - pxor m0, m0 -%else xorps m0, m0 -%endif .loop: -%assign i 0 -%rep %1 - mova [r0 + r1 + i], m0 -%assign i i+mmsize +%assign %%i mmsize +%rep 128 / mmsize + movaps [r0 + r1 - %%i], m0 +%assign %%i %%i+mmsize %endrep - add r1, mmsize*%1 - jl .loop + sub r1d, 128 + jg .loop RET %endmacro -INIT_MMX mmx -MEMZERO 8 INIT_XMM sse -MEMZERO 8 +MEMCPY +MEMZERO INIT_YMM avx -MEMZERO 4 +MEMCPY +MEMZERO +INIT_ZMM avx512 +MEMZERO + +cglobal memcpy_aligned, 3,4 + dec r2d ; offset of the last byte + rorx r3d, r2d, 2 + and r2d, ~63 + and r3d, 15 ; n = number of dwords minus one to copy in the tail + mova m0, [r1+r2] + not r3d ; bits 0-4: (n^15)+16, bits 16-31: 0xffff + shrx r3d, r3d, r3d ; 0xffff >> (n^15) + kmovw k1, r3d ; (1 << (n+1)) - 1 + vmovdqa32 [r0+r2] {k1}, m0 + sub r2d, 64 + jl .ret +.loop: + mova m0, [r1+r2] + mova [r0+r2], m0 + sub r2d, 64 + jge .loop +.ret: + RET %if HIGH_BIT_DEPTH == 0 ;----------------------------------------------------------------------------- @@ -2147,13 +2177,13 @@ MBTREE cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) vbroadcastss m5, [r5] mov r5d, r6m - lea r0, [r0+r5*2] + lea r2, [r2+r5*2] add r5d, r5d - add r1, r5 - add r2, r5 - add r3, r5 add r4, r5 neg r5 + sub r1, r5 + sub r3, r5 + sub r0, r5 mova xm4, [pw_3fff] %if notcpuflag(avx2) pxor xm7, xm7 @@ -2165,9 +2195,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) pmovzxwd m2, [r1+r5] ; prop pand xm3, xm4, [r3+r5] ; inter pmovzxwd m3, xm3 - pminsd m3, m0 pmaddwd m1, m0 - psubd m3, m0, m3 + psubusw m3, m0, m3 cvtdq2ps m0, m0 cvtdq2ps m1, m1 cvtdq2ps m2, m2 @@ -2184,7 +2213,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) movu xm1, [r4+r5] movu xm2, [r1+r5] pand xm3, xm4, [r3+r5] - pminsw xm3, xm0 + psubusw xm3, xm0, xm3 INT16_UNPACK 0 INT16_UNPACK 1 INT16_UNPACK 2 @@ -2194,7 +2223,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) cvtdq2ps m2, m2 cvtdq2ps m3, m3 mulps m1, m0 - subps m3, m0, m3 mulps m1, m5 ; intra*invq*fps_factor>>8 addps m1, m2 ; prop + (intra*invq*fps_factor>>8) rcpps m2, m0 ; 1 / intra 1st approximation @@ -2205,7 +2233,7 @@ 
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2) subps m2, m0 ; 2nd approximation for 1/intra mulps m1, m2 ; / intra %endif - vcvtps2dq m1, m1 + cvtps2dq m1, m1 vextractf128 xm2, m1, 1 packssdw xm1, xm2 mova [r0+r5], xm1 @@ -2219,6 +2247,39 @@ MBTREE_AVX INIT_YMM avx2 MBTREE_AVX +INIT_ZMM avx512 +cglobal mbtree_propagate_cost, 6,6 + vbroadcastss m5, [r5] + mov r5d, 0x3fff3fff + vpbroadcastd ym4, r5d + mov r5d, r6m + lea r2, [r2+r5*2] + add r5d, r5d + add r1, r5 + neg r5 + sub r4, r5 + sub r3, r5 + sub r0, r5 +.loop: + pmovzxwd m0, [r2+r5] ; intra + pmovzxwd m1, [r1+r5] ; prop + pmovzxwd m2, [r4+r5] ; invq + pand ym3, ym4, [r3+r5] ; inter + pmovzxwd m3, ym3 + psubusw m3, m0, m3 + cvtdq2ps m0, m0 + cvtdq2ps m1, m1 + cvtdq2ps m2, m2 + cvtdq2ps m3, m3 + vdivps m1, m0, {rn-sae} + fmaddps m1, m2, m5, m1 + mulps m1, m3 + cvtps2dq m1, m1 + vpmovsdw [r0+r5], m1 + add r5, 32 + jl .loop + RET + %macro MBTREE_PROPAGATE_LIST 0 ;----------------------------------------------------------------------------- ; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs, @@ -2372,6 +2433,112 @@ cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8 jl .loop RET +%if ARCH_X86_64 +;----------------------------------------------------------------------------- +; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, +; uint16_t *lowres_costs, int bipred_weight, int mb_y, +; int width, int height, int stride, int list_mask ); +;----------------------------------------------------------------------------- +INIT_ZMM avx512 +cglobal mbtree_propagate_list_internal, 5,7,21 + mova xm16, [pw_0xc000] + vpbroadcastw xm17, r5m ; bipred_weight << 9 + vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT) + vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf] + vbroadcasti32x8 m6, [pd_0123] + vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y + vbroadcasti128 m7, [pd_8] + vbroadcasti128 m8, [pw_31] + vbroadcasti128 m9, [pw_32] + psllw m10, m9, 4 + pcmpeqw ym19, ym19 ; pw_m1 + vpbroadcastw ym20, r7m ; width + psrld m11, m7, 3 ; pd_1 + psrld m12, m8, 16 ; pd_31 + vpbroadcastd m13, r8m ; height + vpbroadcastd m14, r9m ; stride + pslld m15, m14, 16 + por m15, m11 ; {1, stride, 1, stride} ... 
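; [Editor's note, not part of the patch] Rough scalar model of what one loop
; iteration below contributes for a single macroblock: the MV selects a 2x2
; neighbourhood of ref_costs cells and the propagate amount is split between
; them with bilinear weights taken from the fractional MV position (the MV is
; roughly in 1/32-macroblock units, hence the >>5 / &31 in the loop). The real
; kernel additionally applies the bipred weighting, the width/height masking
; and saturating adds. Illustrative C (assumed helper, in-range coordinates,
; no clipping):
;
;   #include <stdint.h>
;
;   static void propagate_one( uint16_t *ref_costs, int stride,
;                              int mbx0, int mby0, int x, int y, int amount )
;   {
;       int mbx = mbx0 + (x >> 5), mby = mby0 + (y >> 5);
;       x &= 31; y &= 31;
;       int w00 = (32-x)*(32-y), w10 = x*(32-y), w01 = (32-x)*y, w11 = x*y;
;       ref_costs[ mby   *stride + mbx  ] += (amount*w00 + 512) >> 10;
;       ref_costs[ mby   *stride + mbx+1] += (amount*w10 + 512) >> 10;
;       ref_costs[(mby+1)*stride + mbx  ] += (amount*w01 + 512) >> 10;
;       ref_costs[(mby+1)*stride + mbx+1] += (amount*w11 + 512) >> 10;
;   }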
+ lea r4, [r4+2*r0] ; lowres_costs + lea r3, [r3+2*r0] ; propagate_amount + lea r2, [r2+4*r0] ; mvs + neg r0 + mov r6d, 0x5555ffff + kmovd k4, r6d + kshiftrd k5, k4, 16 ; 0x5555 + kshiftlw k6, k4, 8 ; 0xff00 +.loop: + vbroadcasti128 ym1, [r4+2*r0] + mova xm4, [r3+2*r0] + vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3) + vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6 + vptestmw k1, ym1, ym18 + vpermw m4, m5, m4 + + vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy} + psraw m0, m3, 5 + paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y} + paddd m6, m7 ; i_mb_x += 8 + pand m3, m8 ; {x, y} + vprold m1, m3, 20 ; {y, x} << 4 + psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y} + psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4 + pmullw m3, m1 + paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000) + pmulhrsw m2, m3, m4 ; idx01weight idx23weightp + + pslld ym1, ym0, 16 + psubw ym1, ym19 + vmovdqu16 ym1 {k5}, ym0 + vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width + kunpckwd k2, k2, k2 + psrad m1, m0, 16 + paddd m1 {k6}, m11 + vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height + + pmaddwd m0, m15 + paddd m0 {k6}, m14 ; idx0 | idx2 + vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight + vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes + + ; We're handling dwords, but the offsets are in words so there may be partial overlaps. + ; We can work around this by handling dword-aligned and -unaligned offsets separately. + vptestmd k0, m0, m11 + kandnw k2, k0, k1 ; dword-aligned offsets + kmovw k3, k2 + vpgatherdd m3 {k2}, [r1+2*m0] + + ; If there are conflicts in the offsets we have to handle them before storing the results. + ; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel + ; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets. 
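; [Editor's note, not part of the patch] The target behaviour is the trivial
; scalar scatter-add, where lanes that hit the same ref_costs word simply
; accumulate; a gather/scatter pair keeps only one lane per address, which is
; why the vpconflictd/vplzcntd merge below sums duplicate lanes first. Hedged
; C sketch (illustrative names; the real update also saturates the 16-bit add):
;
;   #include <stdint.h>
;
;   static void scatter_add16( uint16_t *ref_costs, const uint32_t *offset,
;                              const uint16_t *weight, int n )
;   {
;       for( int i = 0; i < n; i++ )
;           ref_costs[offset[i]] += weight[i];   /* duplicates just add up */
;   }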
+ vpconflictd m4, m0 + vpbroadcastmw2d m1, k1 + vptestmd k2, m1, m4 + ktestw k2, k2 + jz .no_conflicts + pand m1, m4 ; mask away unused offsets to avoid false positives + vplzcntd m1, m1 + pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb +.conflict_loop: + vpermd m4 {k2}{z}, m1, m2 + vpermd m1 {k2}, m1, m1 ; shift the index one step forward + paddsw m2, m4 ; add the weights of conflicting offsets + vpcmpd k2, m1, m12, 2 + ktestw k2, k2 + jnz .conflict_loop +.no_conflicts: + paddsw m3, m2 + vpscatterdd [r1+2*m0] {k3}, m3 + kandw k1, k0, k1 ; dword-unaligned offsets + kmovw k2, k1 + vpgatherdd m1 {k1}, [r1+2*m0] + paddsw m1, m2 ; all conflicts have already been resolved + vpscatterdd [r1+2*m0] {k2}, m1 + add r0, 8 + jl .loop + RET +%endif + %macro MBTREE_FIX8 0 ;----------------------------------------------------------------------------- ; void mbtree_fix8_pack( uint16_t *dst, float *src, int count ) diff --git a/library/src/main/cpp/libx264/common/x86/mc-c.c b/library/src/main/cpp/libx264/common/x86/mc-c.c index 3258381..c06691c 100644 --- a/library/src/main/cpp/libx264/common/x86/mc-c.c +++ b/library/src/main/cpp/libx264/common/x86/mc-c.c @@ -32,7 +32,8 @@ void func##_mmx2 args;\ void func##_sse2 args;\ void func##_ssse3 args;\ - void func##_avx2 args; + void func##_avx2 args;\ + void func##_avx512 args; DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int )) @@ -99,17 +100,17 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst, void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst, pixel *srcu, intptr_t i_srcu, pixel *srcv, intptr_t i_srcv, int w, int h ); -void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu, - pixel *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu, - uint8_t *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta, + uint8_t *dstb, intptr_t i_dstb, uint8_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta, + uint16_t *dstb, intptr_t i_dstb, uint16_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_avx2( pixel *dstu, intptr_t i_dstu, - pixel *dstv, intptr_t i_dstv, +void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta, + pixel *dstb, intptr_t i_dstb, pixel *src, intptr_t i_src, int w, int h ); void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, @@ -123,15 +124,18 @@ void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb, pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h ); -void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, - uint32_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu, - uint16_t *dstv, intptr_t i_dstv, - uint32_t *src, intptr_t i_src, int w, int h ); -void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu, - uint16_t 
*dstv, intptr_t i_dstv, - uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); +void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu, + uint16_t *dstv, intptr_t i_dstv, + uint32_t *src, intptr_t i_src, int w, int h ); void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height ); @@ -143,11 +147,12 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height ); -void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n ); -void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n ); -void x264_memzero_aligned_mmx( void *dst, size_t n ); -void x264_memzero_aligned_sse( void *dst, size_t n ); -void x264_memzero_aligned_avx( void *dst, size_t n ); +void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n ); +void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n ); +void x264_memzero_aligned_sse ( void *dst, size_t n ); +void x264_memzero_aligned_avx ( void *dst, size_t n ); +void x264_memzero_aligned_avx512( void *dst, size_t n ); void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride ); void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride ); @@ -160,14 +165,16 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride ); void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride ); -void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); -void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, - uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, 
uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); +void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs, + uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len ); void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count ); void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count ); void x264_mbtree_fix8_unpack_ssse3( float *dst, uint16_t *src, int count ); @@ -179,7 +186,7 @@ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src, MC_CHROMA(mmx2) MC_CHROMA(sse2) MC_CHROMA(ssse3) -MC_CHROMA(ssse3_cache64) +MC_CHROMA(cache64_ssse3) MC_CHROMA(avx) MC_CHROMA(avx2) @@ -498,6 +505,15 @@ PLANE_COPY(32, avx) PLANE_COPY_SWAP(16, ssse3) PLANE_COPY_SWAP(32, avx2) +#if HIGH_BIT_DEPTH +PLANE_COPY_YUYV(64, sse2) +PLANE_COPY_YUYV(64, avx) +#else +PLANE_COPY_YUYV(32, sse2) +PLANE_COPY_YUYV(32, ssse3) +#endif +PLANE_COPY_YUYV(64, avx2) + PLANE_INTERLEAVE(mmx2) PLANE_INTERLEAVE(sse2) #if HIGH_BIT_DEPTH @@ -538,6 +554,21 @@ PROPAGATE_LIST(ssse3) PROPAGATE_LIST(avx) PROPAGATE_LIST(avx2) +#if ARCH_X86_64 +void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount, + uint16_t *lowres_costs, int bipred_weight, int mb_y, + int width, int height, int stride, int list_mask ); + +static void x264_mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2], + int16_t *propagate_amount, uint16_t *lowres_costs, + int bipred_weight, int mb_y, int len, int list ) +{ + x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9, + mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride, + (1 << LOWRES_COST_SHIFT) << list ); +} +#endif + void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) { if( !(cpu&X264_CPU_MMX) ) @@ -547,8 +578,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx; pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx; pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx; - pf->memcpy_aligned = x264_memcpy_aligned_mmx; - pf->memzero_aligned = x264_memzero_aligned_mmx; pf->integral_init4v = x264_integral_init4v_mmx; pf->integral_init8v = x264_integral_init8v_mmx; @@ -606,6 +635,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->plane_copy_interleave = x264_plane_copy_interleave_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2; if( cpu&X264_CPU_SSE2_IS_FAST ) { @@ -661,6 +691,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx; pf->plane_copy_interleave = x264_plane_copy_interleave_avx; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx; pf->plane_copy_deinterleave_v210 = 
x264_plane_copy_deinterleave_v210_avx; pf->store_interleave_chroma = x264_store_interleave_chroma_avx; pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx; @@ -677,6 +708,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2; pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2; } + + if( cpu&X264_CPU_AVX512 ) + { + pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512; + } #else // !HIGH_BIT_DEPTH #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead @@ -702,6 +738,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->hpel_filter = x264_hpel_filter_sse2_amd; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2; @@ -763,6 +800,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3; pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_ssse3; } if( !(cpu&X264_CPU_SLOW_PALIGNR) ) @@ -779,7 +817,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) if( cpu&X264_CPU_CACHELINE_64 ) { if( !(cpu&X264_CPU_STACK_MOD4) ) - pf->mc_chroma = x264_mc_chroma_ssse3_cache64; + pf->mc_chroma = x264_mc_chroma_cache64_ssse3; pf->mc_luma = mc_luma_cache64_ssse3; pf->get_ref = get_ref_cache64_ssse3; if( cpu&X264_CPU_SLOW_ATOM ) @@ -828,10 +866,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2; pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2; } + + if( cpu&X264_CPU_AVX512 ) + { + pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512; + pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512; + pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512; + pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512; + pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512; + } #endif // HIGH_BIT_DEPTH if( !(cpu&X264_CPU_AVX) ) return; + pf->memcpy_aligned = x264_memcpy_aligned_avx; pf->memzero_aligned = x264_memzero_aligned_avx; pf->plane_copy = x264_plane_copy_avx; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx; @@ -844,10 +892,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf ) return; pf->plane_copy_swap = x264_plane_copy_swap_avx2; pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2; + pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx2; pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2; pf->get_ref = get_ref_avx2; pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2; pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2; pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2; pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2; + + if( !(cpu&X264_CPU_AVX512) ) + return; + pf->memcpy_aligned = x264_memcpy_aligned_avx512; + pf->memzero_aligned = x264_memzero_aligned_avx512; + pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512; +#if ARCH_X86_64 + 
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx512; +#endif } diff --git a/library/src/main/cpp/libx264/common/x86/pixel-a.asm b/library/src/main/cpp/libx264/common/x86/pixel-a.asm index 0dfe61d..1ce26b9 100644 --- a/library/src/main/cpp/libx264/common/x86/pixel-a.asm +++ b/library/src/main/cpp/libx264/common/x86/pixel-a.asm @@ -32,6 +32,8 @@ %include "x86util.asm" SECTION_RODATA 32 +var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1 + db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1 hmul_16p: times 16 db 1 times 8 db 1, -1 hmul_8p: times 8 db 1 @@ -701,25 +703,32 @@ SSD_NV12 %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] -%elif mmsize < 32 +%elif mmsize == 16 pxor m7, m7 ; zero %endif %endif ; !HIGH_BIT_DEPTH %endmacro -%macro VAR_END 2 -%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256 - HADDUW m5, m2 -%else - HADDW m5, m2 +%macro VAR_END 0 + pmaddwd m5, [pw_1] + SBUTTERFLY dq, 5, 6, 0 + paddd m5, m6 +%if mmsize == 32 + vextracti128 xm6, m5, 1 + paddd xm5, xm6 %endif - HADDD m6, m1 + MOVHL xm6, xm5 + paddd xm5, xm6 %if ARCH_X86_64 - punpckldq m5, m6 - movq rax, m5 + movq rax, xm5 +%else + movd eax, xm5 +%if cpuflag(avx) + pextrd edx, xm5, 1 %else - movd eax, m5 - movd edx, m6 + pshuflw xm5, xm5, q1032 + movd edx, xm5 +%endif %endif RET %endmacro @@ -739,61 +748,25 @@ SSD_NV12 paddd m6, m4 %endmacro -%macro VAR_2ROW 2 - mov r2d, %2 -.loop: -%if HIGH_BIT_DEPTH - mova m0, [r0] - mova m1, [r0+mmsize] - mova m3, [r0+%1] - mova m4, [r0+%1+mmsize] -%else ; !HIGH_BIT_DEPTH - mova m0, [r0] - mova m3, [r0+%1] - punpckhbw m1, m0, m7 - punpcklbw m0, m7 - punpckhbw m4, m3, m7 - punpcklbw m3, m7 -%endif ; HIGH_BIT_DEPTH -%ifidn %1, r1 - lea r0, [r0+%1*2] -%else - add r0, r1 -%endif - VAR_CORE - dec r2d - jg .loop -%endmacro - ;----------------------------------------------------------------------------- ; int pixel_var_wxh( uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -INIT_MMX mmx2 -cglobal pixel_var_16x16, 2,3 - FIX_STRIDES r1 - VAR_START 0 - VAR_2ROW 8*SIZEOF_PIXEL, 16 - VAR_END 16, 16 - -cglobal pixel_var_8x16, 2,3 - FIX_STRIDES r1 - VAR_START 0 - VAR_2ROW r1, 8 - VAR_END 8, 16 - -cglobal pixel_var_8x8, 2,3 - FIX_STRIDES r1 - VAR_START 0 - VAR_2ROW r1, 4 - VAR_END 8, 8 - %if HIGH_BIT_DEPTH %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 FIX_STRIDES r1 VAR_START 0 - VAR_2ROW r1, 8 - VAR_END 16, 16 + mov r2d, 8 +.loop: + mova m0, [r0] + mova m1, [r0+mmsize] + mova m3, [r0+r1] + mova m4, [r0+r1+mmsize] + lea r0, [r0+r1*2] + VAR_CORE + dec r2d + jg .loop + VAR_END cglobal pixel_var_8x8, 2,3,8 lea r2, [r1*3] @@ -809,18 +782,16 @@ cglobal pixel_var_8x8, 2,3,8 mova m3, [r0+r1*4] mova m4, [r0+r2*2] VAR_CORE - VAR_END 8, 8 + VAR_END %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR -INIT_XMM xop -VAR -%endif ; HIGH_BIT_DEPTH -%if HIGH_BIT_DEPTH == 0 +%else ; HIGH_BIT_DEPTH == 0 + %macro VAR 0 cglobal pixel_var_16x16, 2,3,8 VAR_START 1 @@ -833,7 +804,7 @@ cglobal pixel_var_16x16, 2,3,8 VAR_CORE dec r2d jg .loop - VAR_END 16, 16 + VAR_END cglobal pixel_var_8x8, 2,4,8 VAR_START 1 @@ -849,7 +820,7 @@ cglobal pixel_var_8x8, 2,4,8 VAR_CORE dec r2d jg .loop - VAR_END 8, 8 + VAR_END cglobal pixel_var_8x16, 2,4,8 VAR_START 1 @@ -865,15 +836,13 @@ cglobal pixel_var_8x16, 2,4,8 VAR_CORE dec r2d jg .loop - VAR_END 8, 16 + VAR_END %endmacro ; VAR INIT_XMM sse2 VAR INIT_XMM avx VAR -INIT_XMM xop -VAR %endif ; !HIGH_BIT_DEPTH INIT_YMM avx2 @@ -898,209 +867,357 @@ cglobal pixel_var_16x16, 2,4,7 VAR_CORE dec r2d jg .loop - vextracti128 xm0, 
m5, 1 - vextracti128 xm1, m6, 1 - paddw xm5, xm0 - paddd xm6, xm1 - HADDW xm5, xm2 - HADDD xm6, xm1 -%if ARCH_X86_64 - punpckldq xm5, xm6 - movq rax, xm5 + VAR_END + +%macro VAR_AVX512_CORE 1 ; accum +%if %1 + paddw m0, m2 + pmaddwd m2, m2 + paddw m0, m3 + pmaddwd m3, m3 + paddd m1, m2 + paddd m1, m3 %else - movd eax, xm5 - movd edx, xm6 + paddw m0, m2, m3 + pmaddwd m2, m2 + pmaddwd m3, m3 + paddd m1, m2, m3 %endif - RET +%endmacro -%macro VAR2_END 3 - HADDW %2, xm1 - movd r1d, %2 - imul r1d, r1d - HADDD %3, xm1 - shr r1d, %1 - movd eax, %3 - movd [r4], %3 - sub eax, r1d ; sqr - (sum * sum >> shift) - RET +%macro VAR_AVX512_CORE_16x16 1 ; accum +%if HIGH_BIT_DEPTH + mova ym2, [r0] + vinserti64x4 m2, [r0+r1], 1 + mova ym3, [r0+2*r1] + vinserti64x4 m3, [r0+r3], 1 +%else + vbroadcasti64x2 ym2, [r0] + vbroadcasti64x2 m2 {k1}, [r0+r1] + vbroadcasti64x2 ym3, [r0+2*r1] + vbroadcasti64x2 m3 {k1}, [r0+r3] + pshufb m2, m4 + pshufb m3, m4 +%endif + VAR_AVX512_CORE %1 %endmacro -;----------------------------------------------------------------------------- -; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * ) -;----------------------------------------------------------------------------- -%macro VAR2_8x8_MMX 2 -cglobal pixel_var2_8x%1, 5,6 - FIX_STRIDES r1, r3 - VAR_START 0 - mov r5d, %1 -.loop: +%macro VAR_AVX512_CORE_8x8 1 ; accum %if HIGH_BIT_DEPTH - mova m0, [r0] - mova m1, [r0+mmsize] - psubw m0, [r2] - psubw m1, [r2+mmsize] -%else ; !HIGH_BIT_DEPTH - movq m0, [r0] - movq m1, m0 - movq m2, [r2] - movq m3, m2 - punpcklbw m0, m7 - punpckhbw m1, m7 - punpcklbw m2, m7 - punpckhbw m3, m7 - psubw m0, m2 - psubw m1, m3 -%endif ; HIGH_BIT_DEPTH - paddw m5, m0 - paddw m5, m1 - pmaddwd m0, m0 - pmaddwd m1, m1 - paddd m6, m0 - paddd m6, m1 - add r0, r1 - add r2, r3 - dec r5d - jg .loop - VAR2_END %2, m5, m6 + mova xm2, [r0] + mova xm3, [r0+r1] +%else + movq xm2, [r0] + movq xm3, [r0+r1] +%endif + vinserti128 ym2, [r0+2*r1], 1 + vinserti128 ym3, [r0+r2], 1 + lea r0, [r0+4*r1] + vinserti32x4 m2, [r0], 2 + vinserti32x4 m3, [r0+r1], 2 + vinserti32x4 m2, [r0+2*r1], 3 + vinserti32x4 m3, [r0+r2], 3 +%if HIGH_BIT_DEPTH == 0 + punpcklbw m2, m4 + punpcklbw m3, m4 +%endif + VAR_AVX512_CORE %1 %endmacro +INIT_ZMM avx512 +cglobal pixel_var_16x16, 2,4 + FIX_STRIDES r1 + mov r2d, 0xf0 + lea r3, [3*r1] +%if HIGH_BIT_DEPTH == 0 + vbroadcasti64x4 m4, [var_shuf_avx512] + kmovb k1, r2d +%endif + VAR_AVX512_CORE_16x16 0 +.loop: + lea r0, [r0+4*r1] + VAR_AVX512_CORE_16x16 1 + sub r2d, 0x50 + jg .loop %if ARCH_X86_64 == 0 -INIT_MMX mmx2 -VAR2_8x8_MMX 8, 6 -VAR2_8x8_MMX 16, 7 + pop r3d + %assign regs_used 3 +%endif +var_avx512_end: + vbroadcasti32x4 m2, [pw_1] + pmaddwd m0, m2 + SBUTTERFLY dq, 0, 1, 2 + paddd m0, m1 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 +%if ARCH_X86_64 + movq rax, xmm0 +%else + movd eax, xmm0 + pextrd edx, xmm0, 1 %endif + RET + +%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth +cglobal pixel_var_8x8, 2,3 + lea r2, [3*r1] + pxor xm4, xm4 + VAR_AVX512_CORE_8x8 0 + jmp var_avx512_end +%endif + +cglobal pixel_var_8x16, 2,3 + FIX_STRIDES r1 + lea r2, [3*r1] +%if HIGH_BIT_DEPTH == 0 + pxor xm4, xm4 +%endif + VAR_AVX512_CORE_8x8 0 + lea r0, [r0+4*r1] + VAR_AVX512_CORE_8x8 1 + jmp var_avx512_end + +;----------------------------------------------------------------------------- +; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] ) 
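;
; [Editor's note, not part of the patch] The reworked interface reads the fixed-
; stride fenc/fdec blocks, writes the per-plane SSDs to ssd[2], and returns the
; summed U+V residual variance. Per plane the kernels compute the quantity
; noted in the VAR2_END comment ("sqr - (sum * sum >> shift)"), roughly as in
; this illustrative 8-bit sketch (U/V memory layout elided):
;
;   static int var2_plane( const uint8_t *fenc, int fenc_stride,
;                          const uint8_t *fdec, int fdec_stride,
;                          int h, int shift, int *ssd )
;   {
;       int sum = 0, sqr = 0;
;       for( int y = 0; y < h; y++, fenc += fenc_stride, fdec += fdec_stride )
;           for( int x = 0; x < 8; x++ )
;           {
;               int d = fenc[x] - fdec[x];
;               sum += d;
;               sqr += d * d;
;           }
;       *ssd = sqr;
;       return sqr - (sum * sum >> shift);   /* shift: 6 for 8x8, 7 for 8x16 */
;   }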
+;----------------------------------------------------------------------------- + +%if ARCH_X86_64 + DECLARE_REG_TMP 6 +%else + DECLARE_REG_TMP 2 +%endif + +%macro VAR2_END 3 ; src, tmp, shift + movifnidn r2, r2mp + pshufd %2, %1, q3331 + pmuludq %1, %1 + movq [r2], %2 ; sqr_u sqr_v + psrld %1, %3 + psubd %2, %1 ; sqr - (sum * sum >> shift) + MOVHL %1, %2 + paddd %1, %2 + movd eax, %1 + RET +%endmacro %macro VAR2_8x8_SSE2 2 -cglobal pixel_var2_8x%1, 5,6,8 - VAR_START 1 - mov r5d, %1/2 +%if HIGH_BIT_DEPTH +cglobal pixel_var2_8x%1, 2,3,6 + pxor m4, m4 + pxor m5, m5 +%define %%sum2 m4 +%define %%sqr2 m5 +%else +cglobal pixel_var2_8x%1, 2,3,7 + mova m6, [pw_00ff] +%define %%sum2 m0 +%define %%sqr2 m1 +%endif + pxor m0, m0 ; sum + pxor m1, m1 ; sqr + mov t0d, (%1-1)*FENC_STRIDEB .loop: %if HIGH_BIT_DEPTH - mova m0, [r0] - mova m1, [r0+r1*2] - mova m2, [r2] - mova m3, [r2+r3*2] -%else ; !HIGH_BIT_DEPTH - movq m1, [r0] - movhps m1, [r0+r1] - movq m3, [r2] - movhps m3, [r2+r3] - DEINTB 0, 1, 2, 3, 7 -%endif ; HIGH_BIT_DEPTH - psubw m0, m2 - psubw m1, m3 - paddw m5, m0 - paddw m5, m1 - pmaddwd m0, m0 - pmaddwd m1, m1 - paddd m6, m0 - paddd m6, m1 - lea r0, [r0+r1*2*SIZEOF_PIXEL] - lea r2, [r2+r3*2*SIZEOF_PIXEL] - dec r5d - jg .loop - VAR2_END %2, m5, m6 + mova m2, [r0+1*t0] + psubw m2, [r1+2*t0] + mova m3, [r0+1*t0+16] + psubw m3, [r1+2*t0+32] +%else + mova m3, [r0+1*t0] + movq m5, [r1+2*t0] + punpcklqdq m5, [r1+2*t0+16] + DEINTB 2, 3, 4, 5, 6 + psubw m2, m4 + psubw m3, m5 +%endif + paddw m0, m2 + pmaddwd m2, m2 + paddw %%sum2, m3 + pmaddwd m3, m3 + paddd m1, m2 + paddd %%sqr2, m3 + sub t0d, FENC_STRIDEB + jge .loop +%if HIGH_BIT_DEPTH + SBUTTERFLY dq, 0, 4, 2 + paddw m0, m4 ; sum_u sum_v + pmaddwd m0, [pw_1] + SBUTTERFLY dq, 1, 5, 2 + paddd m1, m5 ; sqr_u sqr_v + SBUTTERFLY dq, 0, 1, 2 + paddd m0, m1 +%else + pmaddwd m0, [pw_1] + shufps m2, m0, m1, q2020 + shufps m0, m1, q3131 + paddd m0, m2 + pshufd m0, m0, q3120 ; sum_u sqr_u sum_v sqr_v +%endif + VAR2_END m0, m1, %2 %endmacro INIT_XMM sse2 VAR2_8x8_SSE2 8, 6 VAR2_8x8_SSE2 16, 7 +%macro VAR2_CORE 3 ; src1, src2, accum +%if %3 + paddw m0, %1 + pmaddwd %1, %1 + paddw m0, %2 + pmaddwd %2, %2 + paddd m1, %1 + paddd m1, %2 +%else + paddw m0, %1, %2 + pmaddwd %1, %1 + pmaddwd %2, %2 + paddd m1, %1, %2 +%endif +%endmacro + %if HIGH_BIT_DEPTH == 0 -%macro VAR2_8x8_SSSE3 2 -cglobal pixel_var2_8x%1, 5,6,8 - pxor m5, m5 ; sum - pxor m6, m6 ; sum squared - mova m7, [hsub_mul] - mov r5d, %1/4 +INIT_XMM ssse3 +cglobal pixel_var2_internal + pxor m0, m0 ; sum + pxor m1, m1 ; sqr .loop: - movq m0, [r0] - movq m2, [r2] - movq m1, [r0+r1] - movq m3, [r2+r3] - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - punpcklbw m0, m2 - punpcklbw m1, m3 - movq m2, [r0] - movq m3, [r2] - punpcklbw m2, m3 - movq m3, [r0+r1] - movq m4, [r2+r3] - punpcklbw m3, m4 - pmaddubsw m0, m7 - pmaddubsw m1, m7 - pmaddubsw m2, m7 - pmaddubsw m3, m7 - paddw m5, m0 - paddw m5, m1 - paddw m5, m2 - paddw m5, m3 - pmaddwd m0, m0 - pmaddwd m1, m1 - pmaddwd m2, m2 - pmaddwd m3, m3 - paddd m6, m0 - paddd m6, m1 - paddd m6, m2 - paddd m6, m3 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - dec r5d + movq m2, [r0+1*t0] + punpcklbw m2, [r1+2*t0] + movq m3, [r0+1*t0-1*FENC_STRIDE] + punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE] + movq m4, [r0+1*t0-2*FENC_STRIDE] + punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE] + movq m5, [r0+1*t0-3*FENC_STRIDE] + punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE] + pmaddubsw m2, m7 + pmaddubsw m3, m7 + pmaddubsw m4, m7 + pmaddubsw m5, m7 + VAR2_CORE m2, m3, 1 + VAR2_CORE m4, m5, 1 + sub t0d, 4*FENC_STRIDE jg 
.loop - VAR2_END %2, m5, m6 + pmaddwd m0, [pw_1] + ret + +%macro VAR2_8x8_SSSE3 2 +cglobal pixel_var2_8x%1, 2,3,8 + mova m7, [hsub_mul] + mov t0d, (%1-1)*FENC_STRIDE + call pixel_var2_internal_ssse3 ; u + add r0, 8 + add r1, 16 + SBUTTERFLY qdq, 0, 1, 6 + paddd m1, m0 + mov t0d, (%1-1)*FENC_STRIDE + call pixel_var2_internal_ssse3 ; v + SBUTTERFLY qdq, 0, 6, 2 + paddd m0, m6 + phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v + VAR2_END m1, m0, %2 %endmacro -INIT_XMM ssse3 -VAR2_8x8_SSSE3 8, 6 -VAR2_8x8_SSSE3 16, 7 -INIT_XMM xop VAR2_8x8_SSSE3 8, 6 VAR2_8x8_SSSE3 16, 7 +%endif ; !HIGH_BIT_DEPTH + +%macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset +%if HIGH_BIT_DEPTH +%if mmsize == 64 + mova m2, [r1+2*%1+%2*FDEC_STRIDEB] + vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020 + mova m3, [r1+2*%1+%3*FDEC_STRIDEB] + vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020 +%else + mova xm2, [r1+2*%1+%2*FDEC_STRIDEB] + vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1 + mova xm3, [r1+2*%1+%3*FDEC_STRIDEB] + vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1 +%endif + psubw m2, [r0+1*%1+%2*FENC_STRIDEB] + psubw m3, [r0+1*%1+%3*FENC_STRIDEB] +%else + pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE] + mova m4, [r1+2*%1+%2*FDEC_STRIDE] + pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE] + mova m5, [r1+2*%1+%3*FDEC_STRIDE] + punpcklbw m4, m6 + punpcklbw m5, m6 + psubw m2, m4 + psubw m3, m5 +%endif +%endmacro %macro VAR2_8x8_AVX2 2 -cglobal pixel_var2_8x%1, 5,6,6 - pxor m3, m3 ; sum - pxor m4, m4 ; sum squared - mova m5, [hsub_mul] - mov r5d, %1/4 +%if HIGH_BIT_DEPTH +cglobal pixel_var2_8x%1, 2,3,4 +%else +cglobal pixel_var2_8x%1, 2,3,7 + pxor m6, m6 +%endif + mov t0d, (%1-3)*FENC_STRIDEB + VAR2_AVX2_LOAD t0, 2, 1 + VAR2_CORE m2, m3, 0 .loop: - movq xm0, [r0] - movq xm1, [r2] - vinserti128 m0, m0, [r0+r1], 1 - vinserti128 m1, m1, [r2+r3], 1 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - punpcklbw m0, m1 - movq xm1, [r0] - movq xm2, [r2] - vinserti128 m1, m1, [r0+r1], 1 - vinserti128 m2, m2, [r2+r3], 1 - lea r0, [r0+r1*2] - lea r2, [r2+r3*2] - punpcklbw m1, m2 - pmaddubsw m0, m5 - pmaddubsw m1, m5 - paddw m3, m0 - paddw m3, m1 - pmaddwd m0, m0 - pmaddwd m1, m1 - paddd m4, m0 - paddd m4, m1 - dec r5d + VAR2_AVX2_LOAD t0, 0, -1 + VAR2_CORE m2, m3, 1 + sub t0d, 2*FENC_STRIDEB jg .loop - vextracti128 xm0, m3, 1 - vextracti128 xm1, m4, 1 - paddw xm3, xm0 - paddd xm4, xm1 - VAR2_END %2, xm3, xm4 + + pmaddwd m0, [pw_1] + SBUTTERFLY qdq, 0, 1, 2 + paddd m0, m1 + vextracti128 xm1, m0, 1 + phaddd xm0, xm1 + VAR2_END xm0, xm1, %2 %endmacro INIT_YMM avx2 VAR2_8x8_AVX2 8, 6 VAR2_8x8_AVX2 16, 7 -%endif ; !HIGH_BIT_DEPTH +%macro VAR2_AVX512_END 1 ; shift + vbroadcasti32x4 m2, [pw_1] + pmaddwd m0, m2 + SBUTTERFLY qdq, 0, 1, 2 + paddd m0, m1 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 + psrlq ym1, ym0, 32 + paddd ym0, ym1 + vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v + VAR2_END xmm0, xmm1, %1 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_var2_8x8, 2,3 +%if HIGH_BIT_DEPTH == 0 + pxor xm6, xm6 +%endif + VAR2_AVX2_LOAD 0, 0, 2 + VAR2_CORE m2, m3, 0 + VAR2_AVX2_LOAD 0, 4, 6 + VAR2_CORE m2, m3, 1 + VAR2_AVX512_END 6 + +cglobal pixel_var2_8x16, 2,3 +%if HIGH_BIT_DEPTH == 0 + pxor xm6, xm6 +%endif + mov t0d, 10*FENC_STRIDEB + VAR2_AVX2_LOAD 0, 14, 12 + VAR2_CORE m2, m3, 0 +.loop: + VAR2_AVX2_LOAD t0, 0, -2 + VAR2_CORE m2, m3, 1 + sub t0d, 4*FENC_STRIDEB + jg .loop + VAR2_AVX512_END 7 ;============================================================================= ; SATD @@ -4583,6 +4700,244 @@ cglobal intra_sad_x9_8x8, 5,7,8 mov rsp, r6 mov eax, r2d RET + +%macro 
SATD_AVX512_LOAD4 2 ; size, opmask + vpbroadcast%1 m0, [r0] + vpbroadcast%1 m0 {%2}, [r0+2*r1] + vpbroadcast%1 m2, [r2] + vpbroadcast%1 m2 {%2}, [r2+2*r3] + add r0, r1 + add r2, r3 + vpbroadcast%1 m1, [r0] + vpbroadcast%1 m1 {%2}, [r0+2*r1] + vpbroadcast%1 m3, [r2] + vpbroadcast%1 m3 {%2}, [r2+2*r3] +%endmacro + +%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3 + vpbroadcast%1 %{2}0, [r0] + vpbroadcast%1 %{2}0 {%3}, [r0+2*r1] + vpbroadcast%1 %{2}2, [r2] + vpbroadcast%1 %{2}2 {%3}, [r2+2*r3] + vpbroadcast%1 m0 {%4}, [r0+4*r1] + vpbroadcast%1 m2 {%4}, [r2+4*r3] + vpbroadcast%1 m0 {%5}, [r0+2*r4] + vpbroadcast%1 m2 {%5}, [r2+2*r5] + vpbroadcast%1 %{2}1, [r0+r1] + vpbroadcast%1 %{2}1 {%3}, [r0+r4] + vpbroadcast%1 %{2}3, [r2+r3] + vpbroadcast%1 %{2}3 {%3}, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + vpbroadcast%1 m1 {%4}, [r0+r1] + vpbroadcast%1 m3 {%4}, [r2+r3] + vpbroadcast%1 m1 {%5}, [r0+r4] + vpbroadcast%1 m3 {%5}, [r2+r5] +%endmacro + +%macro SATD_AVX512_PACKED 0 + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 + SUMSUB_BA w, 0, 1, 2 + SBUTTERFLY qdq, 0, 1, 2 + SUMSUB_BA w, 0, 1, 2 + HMAXABSW2 0, 1, 2, 3 +%endmacro + +%macro SATD_AVX512_END 0-1 0 ; sa8d + paddw m0 {k1}{z}, m1 ; zero-extend to dwords +%if ARCH_X86_64 +%if mmsize == 64 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 +%endif +%if mmsize >= 32 + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 +%endif + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 + movq rax, xmm0 + rorx rdx, rax, 32 +%if %1 + lea eax, [rax+rdx+1] + shr eax, 1 +%else + add eax, edx +%endif +%else + HADDD m0, m1 + movd eax, xm0 +%if %1 + inc eax + shr eax, 1 +%endif +%endif + RET +%endmacro + +%macro HMAXABSW2 4 ; a, b, tmp1, tmp2 + pabsw m%1, m%1 + pabsw m%2, m%2 + psrldq m%3, m%1, 2 + psrld m%4, m%2, 16 + pmaxsw m%1, m%3 + pmaxsw m%2, m%4 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_satd_16x8_internal + vbroadcasti64x4 m6, [hmul_16p] + kxnorb k2, k2, k2 + mov r4d, 0x55555555 + knotw k2, k2 + kmovd k1, r4d + lea r4, [3*r1] + lea r5, [3*r3] +satd_16x8_avx512: + vbroadcasti128 ym0, [r0] + vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4 + vbroadcasti128 ym4, [r2] + vbroadcasti32x4 m4 {k2}, [r2+4*r3] + vbroadcasti128 ym2, [r0+2*r1] + vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6 + vbroadcasti128 ym5, [r2+2*r3] + vbroadcasti32x4 m5 {k2}, [r2+2*r5] + DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6 + vbroadcasti128 ym1, [r0+r1] + vbroadcasti128 ym4, [r2+r3] + vbroadcasti128 ym3, [r0+r4] + vbroadcasti128 ym5, [r2+r5] + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5 + vbroadcasti32x4 m4 {k2}, [r2+r3] + vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7 + vbroadcasti32x4 m5 {k2}, [r2+r5] + DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6 + HADAMARD4_V 0, 1, 2, 3, 4 + HMAXABSW2 0, 2, 4, 5 + HMAXABSW2 1, 3, 4, 5 + paddw m4, m0, m2 ; m1 + paddw m2, m1, m3 ; m0 + ret + +cglobal pixel_satd_8x8_internal + vbroadcasti64x4 m4, [hmul_16p] + mov r4d, 0x55555555 + kmovd k1, r4d ; 01010101 + kshiftlb k2, k1, 5 ; 10100000 + kshiftlb k3, k1, 4 ; 01010000 + lea r4, [3*r1] + lea r5, [3*r3] +satd_8x8_avx512: + SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 + SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5 + ret + +cglobal pixel_satd_16x8, 4,6 + call pixel_satd_16x8_internal_avx512 + jmp satd_zmm_avx512_end + +cglobal pixel_satd_16x16, 4,6 + call pixel_satd_16x8_internal_avx512 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m7, m0, m1 + call satd_16x8_avx512 + paddw m1, m7 + jmp satd_zmm_avx512_end + +cglobal pixel_satd_8x8, 4,6 + call pixel_satd_8x8_internal_avx512 +satd_zmm_avx512_end: + 
SATD_AVX512_END + +cglobal pixel_satd_8x16, 4,6 + call pixel_satd_8x8_internal_avx512 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m5, m0, m1 + call satd_8x8_avx512 + paddw m1, m5 + jmp satd_zmm_avx512_end + +INIT_YMM avx512 +cglobal pixel_satd_4x8_internal + vbroadcasti128 m4, [hmul_4p] + mov r4d, 0x55550c + kmovd k2, r4d ; 00001100 + kshiftlb k3, k2, 2 ; 00110000 + kshiftlb k4, k2, 4 ; 11000000 + kshiftrd k1, k2, 8 ; 01010101 + lea r4, [3*r1] + lea r5, [3*r3] +satd_4x8_avx512: + SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6 +satd_ymm_avx512: ; 1 1 3 3 5 5 7 7 + SATD_AVX512_PACKED + ret + +cglobal pixel_satd_8x4, 4,5 + mova m4, [hmul_16p] + mov r4d, 0x5555 + kmovw k1, r4d + SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0 + call satd_ymm_avx512 ; 3 1 3 1 + jmp satd_ymm_avx512_end2 + +cglobal pixel_satd_4x8, 4,6 + call pixel_satd_4x8_internal_avx512 +satd_ymm_avx512_end: +%if ARCH_X86_64 == 0 + pop r5d + %assign regs_used 5 +%endif +satd_ymm_avx512_end2: + SATD_AVX512_END + +cglobal pixel_satd_4x16, 4,6 + call pixel_satd_4x8_internal_avx512 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] + paddw m5, m0, m1 + call satd_4x8_avx512 + paddw m1, m5 + jmp satd_ymm_avx512_end + +INIT_XMM avx512 +cglobal pixel_satd_4x4, 4,5 + mova m4, [hmul_4p] + mov r4d, 0x550c + kmovw k2, r4d + kshiftrw k1, k2, 8 + SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2 + SATD_AVX512_PACKED ; 1 1 3 3 + SWAP 0, 1 + SATD_AVX512_END + +INIT_ZMM avx512 +cglobal pixel_sa8d_8x8, 4,6 + vbroadcasti64x4 m4, [hmul_16p] + mov r4d, 0x55555555 + kmovd k1, r4d ; 01010101 + kshiftlb k2, k1, 5 ; 10100000 + kshiftlb k3, k1, 4 ; 01010000 + lea r4, [3*r1] + lea r5, [3*r3] + SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 + DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5 + SUMSUB_BA w, 0, 1, 2 + SBUTTERFLY qdq, 0, 1, 2 + SUMSUB_BA w, 0, 1, 2 + shufps m2, m0, m1, q2020 + shufps m1, m0, m1, q3131 + SUMSUB_BA w, 2, 1, 0 + vshufi32x4 m0, m2, m1, q1010 + vshufi32x4 m1, m2, m1, q3232 + SUMSUB_BA w, 0, 1, 2 + HMAXABSW2 0, 1, 2, 3 + SATD_AVX512_END 1 + %endif ; HIGH_BIT_DEPTH ;============================================================================= @@ -4867,7 +5222,7 @@ ASD8 add r6, 4*%1 sub r0d, 4*%1 jg .loop - WIN64_RESTORE_XMM rsp + WIN64_RESTORE_XMM %if mmsize==32 vzeroupper %endif diff --git a/library/src/main/cpp/libx264/common/x86/pixel.h b/library/src/main/cpp/libx264/common/x86/pixel.h index 2b0baa3..56cfc5c 100644 --- a/library/src/main/cpp/libx264/common/x86/pixel.h +++ b/library/src/main/cpp/libx264/common/x86/pixel.h @@ -52,6 +52,7 @@ DECL_X1( sad, sse2_aligned ) DECL_X1( sad, ssse3 ) DECL_X1( sad, ssse3_aligned ) DECL_X1( sad, avx2 ) +DECL_X1( sad, avx512 ) DECL_X4( sad, mmx2 ) DECL_X4( sad, sse2 ) DECL_X4( sad, sse3 ) @@ -59,6 +60,7 @@ DECL_X4( sad, ssse3 ) DECL_X4( sad, xop ) DECL_X4( sad, avx ) DECL_X4( sad, avx2 ) +DECL_X4( sad, avx512 ) DECL_X1( ssd, mmx ) DECL_X1( ssd, mmx2 ) DECL_X1( ssd, sse2slow ) @@ -75,6 +77,7 @@ DECL_X1( satd, sse4 ) DECL_X1( satd, avx ) DECL_X1( satd, xop ) DECL_X1( satd, avx2 ) +DECL_X1( satd, avx512 ) DECL_X1( sa8d, mmx2 ) DECL_X1( sa8d, sse2 ) DECL_X1( sa8d, ssse3 ) @@ -83,6 +86,7 @@ DECL_X1( sa8d, sse4 ) DECL_X1( sa8d, avx ) DECL_X1( sa8d, xop ) DECL_X1( sa8d, avx2 ) +DECL_X1( sa8d, avx512 ) DECL_X1( sad, cache32_mmx2 ); DECL_X1( sad, cache64_mmx2 ); DECL_X1( sad, cache64_sse2 ); @@ -92,11 +96,10 @@ DECL_X4( sad, cache64_mmx2 ); DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); -DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, 
intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) @@ -165,16 +168,14 @@ void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1, const pixel *pix2, intptr_t stride2, int sums[2][4] ); float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width ); float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width ); -int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * ); -int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); -int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * ); +int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] ); +int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] ); +int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] ); int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height ); int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height ); diff --git a/library/src/main/cpp/libx264/common/x86/predict-a.asm b/library/src/main/cpp/libx264/common/x86/predict-a.asm index efc0f5a..527d9ec 100644 --- a/library/src/main/cpp/libx264/common/x86/predict-a.asm +++ b/library/src/main/cpp/libx264/common/x86/predict-a.asm @@ -468,7 +468,7 @@ PREDICT_4x4 w, wd, dq, qdq INIT_MMX mmx2 PREDICT_4x4 b, bw, wd, dq INIT_MMX ssse3 -%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64 +%define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3 PREDICT_4x4 b, bw, wd, dq %endif @@ -940,7 +940,7 @@ INIT_XMM sse2 PREDICT_8x8_DDLR INIT_XMM ssse3 PREDICT_8x8_DDLR -INIT_XMM ssse3, cache64 +INIT_XMM cache64, ssse3 PREDICT_8x8_DDLR %elif ARCH_X86_64 == 0 INIT_MMX mmx2 diff --git a/library/src/main/cpp/libx264/common/x86/predict-c.c b/library/src/main/cpp/libx264/common/x86/predict-c.c index fcf8413..27da63a 100644 --- 
a/library/src/main/cpp/libx264/common/x86/predict-c.c +++ b/library/src/main/cpp/libx264/common/x86/predict-c.c @@ -511,8 +511,8 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_ *predict_8x8_filter = x264_predict_8x8_filter_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) { - pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_ssse3_cache64; - pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_ssse3_cache64; + pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_cache64_ssse3; + pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_cache64_ssse3; } if( !(cpu&X264_CPU_AVX) ) return; @@ -604,6 +604,6 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] ) pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3; pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3; if( cpu&X264_CPU_CACHELINE_64 ) - pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64; + pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_cache64_ssse3; #endif // HIGH_BIT_DEPTH } diff --git a/library/src/main/cpp/libx264/common/x86/predict.h b/library/src/main/cpp/libx264/common/x86/predict.h index ddc7de6..9f9052c 100644 --- a/library/src/main/cpp/libx264/common/x86/predict.h +++ b/library/src/main/cpp/libx264/common/x86/predict.h @@ -93,12 +93,12 @@ void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] ); void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] ); -void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] ); +void x264_predict_8x8_ddl_cache64_ssse3( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] ); void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] ); -void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] ); +void x264_predict_8x8_ddr_cache64_ssse3( pixel *src, pixel edge[36] ); void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] ); void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] ); void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] ); @@ -129,7 +129,7 @@ void x264_predict_4x4_vl_avx( uint16_t *src ); void x264_predict_4x4_vr_mmx2( uint8_t *src ); void x264_predict_4x4_vr_sse2( uint16_t *src ); void x264_predict_4x4_vr_ssse3( pixel *src ); -void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src ); +void x264_predict_4x4_vr_cache64_ssse3( uint8_t *src ); void x264_predict_4x4_vr_avx( uint16_t *src ); void x264_predict_4x4_hd_mmx2( pixel *src ); void x264_predict_4x4_hd_sse2( uint16_t *src ); diff --git a/library/src/main/cpp/libx264/common/x86/quant-a.asm b/library/src/main/cpp/libx264/common/x86/quant-a.asm index 2391b57..f8ebbe5 100644 --- a/library/src/main/cpp/libx264/common/x86/quant-a.asm +++ b/library/src/main/cpp/libx264/common/x86/quant-a.asm @@ -30,7 +30,14 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 + +%if HIGH_BIT_DEPTH +decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15 +%else +dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30 + dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62 +%endif %macro DQM4 3 dw %1, %2, %1, %2, %2, %3, %2, %3 @@ -42,14 +49,6 @@ SECTION_RODATA 32 dw %4, %2, %6, %2, %4, %2, %6, %2 %endmacro -dequant4_scale: - DQM4 10, 13, 16 - DQM4 11, 14, 18 - DQM4 13, 16, 20 - DQM4 14, 18, 23 - DQM4 16, 20, 25 - DQM4 18, 23, 29 - dequant8_scale: DQM8 20, 18, 32, 19, 25, 24 DQM8 
22, 19, 35, 21, 28, 26 @@ -58,6 +57,14 @@ dequant8_scale: DQM8 32, 28, 51, 30, 40, 38 DQM8 36, 32, 58, 34, 46, 43 +dequant4_scale: + DQM4 10, 13, 16 + DQM4 11, 14, 18 + DQM4 13, 16, 20 + DQM4 14, 18, 23 + DQM4 16, 20, 25 + DQM4 18, 23, 29 + decimate_mask_table4: db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4 db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14 @@ -743,6 +750,163 @@ DEQUANT 4, 4, 4 DEQUANT 8, 6, 4 %endif +%macro DEQUANT_START_AVX512 1-2 0 ; shift, flat +%if %2 == 0 + movifnidn t2d, r2m +%endif + imul t0d, t2d, 0x2b + shr t0d, 8 ; i_qbits = i_qp / 6 + lea t1d, [t0*5] + sub t2d, t0d + sub t2d, t1d ; i_mf = i_qp % 6 + shl t2d, %1 +%if %2 +%ifdef PIC +%define dmf r1+t2 + lea r1, [dequant8_scale] +%else +%define dmf t2+dequant8_scale +%endif +%elif ARCH_X86_64 +%define dmf r1+t2 +%else +%define dmf r1 + add r1, r1mp ; dequant_mf[i_mf] +%endif + movifnidn r0, r0mp +%endmacro + +INIT_ZMM avx512 +cglobal dequant_4x4, 0,3 + DEQUANT_START_AVX512 6 + mova m0, [dmf] +%if HIGH_BIT_DEPTH + pmaddwd m0, [r0] +%endif + sub t0d, 4 + jl .rshift +%if HIGH_BIT_DEPTH + vpbroadcastd m1, t0d + vpsllvd m0, m1 + mova [r0], m0 +%else + vpbroadcastw ym1, t0d + vpmovsdw ym0, m0 + pmullw ym0, [r0] + vpsllvw ym0, ym1 + mova [r0], ym0 +%endif + RET +.rshift: +%if HIGH_BIT_DEPTH == 0 + pmovzxwd m1, [r0] + pmaddwd m0, m1 +%endif + mov r1d, 1<<31 + shrx r1d, r1d, t0d ; 1 << (-i_qbits-1) + neg t0d + vpbroadcastd m1, r1d + vpbroadcastd m2, t0d + paddd m0, m1 + vpsravd m0, m2 +%if HIGH_BIT_DEPTH + mova [r0], m0 +%else + vpmovsdw [r0], m0 +%endif + RET + +cglobal dequant_8x8, 0,3 + DEQUANT_START_AVX512 8 + mova m0, [dmf+0*64] + mova m1, [dmf+1*64] + mova m2, [dmf+2*64] + mova m3, [dmf+3*64] +%if HIGH_BIT_DEPTH + pmaddwd m0, [r0+0*64] + pmaddwd m1, [r0+1*64] + pmaddwd m2, [r0+2*64] + pmaddwd m3, [r0+3*64] +%else + mova m6, [dequant_shuf_avx512] +%endif + sub t0d, 6 + jl .rshift +%if HIGH_BIT_DEPTH + vpbroadcastd m4, t0d + vpsllvd m0, m4 + vpsllvd m1, m4 + vpsllvd m2, m4 + vpsllvd m3, m4 + jmp .end +.rshift: +%else + vpbroadcastw m4, t0d + vpermt2w m0, m6, m1 + vpermt2w m2, m6, m3 + pmullw m0, [r0] + pmullw m2, [r0+64] + vpsllvw m0, m4 + vpsllvw m2, m4 + mova [r0], m0 + mova [r0+64], m2 + RET +.rshift: + pmovzxwd m4, [r0+0*32] + pmovzxwd m5, [r0+1*32] + pmaddwd m0, m4 + pmaddwd m1, m5 + pmovzxwd m4, [r0+2*32] + pmovzxwd m5, [r0+3*32] + pmaddwd m2, m4 + pmaddwd m3, m5 +%endif + mov r1d, 1<<31 + shrx r1d, r1d, t0d ; 1 << (-i_qbits-1) + neg t0d + vpbroadcastd m4, r1d + vpbroadcastd m5, t0d + paddd m0, m4 + paddd m1, m4 + vpsravd m0, m5 + vpsravd m1, m5 + paddd m2, m4 + paddd m3, m4 + vpsravd m2, m5 + vpsravd m3, m5 +%if HIGH_BIT_DEPTH +.end: + mova [r0+0*64], m0 + mova [r0+1*64], m1 + mova [r0+2*64], m2 + mova [r0+3*64], m3 +%else + vpermt2w m0, m6, m1 + vpermt2w m2, m6, m3 + mova [r0], m0 + mova [r0+64], m2 +%endif + RET + +%if HIGH_BIT_DEPTH == 0 +cglobal dequant_8x8_flat16, 0,3 + movifnidn t2d, r2m + cmp t2d, 12 + jl dequant_8x8_avx512 + sub t2d, 12 + DEQUANT_START_AVX512 6, 1 + vpbroadcastw m0, t0d + mova m1, [dmf] + vpsllvw m1, m0 + pmullw m0, m1, [r0] + pmullw m1, [r0+64] + mova [r0], m0 + mova [r0+64], m1 + RET +%endif + +%undef dmf + %macro DEQUANT_DC 2 cglobal dequant_4x4dc, 0,3,6 DEQUANT_START 6, 6 @@ -1208,13 +1372,12 @@ cglobal denoise_dct, 4,4,4 ; int decimate_score( dctcoef *dct ) ;----------------------------------------------------------------------------- -%macro DECIMATE_MASK 5 -%if mmsize==16 +%macro DECIMATE_MASK 4 %if HIGH_BIT_DEPTH - movdqa 
m0, [%3+ 0] - movdqa m1, [%3+32] - packssdw m0, [%3+16] - packssdw m1, [%3+48] + mova m0, [%3+0*16] + packssdw m0, [%3+1*16] + mova m1, [%3+2*16] + packssdw m1, [%3+3*16] ABSW2 m0, m1, m0, m1, m3, m4 %else ABSW m0, [%3+ 0], m3 @@ -1226,40 +1389,35 @@ cglobal denoise_dct, 4,4,4 pcmpgtb m0, %4 pmovmskb %1, m2 pmovmskb %2, m0 -%else ; mmsize==8 +%endmacro + +%macro DECIMATE_MASK16_AVX512 0 + mova m0, [r0] %if HIGH_BIT_DEPTH - movq m0, [%3+ 0] - movq m1, [%3+16] - movq m2, [%3+32] - movq m3, [%3+48] - packssdw m0, [%3+ 8] - packssdw m1, [%3+24] - packssdw m2, [%3+40] - packssdw m3, [%3+56] -%else - movq m0, [%3+ 0] - movq m1, [%3+ 8] - movq m2, [%3+16] - movq m3, [%3+24] -%endif - ABSW2 m0, m1, m0, m1, m6, m7 - ABSW2 m2, m3, m2, m3, m6, m7 - packsswb m0, m1 - packsswb m2, m3 - pxor m4, m4 - pxor m6, m6 - pcmpeqb m4, m0 - pcmpeqb m6, m2 - pcmpgtb m0, %4 - pcmpgtb m2, %4 - pmovmskb %5, m4 - pmovmskb %1, m6 - shl %1, 8 - or %1, %5 - pmovmskb %5, m0 - pmovmskb %2, m2 - shl %2, 8 - or %2, %5 + vptestmd k0, m0, m0 + pabsd m0, m0 + vpcmpud k1, m0, [pd_1] {1to16}, 6 +%else + vptestmw k0, m0, m0 + pabsw m0, m0 + vpcmpuw k1, m0, [pw_1], 6 +%endif +%endmacro + +%macro SHRX 2 +%if cpuflag(bmi2) + shrx %1, %1, %2 +%else + shr %1, %2b ; %2 has to be rcx/ecx +%endif +%endmacro + +%macro BLSR 2 +%if cpuflag(bmi1) + blsr %1, %2 +%else + lea %1, [%2-1] + and %1, %2 %endif %endmacro @@ -1269,33 +1427,60 @@ cextern decimate_table8 %macro DECIMATE4x4 1 cglobal decimate_score%1, 1,3 -%ifdef PIC - lea r4, [decimate_table4] - lea r5, [decimate_mask_table4] - %define table r4 - %define mask_table r5 +%if cpuflag(avx512) + DECIMATE_MASK16_AVX512 + xor eax, eax + kmovw edx, k0 +%if %1 == 15 + shr edx, 1 %else - %define table decimate_table4 - %define mask_table decimate_mask_table4 + test edx, edx %endif - DECIMATE_MASK edx, eax, r0, [pb_1], ecx + jz .ret + ktestw k1, k1 + jnz .ret9 +%else + DECIMATE_MASK edx, eax, r0, [pb_1] xor edx, 0xffff - je .ret + jz .ret test eax, eax - jne .ret9 -%if %1==15 + jnz .ret9 +%if %1 == 15 shr edx, 1 +%endif +%endif +%ifdef PIC + lea r4, [decimate_mask_table4] + %define mask_table r4 +%else + %define mask_table decimate_mask_table4 %endif movzx ecx, dl movzx eax, byte [mask_table + rcx] +%if ARCH_X86_64 + xor edx, ecx + jz .ret +%if cpuflag(lzcnt) + lzcnt ecx, ecx + lea r5, [decimate_table4-32] + add r5, rcx +%else + bsr ecx, ecx + lea r5, [decimate_table4-1] + sub r5, rcx +%endif + %define table r5 +%else cmp edx, ecx - je .ret + jz .ret bsr ecx, ecx shr edx, 1 - shr edx, cl + SHRX edx, ecx + %define table decimate_table4 +%endif tzcnt ecx, edx shr edx, 1 - shr edx, cl + SHRX edx, ecx add al, byte [table + rcx] add al, byte [mask_table + rdx] .ret: @@ -1303,175 +1488,224 @@ cglobal decimate_score%1, 1,3 .ret9: mov eax, 9 RET - %endmacro -%if ARCH_X86_64 == 0 -INIT_MMX mmx2 -DECIMATE4x4 15 -DECIMATE4x4 16 -%endif -INIT_XMM sse2 -DECIMATE4x4 15 -DECIMATE4x4 16 -INIT_XMM ssse3 -DECIMATE4x4 15 -DECIMATE4x4 16 - -; 2x gt1 output, 2x nz output, 1x mask -%macro DECIMATE_MASK64_AVX2 5 - pabsw m0, [r0+ 0] - pabsw m2, [r0+32] - pabsw m1, [r0+64] - pabsw m3, [r0+96] - packsswb m0, m2 - packsswb m1, m3 - pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so - pcmpgtb m3, m1, %5 ; we can save latency by doing them here - pmovmskb %1, m2 - pmovmskb %2, m3 - or %1, %2 - jne .ret9 +%macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high + mova m0, [r0+0*32] + packsswb m0, [r0+1*32] + mova m1, [r0+2*32] + packsswb m1, [r0+3*32] + mova m4, [pb_1] + pabsb m2, m0 + pabsb m3, m1 + por m2, m3 ; the > 1 
checks don't care about order, so + ptest m4, m2 ; we can save latency by doing them here + jnc .ret9 vpermq m0, m0, q3120 vpermq m1, m1, q3120 pxor m4, m4 pcmpeqb m0, m4 pcmpeqb m1, m4 - pmovmskb %3, m0 - pmovmskb %4, m1 + pmovmskb %1, m0 + pmovmskb %2, m1 %endmacro -%macro DECIMATE8x8 0 +%macro DECIMATE_MASK64_AVX512 0 + mova m0, [r0] +%if HIGH_BIT_DEPTH + packssdw m0, [r0+1*64] + mova m1, [r0+2*64] + packssdw m1, [r0+3*64] + packsswb m0, m1 + vbroadcasti32x4 m1, [pb_1] + pabsb m2, m0 + vpcmpub k0, m2, m1, 6 + ktestq k0, k0 + jnz .ret9 + mova m1, [decimate_shuf_avx512] + vpermd m0, m1, m0 + vptestmb k1, m0, m0 +%else + mova m1, [r0+64] + vbroadcasti32x4 m3, [pb_1] + packsswb m2, m0, m1 + pabsb m2, m2 + vpcmpub k0, m2, m3, 6 + ktestq k0, k0 + jnz .ret9 + vptestmw k1, m0, m0 + vptestmw k2, m1, m1 +%endif +%endmacro +%macro DECIMATE8x8 0 %if ARCH_X86_64 cglobal decimate_score64, 1,5 +%if mmsize == 64 + DECIMATE_MASK64_AVX512 + xor eax, eax +%if HIGH_BIT_DEPTH + kmovq r1, k1 + test r1, r1 + jz .ret +%else + kortestd k1, k2 + jz .ret + kunpckdq k1, k2, k1 + kmovq r1, k1 +%endif +%elif mmsize == 32 + DECIMATE_MASK64_AVX2 r1d, eax + not r1 + shl rax, 32 + xor r1, rax + jz .ret +%else + mova m5, [pb_1] + DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5 + test eax, eax + jnz .ret9 + DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5 + shl r2d, 16 + or r1d, r2d + DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5 + shl r2, 32 + or eax, r3d + or r1, r2 + DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5 + not r1 + shl r2, 48 + xor r1, r2 + jz .ret + add eax, r3d + jnz .ret9 +%endif %ifdef PIC lea r4, [decimate_table8] %define table r4 %else %define table decimate_table8 %endif - mova m5, [pb_1] -%if mmsize==32 - DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5 - shl r3, 32 - or r1, r3 - xor r1, -1 - je .ret -%else - DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null - test eax, eax - jne .ret9 - DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null - shl r2d, 16 - or r1d, r2d - DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null - shl r2, 32 - or eax, r3d - or r1, r2 - DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null - shl r2, 48 - or r1, r2 - xor r1, -1 - je .ret - add eax, r3d - jne .ret9 -%endif - mov al, -6 + mov al, -6 .loop: tzcnt rcx, r1 - shr r1, cl - add al, byte [table + rcx] - jge .ret9 - shr r1, 1 - jne .loop - add al, 6 + add al, byte [table + rcx] + jge .ret9 + shr r1, 1 + SHRX r1, rcx +%if cpuflag(bmi2) + test r1, r1 +%endif + jnz .loop + add al, 6 .ret: REP_RET .ret9: - mov eax, 9 + mov eax, 9 RET %else ; ARCH -%if mmsize == 8 -cglobal decimate_score64, 1,6 +cglobal decimate_score64, 1,4 +%if mmsize == 64 + DECIMATE_MASK64_AVX512 + xor eax, eax +%if HIGH_BIT_DEPTH + kshiftrq k2, k1, 32 +%endif + kmovd r2, k1 + kmovd r3, k2 + test r2, r2 + jz .tryret +%elif mmsize == 32 + DECIMATE_MASK64_AVX2 r2, r3 + xor eax, eax + not r3 + xor r2, -1 + jz .tryret %else -cglobal decimate_score64, 1,5 -%endif - mova m5, [pb_1] -%if mmsize==32 - DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5 - xor r3, -1 - je .tryret - xor r4, -1 -.cont: -%else - DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5 - test r2, r2 - jne .ret9 - DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5 - shl r4, 16 - or r3, r4 - DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5 - or r2, r1 - DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5 - shl r1, 16 - or r4, r1 - xor r3, -1 - je .tryret - xor r4, -1 -.cont: - add r0, r2 - jne .ret9 -%endif - mov al, -6 + mova m5, [pb_1] + DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5 + test 
r1, r1 + jnz .ret9 + DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5 + not r2 + shl r3, 16 + xor r2, r3 + mov r0m, r2 + DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5 + or r2, r1 + DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5 + add r0, r2 + jnz .ret9 + mov r2, r0m + not r3 + shl r1, 16 + xor r3, r1 + test r2, r2 + jz .tryret +%endif + mov al, -6 .loop: + tzcnt ecx, r2 + add al, byte [decimate_table8 + ecx] + jge .ret9 + sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well + jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31 + shrd r2, r3, cl + SHRX r3, ecx +%if notcpuflag(bmi2) + test r2, r2 +%endif + jnz .loop + BLSR r2, r3 + jz .end +.largerun: tzcnt ecx, r3 - test r3, r3 - je .largerun - shrd r3, r4, cl - shr r4, cl - add al, byte [decimate_table8 + ecx] - jge .ret9 - shrd r3, r4, 1 - shr r4, 1 - test r3, r3 - jne .loop - test r4, r4 - jne .loop - add al, 6 -.ret: - REP_RET -.tryret: - xor r4, -1 - jne .cont + shr r3, 1 + SHRX r3, ecx +.loop2: + tzcnt ecx, r3 + add al, byte [decimate_table8 + ecx] + jge .ret9 + shr r3, 1 + SHRX r3, ecx +.run31: + test r3, r3 + jnz .loop2 +.end: + add al, 6 RET +.tryret: + BLSR r2, r3 + jz .ret + mov al, -6 + jmp .largerun .ret9: mov eax, 9 - RET -.largerun: - mov r3, r4 - xor r4, r4 - tzcnt ecx, r3 - shr r3, cl - shr r3, 1 - jne .loop - add al, 6 - RET +.ret: + REP_RET %endif ; ARCH - %endmacro -%if ARCH_X86_64 == 0 -INIT_MMX mmx2 -DECIMATE8x8 -%endif INIT_XMM sse2 +DECIMATE4x4 15 +DECIMATE4x4 16 DECIMATE8x8 INIT_XMM ssse3 +DECIMATE4x4 15 +DECIMATE4x4 16 DECIMATE8x8 +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +%else INIT_YMM avx2 DECIMATE8x8 +INIT_YMM avx512 +%endif +DECIMATE4x4 15 +DECIMATE4x4 16 +INIT_ZMM avx512 +DECIMATE8x8 ;----------------------------------------------------------------------------- ; int coeff_last( dctcoef *dct ) @@ -1556,7 +1790,7 @@ cglobal coeff_last4, 1,3 INIT_MMX mmx2 COEFF_LAST4 -INIT_MMX mmx2, lzcnt +INIT_MMX lzcnt COEFF_LAST4 %macro COEFF_LAST8 0 @@ -1579,7 +1813,7 @@ COEFF_LAST8 %endif INIT_XMM sse2 COEFF_LAST8 -INIT_XMM sse2, lzcnt +INIT_XMM lzcnt COEFF_LAST8 %else ; !HIGH_BIT_DEPTH @@ -1642,7 +1876,7 @@ cglobal coeff_last8, 1,3 INIT_MMX mmx2 COEFF_LAST48 -INIT_MMX mmx2, lzcnt +INIT_MMX lzcnt COEFF_LAST48 %endif ; HIGH_BIT_DEPTH @@ -1707,7 +1941,7 @@ COEFF_LAST %endif INIT_XMM sse2 COEFF_LAST -INIT_XMM sse2, lzcnt +INIT_XMM lzcnt COEFF_LAST %macro LAST_MASK_AVX2 2 @@ -1729,7 +1963,7 @@ COEFF_LAST %endmacro %if ARCH_X86_64 == 0 -INIT_YMM avx2,lzcnt +INIT_YMM avx2 cglobal coeff_last64, 1,2 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32 @@ -1744,7 +1978,7 @@ cglobal coeff_last64, 1,2 add eax, 32 RET %else -INIT_YMM avx2,lzcnt +INIT_YMM avx2 cglobal coeff_last64, 1,3 pxor m2, m2 LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0 @@ -1756,6 +1990,70 @@ cglobal coeff_last64, 1,3 RET %endif +%macro COEFF_LAST_AVX512 2 ; num, w/d +cglobal coeff_last%1, 1,2 + mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF] + vptestm%2 k0, m0, m0 +%if %1 == 15 + mov eax, 30 + kmovw r1d, k0 + lzcnt r1d, r1d + sub eax, r1d +%else + kmovw eax, k0 + lzcnt eax, eax + xor eax, 31 +%endif + RET +%endmacro + +%macro COEFF_LAST64_AVX512 1 ; w/d +cglobal coeff_last64, 1,2 + pxor xm0, xm0 + vpcmp%1 k0, m0, [r0+0*64], 4 + vpcmp%1 k1, m0, [r0+1*64], 4 +%if HIGH_BIT_DEPTH + vpcmp%1 k2, m0, [r0+2*64], 4 + vpcmp%1 k3, m0, [r0+3*64], 4 + kunpckwd k0, k1, k0 + kunpckwd k1, k3, k2 +%endif +%if ARCH_X86_64 + kunpckdq k0, k1, k0 + kmovq rax, k0 + lzcnt rax, rax + xor eax, 63 +%else + kmovd r1d, k1 + kmovd eax, k0 + lzcnt 
r1d, r1d + lzcnt eax, eax + xor r1d, 32 + cmovnz eax, r1d + xor eax, 31 +%endif + RET +%endmacro + +%if HIGH_BIT_DEPTH +INIT_XMM avx512 +COEFF_LAST_AVX512 4, d +INIT_YMM avx512 +COEFF_LAST_AVX512 8, d +INIT_ZMM avx512 +COEFF_LAST_AVX512 15, d +COEFF_LAST_AVX512 16, d +COEFF_LAST64_AVX512 d +%else ; !HIGH_BIT_DEPTH +INIT_XMM avx512 +COEFF_LAST_AVX512 8, w +INIT_YMM avx512 +COEFF_LAST_AVX512 15, w +COEFF_LAST_AVX512 16, w +INIT_ZMM avx512 +COEFF_LAST64_AVX512 w +%endif ; !HIGH_BIT_DEPTH + ;----------------------------------------------------------------------------- ; int coeff_level_run( dctcoef *dct, run_level_t *runlevel ) ;----------------------------------------------------------------------------- @@ -1833,15 +2131,17 @@ COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 -INIT_XMM sse2, lzcnt +INIT_MMX lzcnt +COEFF_LEVELRUN 4 +%if HIGH_BIT_DEPTH == 0 +COEFF_LEVELRUN 8 +%endif +INIT_XMM lzcnt %if HIGH_BIT_DEPTH COEFF_LEVELRUN 8 %endif COEFF_LEVELRUN 15 COEFF_LEVELRUN 16 -INIT_MMX mmx2, lzcnt -COEFF_LEVELRUN 4 -COEFF_LEVELRUN 8 ; Similar to the one above, but saves the DCT ; coefficients in m0/m1 so we don't have to load @@ -1968,7 +2268,7 @@ INIT_XMM ssse3, lzcnt COEFF_LEVELRUN_LUT 8 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 -INIT_XMM avx2, lzcnt +INIT_XMM avx2 COEFF_LEVELRUN_LUT 15 COEFF_LEVELRUN_LUT 16 %endif diff --git a/library/src/main/cpp/libx264/common/x86/quant.h b/library/src/main/cpp/libx264/common/x86/quant.h index 9596a58..6b74aac 100644 --- a/library/src/main/cpp/libx264/common/x86/quant.h +++ b/library/src/main/cpp/libx264/common/x86/quant.h @@ -66,12 +66,15 @@ void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp ); +void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp ); void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp ); +void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp ); void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp ); void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp ); @@ -85,16 +88,16 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size ); -int x264_decimate_score15_mmx2( dctcoef *dct ); int x264_decimate_score15_sse2( dctcoef *dct ); int x264_decimate_score15_ssse3( dctcoef *dct ); -int 
x264_decimate_score16_mmx2( dctcoef *dct ); +int x264_decimate_score15_avx512( dctcoef *dct ); int x264_decimate_score16_sse2( dctcoef *dct ); int x264_decimate_score16_ssse3( dctcoef *dct ); -int x264_decimate_score64_mmx2( dctcoef *dct ); +int x264_decimate_score16_avx512( dctcoef *dct ); int x264_decimate_score64_sse2( dctcoef *dct ); int x264_decimate_score64_ssse3( dctcoef *dct ); int x264_decimate_score64_avx2( int16_t *dct ); +int x264_decimate_score64_avx512( dctcoef *dct ); int x264_coeff_last4_mmx2( dctcoef *dct ); int x264_coeff_last8_mmx2( dctcoef *dct ); int x264_coeff_last15_mmx2( dctcoef *dct ); @@ -104,33 +107,37 @@ int x264_coeff_last8_sse2( dctcoef *dct ); int x264_coeff_last15_sse2( dctcoef *dct ); int x264_coeff_last16_sse2( dctcoef *dct ); int x264_coeff_last64_sse2( dctcoef *dct ); -int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct ); -int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct ); -int x264_coeff_last8_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last15_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last16_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last64_sse2_lzcnt( dctcoef *dct ); -int x264_coeff_last64_avx2_lzcnt( dctcoef *dct ); +int x264_coeff_last4_lzcnt( dctcoef *dct ); +int x264_coeff_last8_lzcnt( dctcoef *dct ); +int x264_coeff_last15_lzcnt( dctcoef *dct ); +int x264_coeff_last16_lzcnt( dctcoef *dct ); +int x264_coeff_last64_lzcnt( dctcoef *dct ); +int x264_coeff_last64_avx2 ( dctcoef *dct ); +int x264_coeff_last4_avx512( int32_t *dct ); +int x264_coeff_last8_avx512( dctcoef *dct ); +int x264_coeff_last15_avx512( dctcoef *dct ); +int x264_coeff_last16_avx512( dctcoef *dct ); +int x264_coeff_last64_avx512( dctcoef *dct ); int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_lzcnt( 
dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel ); -int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); +int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel ); int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel ); int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac ); diff --git a/library/src/main/cpp/libx264/common/x86/sad-a.asm b/library/src/main/cpp/libx264/common/x86/sad-a.asm index ede52ae..8029e11 100644 --- a/library/src/main/cpp/libx264/common/x86/sad-a.asm +++ b/library/src/main/cpp/libx264/common/x86/sad-a.asm @@ -106,8 +106,6 @@ SAD 4, 16 SAD 4, 8 SAD 4, 4 - - ;============================================================================= ; SAD XMM ;============================================================================= @@ -119,118 +117,64 @@ SAD 4, 4 RET %endmacro -%macro SAD_W16 0 ;----------------------------------------------------------------------------- ; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;----------------------------------------------------------------------------- -cglobal pixel_sad_16x16, 4,4,8 - movu m0, [r2] - movu m1, [r2+r3] - lea r2, [r2+2*r3] - movu m2, [r2] - movu m3, [r2+r3] - lea r2, [r2+2*r3] - psadbw m0, [r0] - psadbw m1, [r0+r1] - lea r0, [r0+2*r1] - movu m4, [r2] - paddw m0, m1 - psadbw m2, [r0] - psadbw m3, [r0+r1] - lea r0, [r0+2*r1] - movu m5, [r2+r3] - lea r2, [r2+2*r3] - paddw m2, m3 - movu m6, [r2] - movu m7, [r2+r3] - lea r2, [r2+2*r3] - paddw m0, m2 - psadbw m4, [r0] - psadbw m5, [r0+r1] - lea r0, [r0+2*r1] - movu m1, [r2] - paddw m4, m5 - psadbw m6, [r0] - psadbw m7, [r0+r1] - lea r0, [r0+2*r1] - movu m2, [r2+r3] - lea r2, [r2+2*r3] - paddw m6, m7 - movu m3, [r2] - paddw m0, m4 - movu m4, [r2+r3] - lea r2, [r2+2*r3] - paddw m0, m6 - psadbw m1, [r0] - psadbw m2, [r0+r1] - lea r0, [r0+2*r1] - movu m5, [r2] - paddw m1, m2 - psadbw m3, [r0] - psadbw m4, [r0+r1] - lea r0, [r0+2*r1] - movu m6, [r2+r3] - lea r2, [r2+2*r3] - paddw m3, m4 - movu m7, [r2] - paddw m0, m1 - movu m1, [r2+r3] - paddw m0, m3 - psadbw m5, [r0] - psadbw m6, [r0+r1] - lea r0, [r0+2*r1] - paddw m5, m6 - psadbw m7, [r0] - psadbw m1, [r0+r1] - paddw m7, m1 - paddw m0, m5 - paddw m0, m7 - SAD_END_SSE2 - -;----------------------------------------------------------------------------- -; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) -;----------------------------------------------------------------------------- -cglobal pixel_sad_16x8, 4,4 - movu m0, [r2] - movu m2, [r2+r3] - lea r2, [r2+2*r3] - movu m3, [r2] - movu m4, [r2+r3] - psadbw m0, [r0] - psadbw m2, [r0+r1] - lea r0, [r0+2*r1] - psadbw m3, [r0] - psadbw m4, [r0+r1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - paddw m0, m2 - paddw m3, m4 - paddw m0, m3 - movu m1, [r2] - movu m2, [r2+r3] - lea r2, [r2+2*r3] - movu m3, [r2] - movu m4, [r2+r3] - psadbw m1, [r0] - psadbw m2, [r0+r1] - lea r0, [r0+2*r1] - psadbw m3, [r0] - psadbw m4, [r0+r1] - lea r0, [r0+2*r1] - lea r2, [r2+2*r3] - paddw m1, m2 - paddw m3, m4 - paddw m0, m1 - paddw m0, m3 +%macro SAD_W16 1 ; h +cglobal pixel_sad_16x%1, 4,4 +%ifidn cpuname, sse2 +.skip_prologue: +%endif +%assign %%i 0 +%if ARCH_X86_64 + lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile + lea r5, [3*r3] +%rep %1/4 + movu m1, [r2] + psadbw m1, [r0] + movu m3, [r2+r3] + psadbw m3, [r0+r1] + movu 
m2, [r2+2*r3] + psadbw m2, [r0+2*r1] + movu m4, [r2+r5] + psadbw m4, [r0+r6] +%if %%i != %1/4-1 + lea r2, [r2+4*r3] + lea r0, [r0+4*r1] +%endif + paddw m1, m3 + paddw m2, m4 + ACCUM paddw, 0, 1, %%i + paddw m0, m2 + %assign %%i %%i+1 +%endrep +%else ; The cost of having to save and restore registers on x86-32 +%rep %1/2 ; nullifies the benefit of having 3*stride in registers. + movu m1, [r2] + psadbw m1, [r0] + movu m2, [r2+r3] + psadbw m2, [r0+r1] +%if %%i != %1/2-1 + lea r2, [r2+2*r3] + lea r0, [r0+2*r1] +%endif + ACCUM paddw, 0, 1, %%i + paddw m0, m2 + %assign %%i %%i+1 +%endrep +%endif SAD_END_SSE2 %endmacro INIT_XMM sse2 -SAD_W16 +SAD_W16 16 +SAD_W16 8 INIT_XMM sse3 -SAD_W16 +SAD_W16 16 +SAD_W16 8 INIT_XMM sse2, aligned -SAD_W16 +SAD_W16 16 +SAD_W16 8 %macro SAD_INC_4x8P_SSE 1 movq m1, [r0] @@ -259,7 +203,132 @@ cglobal pixel_sad_8x16_sse2, 4,4 SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_END_SSE2 + +%macro SAD_W48_AVX512 3 ; w, h, d/q +cglobal pixel_sad_%1x%2, 4,4 + kxnorb k1, k1, k1 + kaddb k1, k1, k1 +%assign %%i 0 +%if ARCH_X86_64 && %2 != 4 + lea r6, [3*r1] + lea r5, [3*r3] +%rep %2/4 + mov%3 m1, [r0] + vpbroadcast%3 m1 {k1}, [r0+r1] + mov%3 m3, [r2] + vpbroadcast%3 m3 {k1}, [r2+r3] + mov%3 m2, [r0+2*r1] + vpbroadcast%3 m2 {k1}, [r0+r6] + mov%3 m4, [r2+2*r3] + vpbroadcast%3 m4 {k1}, [r2+r5] +%if %%i != %2/4-1 + lea r0, [r0+4*r1] + lea r2, [r2+4*r3] +%endif + psadbw m1, m3 + psadbw m2, m4 + ACCUM paddd, 0, 1, %%i + paddd m0, m2 + %assign %%i %%i+1 +%endrep +%else +%rep %2/2 + mov%3 m1, [r0] + vpbroadcast%3 m1 {k1}, [r0+r1] + mov%3 m2, [r2] + vpbroadcast%3 m2 {k1}, [r2+r3] +%if %%i != %2/2-1 + lea r0, [r0+2*r1] + lea r2, [r2+2*r3] +%endif + psadbw m1, m2 + ACCUM paddd, 0, 1, %%i + %assign %%i %%i+1 +%endrep +%endif +%if %1 == 8 + punpckhqdq m1, m0, m0 + paddd m0, m1 +%endif + movd eax, m0 + RET +%endmacro + +INIT_XMM avx512 +SAD_W48_AVX512 4, 4, d +SAD_W48_AVX512 4, 8, d +SAD_W48_AVX512 4, 16, d +SAD_W48_AVX512 8, 4, q +SAD_W48_AVX512 8, 8, q +SAD_W48_AVX512 8, 16, q + +%macro SAD_W16_AVX512_START 1 ; h + cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which + jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory + lea r1, [3*r3] +%endmacro + +%macro SAD_W16_AVX512_END 0 + paddd m0, m1 + paddd m0, m2 + paddd m0, m3 +%if mmsize == 64 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 +%endif + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 + movd eax, xmm0 RET +%endmacro + +INIT_YMM avx512 +cglobal pixel_sad_16x8, 4,4 + SAD_W16_AVX512_START 8 + movu xm0, [r2] + vinserti128 m0, [r2+r3], 1 + psadbw m0, [r0+0*32] + movu xm1, [r2+2*r3] + vinserti128 m1, [r2+r1], 1 + lea r2, [r2+4*r3] + psadbw m1, [r0+1*32] + movu xm2, [r2] + vinserti128 m2, [r2+r3], 1 + psadbw m2, [r0+2*32] + movu xm3, [r2+2*r3] + vinserti128 m3, [r2+r1], 1 + psadbw m3, [r0+3*32] + SAD_W16_AVX512_END + +INIT_ZMM avx512 +cglobal pixel_sad_16x16, 4,4 + SAD_W16_AVX512_START 16 + movu xm0, [r2] + vinserti128 ym0, [r2+r3], 1 + movu xm1, [r2+4*r3] + vinserti32x4 m0, [r2+2*r3], 2 + vinserti32x4 m1, [r2+2*r1], 2 + vinserti32x4 m0, [r2+r1], 3 + lea r2, [r2+4*r3] + vinserti32x4 m1, [r2+r3], 1 + psadbw m0, [r0+0*64] + vinserti32x4 m1, [r2+r1], 3 + lea r2, [r2+4*r3] + psadbw m1, [r0+1*64] + movu xm2, [r2] + vinserti128 ym2, [r2+r3], 1 + movu xm3, [r2+4*r3] + vinserti32x4 m2, [r2+2*r3], 2 + vinserti32x4 m3, [r2+2*r1], 2 + vinserti32x4 m2, [r2+r1], 3 + lea r2, [r2+4*r3] + vinserti32x4 m3, [r2+r3], 1 + psadbw m2, [r0+2*64] + vinserti32x4 m3, 
[r2+r1], 3 + psadbw m3, [r0+3*64] + SAD_W16_AVX512_END ;----------------------------------------------------------------------------- ; void pixel_vsad( pixel *src, intptr_t stride ); @@ -1548,6 +1617,225 @@ SAD_X_AVX2 3, 16, 8, 7 SAD_X_AVX2 4, 16, 16, 8 SAD_X_AVX2 4, 16, 8, 8 +%macro SAD_X_W4_AVX512 2 ; x, h +cglobal pixel_sad_x%1_4x%2, %1+2,%1+3 + mov t1d, 0xa + kmovb k1, t1d + lea t1, [3*t0] + kaddb k2, k1, k1 + kshiftlb k3, k1, 2 +%assign %%i 0 +%rep %2/4 + movu m6, [r0+%%i*64] + vmovddup m6 {k1}, [r0+%%i*64+32] + movd xmm2, [r1] + movd xmm4, [r1+t0] + vpbroadcastd xmm2 {k1}, [r1+2*t0] + vpbroadcastd xmm4 {k1}, [r1+t1] + vpbroadcastd xmm2 {k2}, [r2+t0] + vpbroadcastd xmm4 {k2}, [r2] + vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3 + vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2 + vpmovqd s1, m6 ; s0 s2 s1 s3 + movd xmm3, [r3] + movd xmm5, [r3+t0] + vpbroadcastd xmm3 {k1}, [r3+2*t0] + vpbroadcastd xmm5 {k1}, [r3+t1] +%if %1 == 4 + vpbroadcastd xmm3 {k2}, [r4+t0] + vpbroadcastd xmm5 {k2}, [r4] + vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3 + vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2 +%endif +%if %%i != %2/4-1 +%assign %%j 1 +%rep %1 + lea r%+%%j, [r%+%%j+4*t0] + %assign %%j %%j+1 +%endrep +%endif + pshufd s2, s1, q1032 + psadbw xmm2, s1 + psadbw xmm4, s2 + psadbw xmm3, s1 + psadbw xmm5, s2 +%if %%i + paddd xmm0, xmm2 + paddd xmm1, xmm3 + paddd xmm0, xmm4 + paddd xmm1, xmm5 +%else + paddd xmm0, xmm2, xmm4 + paddd xmm1, xmm3, xmm5 +%endif + %assign %%i %%i+1 +%endrep +%if %1 == 4 + movifnidn t2, r6mp +%else + movifnidn t2, r5mp +%endif + packusdw xmm0, xmm1 + mova [t2], xmm0 + RET +%endmacro + +%macro SAD_X_W8_AVX512 2 ; x, h +cglobal pixel_sad_x%1_8x%2, %1+2,%1+3 + kxnorb k3, k3, k3 + lea t1, [3*t0] + kaddb k1, k3, k3 + kshiftlb k2, k3, 2 + kshiftlb k3, k3, 3 +%assign %%i 0 +%rep %2/4 + movddup m6, [r0+%%i*64] ; s0 s0 s1 s1 + movq xm2, [r1] + movq xm4, [r1+2*t0] + vpbroadcastq xm2 {k1}, [r2] + vpbroadcastq xm4 {k1}, [r2+2*t0] + vpbroadcastq m2 {k2}, [r1+t0] + vpbroadcastq m4 {k2}, [r1+t1] + vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1 + vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3 + movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3 + movq xm3, [r3] + movq xm5, [r3+2*t0] +%if %1 == 4 + vpbroadcastq xm3 {k1}, [r4] + vpbroadcastq xm5 {k1}, [r4+2*t0] +%endif + vpbroadcastq m3 {k2}, [r3+t0] + vpbroadcastq m5 {k2}, [r3+t1] +%if %1 == 4 + vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1 + vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3 +%endif +%if %%i != %2/4-1 +%assign %%j 1 +%rep %1 + lea r%+%%j, [r%+%%j+4*t0] + %assign %%j %%j+1 +%endrep +%endif + psadbw m2, m6 + psadbw m4, m7 + psadbw m3, m6 + psadbw m5, m7 + ACCUM paddd, 0, 2, %%i + ACCUM paddd, 1, 3, %%i + paddd m0, m4 + paddd m1, m5 + %assign %%i %%i+1 +%endrep +%if %1 == 4 + movifnidn t2, r6mp +%else + movifnidn t2, r5mp +%endif + packusdw m0, m1 + vextracti128 xm1, m0, 1 + paddd xm0, xm1 + mova [t2], xm0 + RET +%endmacro + +%macro SAD_X_W16_AVX512 2 ; x, h +cglobal pixel_sad_x%1_16x%2, %1+2,%1+3 + lea t1, [3*t0] +%assign %%i 0 +%rep %2/4 + mova m6, [r0+%%i*64] ; s0 s1 s2 s3 + movu xm2, [r3] + movu xm4, [r3+t0] +%if %1 == 4 + vinserti128 ym2, [r4+t0], 1 + vinserti128 ym4, [r4], 1 +%endif + vinserti32x4 m2, [r1+2*t0], 2 + vinserti32x4 m4, [r1+t1], 2 + vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3 + vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2 + vpermq m7, m6, q1032 ; s1 s0 s3 s2 + movu xm3, [r1] + movu xm5, [r1+t0] + vinserti128 ym3, [r2+t0], 1 + vinserti128 ym5, [r2], 1 + vinserti32x4 m3, [r3+2*t0], 2 + vinserti32x4 m5, [r3+t1], 2 +%if %1 == 4 + 
vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3 + vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2 +%endif +%if %%i != %2/4-1 +%assign %%j 1 +%rep %1 + lea r%+%%j, [r%+%%j+4*t0] + %assign %%j %%j+1 +%endrep +%endif + psadbw m2, m6 + psadbw m4, m7 + psadbw m3, m6 + psadbw m5, m7 + ACCUM paddd, 0, 2, %%i + ACCUM paddd, 1, 3, %%i + paddd m0, m4 + paddd m1, m5 + %assign %%i %%i+1 +%endrep +%if %1 == 4 + movifnidn t2, r6mp +%else + movifnidn t2, r5mp +%endif + mov t1d, 0x1111 + kmovw k1, t1d + vshufi32x4 m0, m0, q1032 + paddd m0, m1 + punpckhqdq m1, m0, m0 + paddd m0, m1 + vpcompressd m0 {k1}{z}, m0 + mova [t2], xm0 + RET +%endmacro + +; t0 = stride, t1 = tmp/stride3, t2 = scores +%if WIN64 + %define s1 xmm16 ; xmm6 and xmm7 reduces code size, but + %define s2 xmm17 ; they're callee-saved on win64 + DECLARE_REG_TMP 4, 6, 0 +%else + %define s1 xmm6 + %define s2 xmm7 +%if ARCH_X86_64 + DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64 +%else + DECLARE_REG_TMP 4, 5, 0 +%endif +%endif + +INIT_YMM avx512 +SAD_X_W4_AVX512 3, 4 ; x3_4x4 +SAD_X_W4_AVX512 3, 8 ; x3_4x8 +SAD_X_W8_AVX512 3, 4 ; x3_8x4 +SAD_X_W8_AVX512 3, 8 ; x3_8x8 +SAD_X_W8_AVX512 3, 16 ; x3_8x16 +INIT_ZMM avx512 +SAD_X_W16_AVX512 3, 8 ; x3_16x8 +SAD_X_W16_AVX512 3, 16 ; x3_16x16 + +DECLARE_REG_TMP 5, 6, 0 +INIT_YMM avx512 +SAD_X_W4_AVX512 4, 4 ; x4_4x4 +SAD_X_W4_AVX512 4, 8 ; x4_4x8 +SAD_X_W8_AVX512 4, 4 ; x4_8x4 +SAD_X_W8_AVX512 4, 8 ; x4_8x8 +SAD_X_W8_AVX512 4, 16 ; x4_8x16 +INIT_ZMM avx512 +SAD_X_W16_AVX512 4, 8 ; x4_16x8 +SAD_X_W16_AVX512 4, 16 ; x4_16x16 + ;============================================================================= ; SAD cacheline split ;============================================================================= diff --git a/library/src/main/cpp/libx264/common/x86/x86inc.asm b/library/src/main/cpp/libx264/common/x86/x86inc.asm index 03304c4..3be387d 100644 --- a/library/src/main/cpp/libx264/common/x86/x86inc.asm +++ b/library/src/main/cpp/libx264/common/x86/x86inc.asm @@ -323,6 +323,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 %endmacro %define required_stack_alignment ((mmsize + 15) & ~15) +%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512))) +%define high_mm_regs (16*cpuflag(avx512)) %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) %ifnum %1 @@ -414,10 +416,10 @@ DECLARE_REG 7, rdi, 64 DECLARE_REG 8, rsi, 72 DECLARE_REG 9, rbx, 80 DECLARE_REG 10, rbp, 88 -DECLARE_REG 11, R12, 96 -DECLARE_REG 12, R13, 104 -DECLARE_REG 13, R14, 112 -DECLARE_REG 14, R15, 120 +DECLARE_REG 11, R14, 96 +DECLARE_REG 12, R15, 104 +DECLARE_REG 13, R12, 112 +DECLARE_REG 14, R13, 120 %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... %assign num_args %1 @@ -436,15 +438,16 @@ DECLARE_REG 14, R15, 120 %macro WIN64_PUSH_XMM 0 ; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated. 
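+; Registers xmm16-31 are volatile in the Win64 ABI, so AVX-512 functions that
+; reach them (via AVX512_MM_PERMUTATION) never need to spill them; the
+; high_mm_regs term below simply discounts those 16 registers from the
+; callee-saved spill accounting.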
- %if xmm_regs_used > 6 + %if xmm_regs_used > 6 + high_mm_regs movaps [rstk + stack_offset + 8], xmm6 %endif - %if xmm_regs_used > 7 + %if xmm_regs_used > 7 + high_mm_regs movaps [rstk + stack_offset + 24], xmm7 %endif - %if xmm_regs_used > 8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 %assign %%i 8 - %rep xmm_regs_used-8 + %rep %%xmm_regs_on_stack movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i %assign %%i %%i+1 %endrep @@ -453,53 +456,56 @@ DECLARE_REG 14, R15, 120 %macro WIN64_SPILL_XMM 1 %assign xmm_regs_used %1 - ASSERT xmm_regs_used <= 16 - %if xmm_regs_used > 8 + ASSERT xmm_regs_used <= 16 + high_mm_regs + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 ; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack. - %assign %%pad (xmm_regs_used-8)*16 + 32 + %assign %%pad %%xmm_regs_on_stack*16 + 32 %assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1)) SUB rsp, stack_size_padded %endif WIN64_PUSH_XMM %endmacro -%macro WIN64_RESTORE_XMM_INTERNAL 1 +%macro WIN64_RESTORE_XMM_INTERNAL 0 %assign %%pad_size 0 - %if xmm_regs_used > 8 - %assign %%i xmm_regs_used - %rep xmm_regs_used-8 + %assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8 + %if %%xmm_regs_on_stack > 0 + %assign %%i xmm_regs_used - high_mm_regs + %rep %%xmm_regs_on_stack %assign %%i %%i-1 - movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32] + movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32] %endrep %endif %if stack_size_padded > 0 %if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT mov rsp, rstkm %else - add %1, stack_size_padded + add rsp, stack_size_padded %assign %%pad_size stack_size_padded %endif %endif - %if xmm_regs_used > 7 - movaps xmm7, [%1 + stack_offset - %%pad_size + 24] + %if xmm_regs_used > 7 + high_mm_regs + movaps xmm7, [rsp + stack_offset - %%pad_size + 24] %endif - %if xmm_regs_used > 6 - movaps xmm6, [%1 + stack_offset - %%pad_size + 8] + %if xmm_regs_used > 6 + high_mm_regs + movaps xmm6, [rsp + stack_offset - %%pad_size + 8] %endif %endmacro -%macro WIN64_RESTORE_XMM 1 - WIN64_RESTORE_XMM_INTERNAL %1 +%macro WIN64_RESTORE_XMM 0 + WIN64_RESTORE_XMM_INTERNAL %assign stack_offset (stack_offset-stack_size_padded) + %assign stack_size_padded 0 %assign xmm_regs_used 0 %endmacro -%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs %macro RET 0 - WIN64_RESTORE_XMM_INTERNAL rsp + WIN64_RESTORE_XMM_INTERNAL POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -518,14 +524,15 @@ DECLARE_REG 7, R10, 16 DECLARE_REG 8, R11, 24 DECLARE_REG 9, rbx, 32 DECLARE_REG 10, rbp, 40 -DECLARE_REG 11, R12, 48 -DECLARE_REG 12, R13, 56 -DECLARE_REG 13, R14, 64 -DECLARE_REG 14, R15, 72 +DECLARE_REG 11, R14, 48 +DECLARE_REG 12, R15, 56 +DECLARE_REG 13, R12, 64 +DECLARE_REG 14, R13, 72 -%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... +%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... 
%assign num_args %1 %assign regs_used %2 + %assign xmm_regs_used %3 ASSERT regs_used >= num_args SETUP_STACK_POINTER %4 ASSERT regs_used <= 15 @@ -535,7 +542,7 @@ DECLARE_REG 14, R15, 72 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -546,7 +553,7 @@ DECLARE_REG 14, R15, 72 %endif %endif POP_IF_USED 14, 13, 12, 11, 10, 9 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -591,7 +598,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 DEFINE_ARGS_INTERNAL %0, %4, %5 %endmacro -%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 +%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required %macro RET 0 %if stack_size_padded > 0 @@ -602,7 +609,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %endif %endif POP_IF_USED 6, 5, 4, 3 - %if mmsize == 32 + %if vzeroupper_required vzeroupper %endif AUTO_REP_RET @@ -613,7 +620,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 %if WIN64 == 0 %macro WIN64_SPILL_XMM 1 %endmacro - %macro WIN64_RESTORE_XMM 1 + %macro WIN64_RESTORE_XMM 0 %endmacro %macro WIN64_PUSH_XMM 0 %endmacro @@ -624,7 +631,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 ; We can automatically detect "follows a branch", but not a branch target. ; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.) %macro REP_RET 0 - %if has_epilogue + %if has_epilogue || cpuflag(ssse3) RET %else rep ret @@ -712,7 +719,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign stack_offset 0 ; stack pointer offset relative to the return address %assign stack_size 0 ; amount of stack space that can be freely used inside a function %assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding - %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 + %assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper %ifnidn %3, "" PROLOGUE %3 %endif @@ -775,24 +782,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign cpuflags_sse (1<<4) | cpuflags_mmx2 %assign cpuflags_sse2 (1<<5) | cpuflags_sse %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 -%assign cpuflags_sse3 (1<<7) | cpuflags_sse2 -%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 -%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 -%assign cpuflags_sse42 (1<<10)| cpuflags_sse4 -%assign cpuflags_avx (1<<11)| cpuflags_sse42 -%assign cpuflags_xop (1<<12)| cpuflags_avx -%assign cpuflags_fma4 (1<<13)| cpuflags_avx -%assign cpuflags_fma3 (1<<14)| cpuflags_avx -%assign cpuflags_avx2 (1<<15)| cpuflags_fma3 - -%assign cpuflags_cache32 (1<<16) -%assign cpuflags_cache64 (1<<17) -%assign cpuflags_slowctz (1<<18) -%assign cpuflags_lzcnt (1<<19) -%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant -%assign cpuflags_atom (1<<21) -%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt -%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1 +%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2 +%assign cpuflags_sse3 (1<<8) | cpuflags_sse2 +%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3 +%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3 +%assign cpuflags_sse42 (1<<11)| cpuflags_sse4 +%assign cpuflags_aesni (1<<12)| cpuflags_sse42 +%assign cpuflags_avx 
(1<<13)| cpuflags_sse42 +%assign cpuflags_xop (1<<14)| cpuflags_avx +%assign cpuflags_fma4 (1<<15)| cpuflags_avx +%assign cpuflags_fma3 (1<<16)| cpuflags_avx +%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt +%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1 +%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2 +%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL + +%assign cpuflags_cache32 (1<<21) +%assign cpuflags_cache64 (1<<22) +%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant +%assign cpuflags_atom (1<<24) ; Returns a boolean value expressing whether or not the specified cpuflag is enabled. %define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1) @@ -835,7 +843,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %if ARCH_X86_64 || cpuflag(sse2) %ifdef __NASM_VER__ - ALIGNMODE k8 + ALIGNMODE p6 %else CPU amdnop %endif @@ -848,11 +856,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %endif %endmacro -; Merge mmx and sse* +; Merge mmx, sse*, and avx* ; m# is a simd register of the currently selected size ; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m# ; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m# -; (All 3 remain in sync through SWAP.) +; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m# +; (All 4 remain in sync through SWAP.) %macro CAT_XDEFINE 3 %xdefine %1%2 %3 @@ -862,6 +871,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %undef %1%2 %endmacro +; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper +%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg + %if ARCH_X86_64 && cpuflag(avx512) + %assign %%i %1 + %rep 16-%1 + %assign %%i_high %%i+16 + SWAP %%i, %%i_high + %assign %%i %%i+1 + %endrep + %endif +%endmacro + %macro INIT_MMX 0-1+ %assign avx_enabled 0 %define RESET_MM_PERMUTATION INIT_MMX %1 @@ -877,7 +898,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, CAT_XDEFINE nnmm, %%i, %%i %assign %%i %%i+1 %endrep - %rep 8 + %rep 24 CAT_UNDEF m, %%i CAT_UNDEF nnmm, %%i %assign %%i %%i+1 @@ -891,7 +912,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %define mmsize 16 %define num_mmregs 8 %if ARCH_X86_64 - %define num_mmregs 16 + %define num_mmregs 32 %endif %define mova movdqa %define movu movdqu @@ -904,6 +925,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 + %if WIN64 + ; Swap callee-saved registers with volatile registers + AVX512_MM_PERMUTATION 6 + %endif %endmacro %macro INIT_YMM 0-1+ @@ -912,7 +937,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %define mmsize 32 %define num_mmregs 8 %if ARCH_X86_64 - %define num_mmregs 16 + %define num_mmregs 32 %endif %define mova movdqa %define movu movdqu @@ -925,6 +950,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae, %assign %%i %%i+1 %endrep INIT_CPUFLAGS %1 + AVX512_MM_PERMUTATION +%endmacro + +%macro INIT_ZMM 0-1+ + %assign avx_enabled 1 + %define RESET_MM_PERMUTATION INIT_ZMM %1 + %define mmsize 64 + %define num_mmregs 8 + %if ARCH_X86_64 + %define num_mmregs 32 + %endif + %define mova movdqa + %define movu movdqu + %undef movh + %define movnta movntdq + %assign %%i 0 + %rep num_mmregs + CAT_XDEFINE m, %%i, zmm %+ %%i 
+ CAT_XDEFINE nnzmm, %%i, %%i + %assign %%i %%i+1 + %endrep + INIT_CPUFLAGS %1 + AVX512_MM_PERMUTATION %endmacro INIT_XMM @@ -933,18 +981,26 @@ INIT_XMM %define mmmm%1 mm%1 %define mmxmm%1 mm%1 %define mmymm%1 mm%1 + %define mmzmm%1 mm%1 %define xmmmm%1 mm%1 %define xmmxmm%1 xmm%1 %define xmmymm%1 xmm%1 + %define xmmzmm%1 xmm%1 %define ymmmm%1 mm%1 %define ymmxmm%1 xmm%1 %define ymmymm%1 ymm%1 + %define ymmzmm%1 ymm%1 + %define zmmmm%1 mm%1 + %define zmmxmm%1 xmm%1 + %define zmmymm%1 ymm%1 + %define zmmzmm%1 zmm%1 %define xm%1 xmm %+ m%1 %define ym%1 ymm %+ m%1 + %define zm%1 zmm %+ m%1 %endmacro %assign i 0 -%rep 16 +%rep 32 DECLARE_MMCAST i %assign i i+1 %endrep @@ -1032,7 +1088,11 @@ INIT_XMM ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't %macro call 1 - call_internal %1 %+ SUFFIX, %1 + %ifid %1 + call_internal %1 %+ SUFFIX, %1 + %else + call %1 + %endif %endmacro %macro call_internal 2 %xdefine %%i %2 @@ -1075,12 +1135,17 @@ INIT_XMM ;============================================================================= %assign i 0 -%rep 16 +%rep 32 %if i < 8 CAT_XDEFINE sizeofmm, i, 8 + CAT_XDEFINE regnumofmm, i, i %endif CAT_XDEFINE sizeofxmm, i, 16 CAT_XDEFINE sizeofymm, i, 32 + CAT_XDEFINE sizeofzmm, i, 64 + CAT_XDEFINE regnumofxmm, i, i + CAT_XDEFINE regnumofymm, i, i + CAT_XDEFINE regnumofzmm, i, i %assign i i+1 %endrep %undef i @@ -1197,7 +1262,7 @@ INIT_XMM %endmacro %endmacro -; Instructions with both VEX and non-VEX encodings +; Instructions with both VEX/EVEX and legacy encodings ; Non-destructive instructions are written without parameters AVX_INSTR addpd, sse2, 1, 0, 1 AVX_INSTR addps, sse, 1, 0, 1 @@ -1529,15 +1594,48 @@ FMA4_INSTR fmsubadd, pd, ps FMA4_INSTR fnmadd, pd, ps, sd, ss FMA4_INSTR fnmsub, pd, ps, sd, ss -; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0) -%ifdef __YASM_VER__ - %if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0 - %macro vpbroadcastq 2 - %if sizeof%1 == 16 - movddup %1, %2 - %else - vbroadcastsd %1, %2 +; Macros for converting VEX instructions to equivalent EVEX ones. 
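+; For example, assuming INIT_ZMM avx512 is in effect, the wrappers below yield:
+;   vmovdqa xm1, [r0]        -> vmovdqa   xmm1, [r0]           (VEX, shorter encoding)
+;   vmovdqa m1, [r0]         -> vmovdqa32 zmm1, [r0]           (EVEX, zmm operand)
+;   vpxor   xm16, xm16, xm16 -> vpxord    xmm16, xmm16, xmm16  (EVEX, register >= 16)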
+%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex + %macro %1 2-7 fnord, fnord, %1, %2, %3 + %ifidn %3, fnord + %define %%args %1, %2 + %elifidn %4, fnord + %define %%args %1, %2, %3 + %else + %define %%args %1, %2, %3, %4 + %endif + %assign %%evex_required cpuflag(avx512) & %7 + %ifnum regnumof%1 + %if regnumof%1 >= 16 || sizeof%1 > 32 + %assign %%evex_required 1 %endif - %endmacro - %endif -%endif + %endif + %ifnum regnumof%2 + %if regnumof%2 >= 16 || sizeof%2 > 32 + %assign %%evex_required 1 + %endif + %endif + %if %%evex_required + %6 %%args + %else + %5 %%args ; Prefer VEX over EVEX due to shorter instruction length + %endif + %endmacro +%endmacro + +EVEX_INSTR vbroadcastf128, vbroadcastf32x4 +EVEX_INSTR vbroadcasti128, vbroadcasti32x4 +EVEX_INSTR vextractf128, vextractf32x4 +EVEX_INSTR vextracti128, vextracti32x4 +EVEX_INSTR vinsertf128, vinsertf32x4 +EVEX_INSTR vinserti128, vinserti32x4 +EVEX_INSTR vmovdqa, vmovdqa32 +EVEX_INSTR vmovdqu, vmovdqu32 +EVEX_INSTR vpand, vpandd +EVEX_INSTR vpandn, vpandnd +EVEX_INSTR vpor, vpord +EVEX_INSTR vpxor, vpxord +EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision +EVEX_INSTR vrcpss, vrcp14ss, 1 +EVEX_INSTR vrsqrtps, vrsqrt14ps, 1 +EVEX_INSTR vrsqrtss, vrsqrt14ss, 1 diff --git a/library/src/main/cpp/libx264/common/x86/x86util.asm b/library/src/main/cpp/libx264/common/x86/x86util.asm index ea40bc8..7a140eb 100644 --- a/library/src/main/cpp/libx264/common/x86/x86util.asm +++ b/library/src/main/cpp/libx264/common/x86/x86util.asm @@ -303,24 +303,24 @@ %endmacro %macro HADDD 2 ; sum junk -%if sizeof%1 == 32 -%define %2 xmm%2 - vextracti128 %2, %1, 1 -%define %1 xmm%1 - paddd %1, %2 +%if sizeof%1 >= 64 + vextracti32x8 ymm%2, zmm%1, 1 + paddd ymm%1, ymm%2 %endif -%if mmsize >= 16 - MOVHL %2, %1 - paddd %1, %2 +%if sizeof%1 >= 32 + vextracti128 xmm%2, ymm%1, 1 + paddd xmm%1, xmm%2 +%endif +%if sizeof%1 >= 16 + MOVHL xmm%2, xmm%1 + paddd xmm%1, xmm%2 %endif %if cpuflag(xop) && sizeof%1 == 16 - vphadddq %1, %1 + vphadddq xmm%1, xmm%1 %else - PSHUFLW %2, %1, q0032 - paddd %1, %2 + PSHUFLW xmm%2, xmm%1, q1032 + paddd xmm%1, xmm%2 %endif -%undef %1 -%undef %2 %endmacro %macro HADDW 2 ; reg, tmp diff --git a/library/src/main/cpp/libx264/encoder/analyse.c b/library/src/main/cpp/libx264/encoder/analyse.c index 1941bf2..036d6c1 100644 --- a/library/src/main/cpp/libx264/encoder/analyse.c +++ b/library/src/main/cpp/libx264/encoder/analyse.c @@ -34,37 +34,23 @@ typedef struct { - /* 16x16 */ - int i_rd16x16; x264_me_t me16x16; x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */ - - /* 8x8 */ - int i_cost8x8; - /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */ - ALIGNED_4( int16_t mvc[32][5][2] ); x264_me_t me8x8[4]; - - /* Sub 4x4 */ - int i_cost4x4[4]; /* cost per 8x8 partition */ x264_me_t me4x4[4][4]; - - /* Sub 8x4 */ - int i_cost8x4[4]; /* cost per 8x8 partition */ x264_me_t me8x4[4][2]; - - /* Sub 4x8 */ - int i_cost4x8[4]; /* cost per 8x8 partition */ x264_me_t me4x8[4][2]; - - /* 16x8 */ - int i_cost16x8; x264_me_t me16x8[2]; - - /* 8x16 */ - int i_cost8x16; x264_me_t me8x16[2]; - + int i_rd16x16; + int i_cost8x8; + int i_cost4x4[4]; /* cost per 8x8 partition */ + int i_cost8x4[4]; /* cost per 8x8 partition */ + int i_cost4x8[4]; /* cost per 8x8 partition */ + int i_cost16x8; + int i_cost8x16; + /* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */ + ALIGNED_4( int16_t mvc[32][5][2] ); } x264_mb_analysis_list_t; typedef struct @@ -278,29 +264,31 @@ static uint16_t 
x264_cost_i4x4_mode[(QP_MAX+2)*32]; static int init_costs( x264_t *h, float *logs, int qp ) { - int lambda = x264_lambda_tab[qp]; if( h->cost_mv[qp] ) return 0; + + int mv_range = h->param.analyse.i_mv_range; + int lambda = x264_lambda_tab[qp]; /* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */ - CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) ); - h->cost_mv[qp] += 2*4*2048; - for( int i = 0; i <= 2*4*2048; i++ ) + CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) ); + h->cost_mv[qp] += 2*4*mv_range; + for( int i = 0; i <= 2*4*mv_range; i++ ) { h->cost_mv[qp][-i] = - h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 ); + h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX ); } x264_pthread_mutex_lock( &cost_ref_mutex ); for( int i = 0; i < 3; i++ ) for( int j = 0; j < 33; j++ ) - x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 ); + x264_cost_ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0; x264_pthread_mutex_unlock( &cost_ref_mutex ); if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] ) { for( int j = 0; j < 4; j++ ) { - CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) ); - h->cost_mv_fpel[qp][j] += 2*2048; - for( int i = -2*2048; i < 2*2048; i++ ) + CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) ); + h->cost_mv_fpel[qp][j] += 2*mv_range; + for( int i = -2*mv_range; i < 2*mv_range; i++ ) h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j]; } } @@ -314,12 +302,13 @@ fail: int x264_analyse_init_costs( x264_t *h ) { - float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) ); + int mv_range = h->param.analyse.i_mv_range; + float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) ); if( !logs ) return -1; logs[0] = 0.718f; - for( int i = 1; i <= 2*4*2048; i++ ) + for( int i = 1; i <= 2*4*mv_range; i++ ) logs[i] = log2f( i+1 ) * 2.0f + 1.718f; for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ ) @@ -338,13 +327,14 @@ fail: void x264_analyse_free_costs( x264_t *h ) { + int mv_range = h->param.analyse.i_mv_range; for( int i = 0; i < QP_MAX+1; i++ ) { if( h->cost_mv[i] ) - x264_free( h->cost_mv[i] - 2*4*2048 ); + x264_free( h->cost_mv[i] - 2*4*mv_range ); if( h->cost_mv_fpel[i][0] ) for( int j = 0; j < 4; j++ ) - x264_free( h->cost_mv_fpel[i][j] - 2*2048 ); + x264_free( h->cost_mv_fpel[i][j] - 2*mv_range ); } } @@ -465,11 +455,10 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel /* Calculate max allowed MV range */ -#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 ) h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 ); h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 ); - h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] ); - h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] ); + h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range ); + h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 ); if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P ) { int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */ @@ -513,9 +502,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) mb_y = (h->mb.i_mb_y >> j) + (i == 1); h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 ); h->mb.mv_maxy_row[i] = 4*( 16*( 
(h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 ); - h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range ); - h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] ); - h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 ); + h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range ); + h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range ); h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border; h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border; } @@ -524,9 +512,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) { h->mb.mv_min[1] = 4*( -16*mb_y - 24 ); h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 ); - h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range ); - h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] ); - h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 ); + h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range ); + h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range ); h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border; h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border; } @@ -541,7 +528,6 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp ) h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i]; h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i]; } -#undef CLIP_FMV a->l0.me16x16.cost = a->l0.i_rd16x16 = @@ -713,8 +699,12 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd ) x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 ); if( !h->mb.i_psy_rd ) return; - /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. 
*/ - h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) ); + + M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO; + M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO; + M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO; + M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO; + h->mb.pic.fenc_hadamard_cache[8] = 0; if( b_satd ) h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) ); } @@ -743,8 +733,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] ); h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] ); } - a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) - + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); return; } @@ -759,8 +749,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv ); h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] ); h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] ); - satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ); - satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ); + satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ); + satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ); for( ; *predict_mode >= 0; predict_mode++ ) { @@ -788,8 +778,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) } /* we calculate the cost */ - i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) + - h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) + + i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) + + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) + a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] ); a->i_satd_chroma_dir[i_mode] = i_satd; @@ -845,7 +835,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ if( a->i_satd_i16x16 <= i16x16_thresh ) { h->predict_16x16[I_PRED_16x16_P]( p_dst ); - a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ); + a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ); a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3); COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 ); } @@ -862,7 +852,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ else h->predict_16x16[i_mode]( p_dst ); - i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) + + i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) + lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] ); 
COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode ); a->i_satd_i16x16_dir[i_mode] = i_satd; @@ -1065,7 +1055,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_ else h->predict_4x4[i_mode]( p_dst_by ); - i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE ); + i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE ); if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) ) { i_satd -= lambda * 3; @@ -1735,7 +1725,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size, int chroma ) { - ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_32( pixel, pix1,[16*16] ); pixel *pix2 = pix1+8; int i_stride = h->mb.pic.i_stride[1]; int chroma_h_shift = chroma <= CHROMA_422; @@ -1919,8 +1909,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8 static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel ) { - ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] ); - ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] ); + ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] ); + ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] ); int i_chroma_cost = 0; int chromapix = h->luma2chroma_pixel[i_pixel]; @@ -2013,8 +2003,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) { - ALIGNED_ARRAY_N( pixel, pix0,[16*16] ); - ALIGNED_ARRAY_N( pixel, pix1,[16*16] ); + ALIGNED_ARRAY_32( pixel, pix0,[16*16] ); + ALIGNED_ARRAY_32( pixel, pix1,[16*16] ); pixel *src0, *src1; intptr_t stride0 = 16, stride1 = 16; int i_ref, i_mvc; @@ -2147,7 +2137,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a ) } else { - ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] ); + ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int v_shift = CHROMA_V_SHIFT; @@ -2483,7 +2473,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a ) static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd ) { - ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] ); + ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] ); ALIGNED_4( int16_t mvc[3][2] ); h->mb.i_partition = D_16x8; diff --git a/library/src/main/cpp/libx264/encoder/cabac.c b/library/src/main/cpp/libx264/encoder/cabac.c index 27052cd..9debd1e 100644 --- a/library/src/main/cpp/libx264/encoder/cabac.c +++ b/library/src/main/cpp/libx264/encoder/cabac.c @@ -801,7 +801,7 @@ void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { -#if ARCH_X86_64 && HAVE_MMX +#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ ) h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_c( h, cb, ctx_block_cat, l ); @@ -915,7 +915,7 @@ void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_ static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { -#if ARCH_X86_64 && HAVE_MMX +#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ ) h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else 
x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l ); @@ -923,7 +923,7 @@ static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t } static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ) { -#if ARCH_X86_64 && HAVE_MMX +#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ ) h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb ); #else x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l ); @@ -1057,29 +1057,29 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_ src = dst; #define MUNGE_8x8_NNZ( MUNGE )\ -if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\ +if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] && !(h->mb.cbp[h->mb.i_mb_left_xy[0]] & 0x1000) )\ {\ - MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\ - MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\ - MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\ - MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\ - MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\ - MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\ + MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x00 )\ + MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x00 )\ + MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x00 )\ + MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x00 )\ + MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x00 )\ + MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x00 )\ }\ -if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\ +if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] && !(h->mb.cbp[h->mb.i_mb_left_xy[1]] & 0x1000) )\ {\ - MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\ - MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\ - MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\ - MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\ - MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\ - MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\ + MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x00 )\ + MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x00 )\ + MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x00 )\ + MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x00 )\ + MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x00 )\ + MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x00 )\ }\ -if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\ +if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] && !(h->mb.cbp[h->mb.i_mb_top_xy] & 0x1000) )\ {\ - MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\ - MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 
0x80808080U )\ - MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\ + MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x00000000U )\ + MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x00000000U )\ + MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x00000000U )\ } MUNGE_8x8_NNZ( BACKUP ) diff --git a/library/src/main/cpp/libx264/encoder/encoder.c b/library/src/main/cpp/libx264/encoder/encoder.c index 27db1bd..d183460 100644 --- a/library/src/main/cpp/libx264/encoder/encoder.c +++ b/library/src/main/cpp/libx264/encoder/encoder.c @@ -444,11 +444,6 @@ static int x264_validate_parameters( x264_t *h, int b_open ) fail = 1; } #endif - if( !fail && !(cpuflags & X264_CPU_CMOV) ) - { - x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n"); - fail = 1; - } if( fail ) { x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n"); @@ -494,7 +489,8 @@ static int x264_validate_parameters( x264_t *h, int b_open ) #endif if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX ) { - x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" ); + x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/YUYV/UYVY/" + "I444/YV24/BGR/BGRA/RGB supported)\n" ); return -1; } @@ -859,6 +855,11 @@ static int x264_validate_parameters( x264_t *h, int b_open ) h->param.analyse.inter &= ~X264_ANALYSE_I8x8; h->param.analyse.intra &= ~X264_ANALYSE_I8x8; } + if( i_csp >= X264_CSP_I444 && h->param.b_cabac ) + { + /* Disable 8x8dct during 4:4:4+CABAC encoding for compatibility with libavcodec */ + h->param.analyse.b_transform_8x8 = 0; + } if( h->param.rc.i_rc_method == X264_RC_CQP ) { float qp_p = h->param.rc.i_qp_constant; @@ -1170,7 +1171,7 @@ static int x264_validate_parameters( x264_t *h, int b_open ) if( h->param.analyse.i_mv_range <= 0 ) h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED; else - h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED); + h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 8192 >> PARAM_INTERLACED); } h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART ); @@ -1530,6 +1531,12 @@ x264_t *x264_encoder_open( x264_param_t *param ) x264_rdo_init(); /* init CPU functions */ +#if (ARCH_X86 || ARCH_X86_64) && HIGH_BIT_DEPTH + /* FIXME: Only 8-bit has been optimized for AVX-512 so far. The few AVX-512 functions + * enabled in high bit-depth are insignificant and just causes potential issues with + * unnecessary thermal throttling and whatnot, so keep it disabled for now. 
*/ + h->param.cpu &= ~X264_CPU_AVX512; +#endif x264_predict_16x16_init( h->param.cpu, h->predict_16x16 ); x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c ); x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c ); @@ -1566,9 +1573,15 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( !strcmp(x264_cpu_names[i].name, "SSE4.1") && (h->param.cpu & X264_CPU_SSE42) ) continue; + if( !strcmp(x264_cpu_names[i].name, "LZCNT") + && (h->param.cpu & X264_CPU_BMI1) ) + continue; if( !strcmp(x264_cpu_names[i].name, "BMI1") && (h->param.cpu & X264_CPU_BMI2) ) continue; + if( !strcmp(x264_cpu_names[i].name, "FMA4") + && (h->param.cpu & X264_CPU_FMA3) ) + continue; if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags && (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) ) p += sprintf( p, " %s", x264_cpu_names[i].name ); @@ -1580,14 +1593,6 @@ x264_t *x264_encoder_open( x264_param_t *param ) if( x264_analyse_init_costs( h ) ) goto fail; - static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 }; - /* Checks for known miscompilation issues. */ - if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] ) - { - x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" ); - goto fail; - } - /* Must be volatile or else GCC will optimize it out. */ volatile int temp = 392; if( x264_clz( temp ) != 23 ) diff --git a/library/src/main/cpp/libx264/encoder/macroblock.c b/library/src/main/cpp/libx264/encoder/macroblock.c index 87ba7f2..929fcc8 100644 --- a/library/src/main/cpp/libx264/encoder/macroblock.c +++ b/library/src/main/cpp/libx264/encoder/macroblock.c @@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp ) pixel *p_src = h->mb.pic.p_fenc[p]; pixel *p_dst = h->mb.pic.p_fdec[p]; - ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); - ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] ); int nz, block_cbp = 0; int decimate_score = h->mb.b_dct_decimate ? 0 : 9; @@ -283,13 +283,10 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction ) { int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6; - int ssd[2]; + ALIGNED_ARRAY_8( int, ssd,[2] ); int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8; - int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] ); - if( score < thresh*4 ) - score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] ); - if( score < thresh*4 ) + if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 ) { h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0; h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0; @@ -350,7 +347,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter int i_decimate_score = b_decimate ? 
0 : 7; int nz_ac = 0; - ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); if( h->mb.b_lossless ) { @@ -561,9 +558,16 @@ void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_m pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride; if( i_mode == I_PRED_4x4_V ) + { h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 ); + memcpy( p_dst, p_dst-FDEC_STRIDE, 4*sizeof(pixel) ); + } else if( i_mode == I_PRED_4x4_H ) + { h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 ); + for( int i = 0; i < 4; i++ ) + p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; + } else h->predict_4x4[i_mode]( p_dst ); } @@ -574,9 +578,16 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride; if( i_mode == I_PRED_8x8_V ) + { h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 ); + memcpy( p_dst, &edge[16], 8*sizeof(pixel) ); + } else if( i_mode == I_PRED_8x8_H ) + { h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 ); + for( int i = 0; i < 8; i++ ) + p_dst[i*FDEC_STRIDE] = edge[14-i]; + } else h->predict_8x8[i_mode]( p_dst, edge ); } @@ -584,12 +595,21 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode ) { int stride = h->fenc->i_stride[p] << MB_INTERLACED; + pixel *p_dst = h->mb.pic.p_fdec[p]; + if( i_mode == I_PRED_16x16_V ) - h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 ); + { + h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 ); + memcpy( p_dst, p_dst-FDEC_STRIDE, 16*sizeof(pixel) ); + } else if( i_mode == I_PRED_16x16_H ) - h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 ); + { + h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 ); + for( int i = 0; i < 16; i++ ) + p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1]; + } else - h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] ); + h->predict_16x16[i_mode]( p_dst ); } /***************************************************************************** @@ -780,7 +800,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ } else if( h->mb.b_transform_8x8 ) { - ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] ); + ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] ); b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) @@ -824,7 +844,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_ } else { - ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] ); for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp ) { int quant_cat = p ? 
CQM_4PC : CQM_4PY; @@ -965,8 +985,8 @@ void x264_macroblock_encode( x264_t *h ) *****************************************************************************/ static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma ) { - ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] ); - ALIGNED_ARRAY_16( dctcoef, dctscan,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] ); + ALIGNED_ARRAY_64( dctcoef, dctscan,[16] ); ALIGNED_4( int16_t mvp[2] ); int i_qp = h->mb.i_qp; @@ -1219,7 +1239,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i int quant_cat = p ? CQM_8PC : CQM_8PY; pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; - ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] ); h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec ); int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 ); @@ -1252,7 +1272,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE; int i_decimate_8x8 = b_decimate ? 0 : 4; - ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] ); int nnz8x8 = 0; h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec ); @@ -1311,7 +1331,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i i_qp = h->mb.i_chroma_qp; for( int ch = 0; ch < 2; ch++ ) { - ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] ); pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE; pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE; @@ -1376,7 +1396,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i } else { - ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 ); h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz; diff --git a/library/src/main/cpp/libx264/encoder/macroblock.h b/library/src/main/cpp/libx264/encoder/macroblock.h index 0bf2711..1c901a8 100644 --- a/library/src/main/cpp/libx264/encoder/macroblock.h +++ b/library/src/main/cpp/libx264/encoder/macroblock.h @@ -55,6 +55,9 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 ); void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp ); void x264_cabac_mb_skip( x264_t *h, int b_skip ); +void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); +void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); +void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx ); @@ -113,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_ int nz; pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]]; pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]]; - ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] ); if( b_predict ) { @@ -151,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_ int nz; pixel *p_src = 
&h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE]; pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE]; - ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] ); ALIGNED_ARRAY_32( pixel, edge_buf,[36] ); if( b_predict ) diff --git a/library/src/main/cpp/libx264/encoder/me.c b/library/src/main/cpp/libx264/encoder/me.c index 310bff7..094fc5d 100644 --- a/library/src/main/cpp/libx264/encoder/me.c +++ b/library/src/main/cpp/libx264/encoder/me.c @@ -191,7 +191,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int omx, omy, pmx, pmy; pixel *p_fenc = m->p_fenc[0]; pixel *p_fref_w = m->p_fref_w; - ALIGNED_ARRAY_N( pixel, pix,[16*16] ); + ALIGNED_ARRAY_32( pixel, pix,[16*16] ); ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] ); ALIGNED_ARRAY_16( int, costs,[16] ); @@ -875,7 +875,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite int chroma_v_shift = CHROMA_V_SHIFT; int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment + ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment ALIGNED_ARRAY_16( int, costs,[4] ); int bmx = m->mv[0]; @@ -1034,9 +1034,9 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m const int i_pixel = m0->i_pixel; const int bw = x264_pixel_size[i_pixel].w; const int bh = x264_pixel_size[i_pixel].h; - ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] ); - ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] ); - ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] ); pixel *src[3][2][9]; int chromapix = h->luma2chroma_pixel[i_pixel]; int chroma_v_shift = CHROMA_V_SHIFT; @@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m uint64_t bcostrd = COST_MAX64; uint16_t amvd; /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ - ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] ); + ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] ); /* all permutations of an offset in up to 2 of the dimensions */ ALIGNED_4( static const int8_t dia4d[33][4] ) = { diff --git a/library/src/main/cpp/libx264/encoder/me.h b/library/src/main/cpp/libx264/encoder/me.h index 305c42d..505e3ce 100644 --- a/library/src/main/cpp/libx264/encoder/me.h +++ b/library/src/main/cpp/libx264/encoder/me.h @@ -32,10 +32,10 @@ typedef struct { - /* aligning the first member is a gcc hack to force the struct to be - * 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */ + /* aligning the first member is a gcc hack to force the struct to be aligned, + * as well as force sizeof(struct) to be a multiple of the alignment. 
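The comment above describes forcing whole-struct alignment by aligning the first member. A self-contained illustration using standard C11 keywords instead of x264's compiler-specific ALIGNED_64 macro (demo_me_t is a hypothetical name, not the real x264_me_t):

    #include <assert.h>
    #include <stdalign.h>

    typedef struct
    {
        alignas(64) int i_pixel;   /* stand-in for ALIGNED_64( int i_pixel ) */
        int cost;
    } demo_me_t;

    /* Aligning the first member raises the struct's alignment, and the compiler
     * pads sizeof() up to a multiple of that alignment. */
    static_assert( alignof(demo_me_t) == 64, "struct inherits the member alignment" );
    static_assert( sizeof(demo_me_t) % 64 == 0, "sizeof is padded to a multiple of 64" );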
*/ /* input */ - ALIGNED_16( int i_pixel ); /* PIXEL_WxH */ + ALIGNED_64( int i_pixel ); /* PIXEL_WxH */ uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */ int i_ref_cost; int i_ref; @@ -53,7 +53,7 @@ typedef struct int cost_mv; /* lambda * nbits for the chosen mv */ int cost; /* satd + lambda * nbits */ ALIGNED_4( int16_t mv[2] ); -} ALIGNED_16( x264_me_t ); +} ALIGNED_64( x264_me_t ); void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh ); #define x264_me_search( h, m, mvc, i_mvc )\ @@ -66,8 +66,6 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight ); uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel ); -extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4]; - #define COPY1_IF_LT(x,y)\ if( (y) < (x) )\ (x) = (y); diff --git a/library/src/main/cpp/libx264/encoder/ratecontrol.c b/library/src/main/cpp/libx264/encoder/ratecontrol.c index dbccb27..5289316 100644 --- a/library/src/main/cpp/libx264/encoder/ratecontrol.c +++ b/library/src/main/cpp/libx264/encoder/ratecontrol.c @@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2 stride <<= b_field; if( b_chroma ) { - ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] ); + ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] ); int chromapix = h->luma2chroma_pixel[PIXEL_16x16]; int shift = 7 - CHROMA_V_SHIFT; @@ -420,7 +420,7 @@ static int x264_macroblock_tree_rescale_init( x264_t *h, x264_ratecontrol_t *rc float dstdim[2] = { h->param.i_width / 16.f, h->param.i_height / 16.f}; int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])}; int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])}; - if( PARAM_INTERLACED ) + if( h->param.b_interlaced || h->param.b_fake_interlaced ) { srcdimi[1] = (srcdimi[1]+1)&~1; dstdimi[1] = (dstdimi[1]+1)&~1; @@ -1469,7 +1469,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead ) if( h->i_frame == 0 ) { //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR - double fr = 1. / 172; + double fr = 1. / (h->param.i_level_idc >= 60 ? 
300 : 172); int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height; rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr; } diff --git a/library/src/main/cpp/libx264/encoder/ratecontrol.h b/library/src/main/cpp/libx264/encoder/ratecontrol.h index c5b95cb..168b515 100644 --- a/library/src/main/cpp/libx264/encoder/ratecontrol.h +++ b/library/src/main/cpp/libx264/encoder/ratecontrol.h @@ -58,8 +58,6 @@ int x264_ratecontrol_qp( x264_t * ); int x264_ratecontrol_mb_qp( x264_t *h ); int x264_ratecontrol_end( x264_t *, int bits, int *filler ); void x264_ratecontrol_summary( x264_t * ); -void x264_ratecontrol_set_estimated_size( x264_t *, int bits ); -int x264_ratecontrol_get_estimated_size( x264_t const *); int x264_rc_analyse_slice( x264_t *h ); void x264_threads_distribute_ratecontrol( x264_t *h ); void x264_threads_merge_ratecontrol( x264_t *h ); diff --git a/library/src/main/cpp/libx264/encoder/rdo.c b/library/src/main/cpp/libx264/encoder/rdo.c index cd76682..a6865bd 100644 --- a/library/src/main/cpp/libx264/encoder/rdo.c +++ b/library/src/main/cpp/libx264/encoder/rdo.c @@ -64,9 +64,8 @@ static uint16_t cabac_size_5ones[128]; #include "cabac.c" #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \ - sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) ) -#define COPY_CABAC_PART( pos, size )\ - memcpy( &cb->state[pos], &h->cabac.state[pos], size ) + sizeof(int) + (CHROMA444 ? 1024+12 : 460) ) +#define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size ) static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y ) { @@ -634,8 +633,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac, int b_chroma, int dc, int num_coefs, int idx ) { - ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] ); - ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] ); + ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] ); + ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] ); const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab; const uint32_t *coef_weight2 = num_coefs == 64 ? 
x264_dct8_weight2_tab : x264_dct4_weight2_tab; const int b_interlaced = MB_INTERLACED; @@ -695,7 +694,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct, return !!dct[0]; } -#if HAVE_MMX && ARCH_X86_64 +#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ ) #define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\ cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8) if( num_coefs == 16 && !dc ) diff --git a/library/src/main/cpp/libx264/encoder/set.c b/library/src/main/cpp/libx264/encoder/set.c index f86189f..2ab4e4e 100644 --- a/library/src/main/cpp/libx264/encoder/set.c +++ b/library/src/main/cpp/libx264/encoder/set.c @@ -783,23 +783,26 @@ int x264_sei_avcintra_vanc_write( x264_t *h, bs_t *s, int len ) const x264_level_t x264_levels[] = { - { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 }, - { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */ - { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 }, - { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 }, - { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 }, - { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 }, - { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, - { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, - { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 }, - { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 }, - { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 }, - { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 }, - { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 }, - { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 }, - { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 }, - { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, - { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, + { 10, 1485, 99, 396, 64, 175, 64, 64, 0, 2, 0, 0, 1 }, + { 9, 1485, 99, 396, 128, 350, 64, 64, 0, 2, 0, 0, 1 }, /* "1b" */ + { 11, 3000, 396, 900, 192, 500, 128, 64, 0, 2, 0, 0, 1 }, + { 12, 6000, 396, 2376, 384, 1000, 128, 64, 0, 2, 0, 0, 1 }, + { 13, 11880, 396, 2376, 768, 2000, 128, 64, 0, 2, 0, 0, 1 }, + { 20, 11880, 396, 2376, 2000, 2000, 128, 64, 0, 2, 0, 0, 1 }, + { 21, 19800, 792, 4752, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, + { 22, 20250, 1620, 8100, 4000, 4000, 256, 64, 0, 2, 0, 0, 0 }, + { 30, 40500, 1620, 8100, 10000, 10000, 256, 32, 22, 2, 0, 1, 0 }, + { 31, 108000, 3600, 18000, 14000, 14000, 512, 16, 60, 4, 1, 1, 0 }, + { 32, 216000, 5120, 20480, 20000, 20000, 512, 16, 60, 4, 1, 1, 0 }, + { 40, 245760, 8192, 32768, 20000, 25000, 512, 16, 60, 4, 1, 1, 0 }, + { 41, 245760, 8192, 32768, 50000, 62500, 512, 16, 24, 2, 1, 1, 0 }, + { 42, 522240, 8704, 34816, 50000, 62500, 512, 16, 24, 2, 1, 1, 1 }, + { 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 }, + { 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, + { 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 }, + { 60, 4177920, 139264, 696320, 240000, 240000, 8192, 16, 24, 2, 1, 1, 1 }, + { 61, 8355840, 139264, 696320, 480000, 480000, 8192, 16, 24, 2, 1, 1, 1 }, + { 62, 16711680, 139264, 696320, 800000, 800000, 8192, 16, 24, 2, 1, 1, 1 }, { 0 } }; diff --git a/library/src/main/cpp/libx264/encoder/slicetype.c b/library/src/main/cpp/libx264/encoder/slicetype.c index b20bbf3..6c0aaa8 100644 --- a/library/src/main/cpp/libx264/encoder/slicetype.c +++ 
b/library/src/main/cpp/libx264/encoder/slicetype.c @@ -267,7 +267,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t int i_lines = fenc->i_lines[p]; int i_width = fenc->i_width[p]; pixel *src = fenc->plane[p]; - ALIGNED_ARRAY_16( pixel, buf, [16*16] ); + ALIGNED_ARRAY_64( pixel, buf, [16*16] ); int pixoff = 0; if( w ) { @@ -544,17 +544,18 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a, if( p0 == p1 ) goto lowres_intra_mb; + int mv_range = 2 * h->param.analyse.i_mv_range; // no need for h->mb.mv_min[] - h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4; - h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4; - h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 ); - h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 ); + h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range ); + h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 ); + h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2; + h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2; if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 ) { - h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4; - h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4; - h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 ); - h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 ); + h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range ); + h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 ); + h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2; + h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2; } #define LOAD_HPELS_LUMA(dst, src) \ @@ -728,13 +729,13 @@ lowres_intra_mb: if( h->param.analyse.i_subpel_refine > 1 ) { h->predict_8x8c[I_PRED_CHROMA_P]( pix ); - int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ); + int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ); i_icost = X264_MIN( i_icost, satd ); h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS ); for( int i = 3; i < 9; i++ ) { h->predict_8x8[i]( pix, edge ); - satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE ); + satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ); i_icost = X264_MIN( i_icost, satd ); } } diff --git a/library/src/main/cpp/libx264/filters/video/resize.c b/library/src/main/cpp/libx264/filters/video/resize.c index 0bacb5b..0d6bd8c 100644 --- a/library/src/main/cpp/libx264/filters/video/resize.c +++ b/library/src/main/cpp/libx264/filters/video/resize.c @@ -154,10 +154,12 @@ static int convert_csp_to_pix_fmt( int csp ) case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48 : AV_PIX_FMT_RGB24; case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48 : AV_PIX_FMT_BGR24; case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA; - /* the next csp has no equivalent 16bit depth in swscale */ + /* the following has no equivalent 16-bit depth in swscale */ case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12; case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21; - /* the next csp is no supported by swscale at all */ + case X264_CSP_YUYV: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_YUYV422; + case X264_CSP_UYVY: return csp&X264_CSP_HIGH_DEPTH ? 
AV_PIX_FMT_NONE : AV_PIX_FMT_UYVY422; + /* the following is not supported by swscale at all */ case X264_CSP_NV16: default: return AV_PIX_FMT_NONE; } diff --git a/library/src/main/cpp/libx264/input/input.c b/library/src/main/cpp/libx264/input/input.c index db29a54..335f601 100644 --- a/library/src/main/cpp/libx264/input/input.c +++ b/library/src/main/cpp/libx264/input/input.c @@ -43,6 +43,8 @@ const x264_cli_csp_t x264_cli_csps[] = { [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 }, [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 }, + [X264_CSP_YUYV] = { "yuyv", 1, { 2 }, { 1 }, 2, 1 }, + [X264_CSP_UYVY] = { "uyvy", 1, { 2 }, { 1 }, 2, 1 }, [X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 }, [X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }, [X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 }, diff --git a/library/src/main/cpp/libx264/input/raw.c b/library/src/main/cpp/libx264/input/raw.c index f8c0406..1f2e73e 100644 --- a/library/src/main/cpp/libx264/input/raw.c +++ b/library/src/main/cpp/libx264/input/raw.c @@ -98,6 +98,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c uint64_t size = ftell( h->fh ); fseek( h->fh, 0, SEEK_SET ); info->num_frames = size / h->frame_size; + FAIL_IF_ERROR( !info->num_frames, "empty input file\n" ); /* Attempt to use memory-mapped input frames if possible */ if( !(h->bit_depth & 7) ) diff --git a/library/src/main/cpp/libx264/input/y4m.c b/library/src/main/cpp/libx264/input/y4m.c index 8948f68..09dbb13 100644 --- a/library/src/main/cpp/libx264/input/y4m.c +++ b/library/src/main/cpp/libx264/input/y4m.c @@ -223,6 +223,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c uint64_t i_size = ftell( h->fh ); fseek( h->fh, init_pos, SEEK_SET ); info->num_frames = (i_size - h->seq_header_len) / h->frame_size; + FAIL_IF_ERROR( !info->num_frames, "empty input file\n" ); /* Attempt to use memory-mapped input frames if possible */ if( !(h->bit_depth & 7) ) diff --git a/library/src/main/cpp/libx264/libs/armeabi-v7a/libx264.a b/library/src/main/cpp/libx264/libs/armeabi-v7a/libx264.a index 566f60d..4319a96 100644 Binary files a/library/src/main/cpp/libx264/libs/armeabi-v7a/libx264.a and b/library/src/main/cpp/libx264/libs/armeabi-v7a/libx264.a differ diff --git a/library/src/main/cpp/libx264/libs/x86/libx264.a b/library/src/main/cpp/libx264/libs/x86/libx264.a index 7b229c3..ac5cc84 100644 Binary files a/library/src/main/cpp/libx264/libs/x86/libx264.a and b/library/src/main/cpp/libx264/libs/x86/libx264.a differ diff --git a/library/src/main/cpp/libx264/tools/checkasm-a.asm b/library/src/main/cpp/libx264/tools/checkasm-a.asm index 34d5beb..a9f7493 100644 --- a/library/src/main/cpp/libx264/tools/checkasm-a.asm +++ b/library/src/main/cpp/libx264/tools/checkasm-a.asm @@ -153,7 +153,11 @@ cglobal checkasm_call, 2,15,16,max_args*8+8 mov r9, rax mov r10, rdx lea r0, [error_message] +%if FORMAT_ELF + call puts wrt ..plt +%else call puts +%endif mov r1, [rsp+max_args*8] mov dword [r1], 0 mov rdx, r10 @@ -221,3 +225,14 @@ cglobal stack_pagealign, 2,2 leave RET +; Trigger a warmup of vector units +%macro WARMUP 0 +cglobal checkasm_warmup, 0,0 + xorps m0, m0 + RET +%endmacro + +INIT_YMM avx +WARMUP +INIT_ZMM avx512 +WARMUP diff --git a/library/src/main/cpp/libx264/tools/checkasm-aarch64.S b/library/src/main/cpp/libx264/tools/checkasm-aarch64.S index 0341a5a..db4a33d 100644 --- a/library/src/main/cpp/libx264/tools/checkasm-aarch64.S +++ 
b/library/src/main/cpp/libx264/tools/checkasm-aarch64.S @@ -25,9 +25,7 @@ #include "../common/aarch64/asm.S" -.section .rodata -.align 4 -register_init: +const register_init, align=4 .quad 0x21f86d66c8ca00ce .quad 0x75b6ba21077c48ad .quad 0xed56bb2dcb3c7736 @@ -46,10 +44,12 @@ register_init: .quad 0xd229e1f5b281303f .quad 0x71aeaff20b095fd9 .quad 0xab63e2e11fa38ed9 +endconst -error_message: +const error_message .asciz "failed to preserve register" +endconst .text @@ -149,7 +149,7 @@ function x264_checkasm_call, export=1 mov w9, #0 str w9, [x2] movrel x0, error_message - bl puts + bl X(puts) 0: ldp x0, x1, [sp], #16 ldp d14, d15, [sp], #16 diff --git a/library/src/main/cpp/libx264/tools/checkasm-arm.S b/library/src/main/cpp/libx264/tools/checkasm-arm.S index 433ac53..4bd0ca0 100644 --- a/library/src/main/cpp/libx264/tools/checkasm-arm.S +++ b/library/src/main/cpp/libx264/tools/checkasm-arm.S @@ -25,9 +25,7 @@ #include "../common/arm/asm.S" -.section .rodata -.align 4 -register_init: +const register_init, align=4 .quad 0x21f86d66c8ca00ce .quad 0x75b6ba21077c48ad .quad 0xed56bb2dcb3c7736 @@ -36,9 +34,11 @@ register_init: .quad 0xdf9a54b303f1d3a3 .quad 0x4a75479abd64e097 .quad 0x249214109d5d1c88 +endconst -error_message: +const error_message .asciz "failed to preserve register" +endconst .text diff --git a/library/src/main/cpp/libx264/tools/checkasm.c b/library/src/main/cpp/libx264/tools/checkasm.c index 7427c33..e25a45c 100644 --- a/library/src/main/cpp/libx264/tools/checkasm.c +++ b/library/src/main/cpp/libx264/tools/checkasm.c @@ -28,6 +28,7 @@ #include #include "common/common.h" #include "common/cpu.h" +#include "encoder/macroblock.h" #ifdef _WIN32 #include @@ -56,8 +57,7 @@ int quiet = 0; if( !ok ) ret = -1; \ } -#define BENCH_RUNS 100 // tradeoff between accuracy and speed -#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff) +#define BENCH_RUNS 2000 // tradeoff between accuracy and speed #define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions #define MAX_CPUS 30 // number of different combinations of cpu flags @@ -99,7 +99,7 @@ static inline uint32_t read_time(void) : "=a"(a) :: "edx", "memory" ); #elif ARCH_PPC asm volatile( "mftb %0" : "=r"(a) :: "memory" ); -#elif ARCH_ARM // ARMv7 only +#elif HAVE_ARM_INLINE_ASM // ARMv7 only asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" ); #elif ARCH_AARCH64 uint64_t b = 0; @@ -177,7 +177,10 @@ static void print_bench(void) continue; printf( "%s_%s%s: %"PRId64"\n", benchs[i].name, #if HAVE_MMX + b->cpu&X264_CPU_AVX512 ? "avx512" : b->cpu&X264_CPU_AVX2 ? "avx2" : + b->cpu&X264_CPU_BMI2 ? "bmi2" : + b->cpu&X264_CPU_BMI1 ? "bmi1" : b->cpu&X264_CPU_FMA3 ? "fma3" : b->cpu&X264_CPU_FMA4 ? "fma4" : b->cpu&X264_CPU_XOP ? "xop" : @@ -186,6 +189,7 @@ static void print_bench(void) b->cpu&X264_CPU_SSE4 ? "sse4" : b->cpu&X264_CPU_SSSE3 ? "ssse3" : b->cpu&X264_CPU_SSE3 ? "sse3" : + b->cpu&X264_CPU_LZCNT ? "lzcnt" : /* print sse2slow only if there's also a sse2fast version of the same func */ b->cpu&X264_CPU_SSE2_IS_SLOW && jcpu&X264_CPU_SSE2 ? "sse2" : @@ -208,10 +212,7 @@ static void print_bench(void) b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" : b->cpu&X264_CPU_CACHELINE_64 ? "_c64" : b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" : - b->cpu&X264_CPU_LZCNT ? "_lzcnt" : - b->cpu&X264_CPU_BMI2 ? "_bmi2" : - b->cpu&X264_CPU_BMI1 ? "_bmi1" : - b->cpu&X264_CPU_SLOW_CTZ ? 
"_slow_ctz" : + b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" : b->cpu&X264_CPU_SLOW_ATOM ? "_atom" : #elif ARCH_ARM b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" : @@ -221,8 +222,18 @@ static void print_bench(void) } } +/* YMM and ZMM registers on x86 are turned off to save power when they haven't been + * used for some period of time. When they are used there will be a "warmup" period + * during which performance will be reduced and inconsistent which is problematic when + * trying to benchmark individual functions. We can work around this by periodically + * issuing "dummy" instructions that uses those registers to keep them powered on. */ +static void (*simd_warmup_func)( void ) = NULL; +#define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 ) + #if ARCH_X86 || ARCH_X86_64 int x264_stack_pagealign( int (*func)(), int align ); +void x264_checkasm_warmup_avx( void ); +void x264_checkasm_warmup_avx512( void ); /* detect when callee-saved regs aren't saved * needs an explicit asm check because it only sometimes crashes in normal use. */ @@ -257,6 +268,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... ); #define call_a1(func,...) ({ \ uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \ x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \ + simd_warmup(); \ x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); }) #elif ARCH_AARCH64 && !defined(__APPLE__) void x264_checkasm_stack_clobber( uint64_t clobber, ... ); @@ -284,6 +296,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... ); call_a1(func, __VA_ARGS__);\ for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\ {\ + simd_warmup();\ uint32_t t = read_time();\ func(__VA_ARGS__);\ func(__VA_ARGS__);\ @@ -357,8 +370,9 @@ static int check_pixel( int cpu_ref, int cpu_new ) used_asm = 1; \ for( int j = 0; j < 64; j++ ) \ { \ - res_c = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \ - res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \ + intptr_t stride1 = (j&31) == 31 ? 
32 : FENC_STRIDE; \ + res_c = call_c( pixel_c.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \ + res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \ if( res_c != res_asm ) \ { \ ok = 0; \ @@ -493,15 +507,17 @@ static int check_pixel( int cpu_ref, int cpu_new ) #define TEST_PIXEL_VAR2( i ) \ if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \ { \ - int res_c, res_asm, ssd_c, ssd_asm; \ + int res_c, res_asm; \ + ALIGNED_ARRAY_8( int, ssd_c, [2] ); \ + ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \ set_func_name( "%s_%s", "var2", pixel_names[i] ); \ used_asm = 1; \ - res_c = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \ - res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \ - if( res_c != res_asm || ssd_c != ssd_asm ) \ + res_c = call_c( pixel_c.var2[i], pbuf1, pbuf2, ssd_c ); \ + res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \ + if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \ { \ ok = 0; \ - fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \ + fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \ } \ } @@ -826,10 +842,10 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_dct_function_t dct_asm; x264_quant_function_t qf; int ret = 0, ok, used_asm, interlace = 0; - ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] ); - ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] ); - ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] ); - ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] ); + ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] ); + ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] ); ALIGNED_16( dctcoef dctdc[2][8] ); x264_t h_buf; x264_t *h = &h_buf; @@ -1031,8 +1047,8 @@ static int check_dct( int cpu_ref, int cpu_new ) x264_zigzag_function_t zigzag_ref[2]; x264_zigzag_function_t zigzag_asm[2]; - ALIGNED_ARRAY_16( dctcoef, level1,[64] ); - ALIGNED_ARRAY_16( dctcoef, level2,[64] ); + ALIGNED_ARRAY_64( dctcoef, level1,[64] ); + ALIGNED_ARRAY_64( dctcoef, level2,[64] ); #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \ if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \ @@ -1370,6 +1386,8 @@ static int check_mc( int cpu_ref, int cpu_new ) } report( "mc offsetsub :" ); + memset( pbuf3, 0, 64*16 ); + memset( pbuf4, 0, 64*16 ); ok = 1; used_asm = 0; for( int height = 8; height <= 16; height += 8 ) { @@ -1377,8 +1395,6 @@ static int check_mc( int cpu_ref, int cpu_new ) { set_func_name( "store_interleave_chroma" ); used_asm = 1; - memset( pbuf3, 0, 64*height ); - memset( pbuf4, 0, 64*height ); call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height ); call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height ); if( memcmp( pbuf3, pbuf4, 64*height ) ) @@ -1525,6 +1541,33 @@ static int check_mc( int cpu_ref, int cpu_new ) } } + if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv ) + { + set_func_name( "plane_copy_deinterleave_yuyv" ); + used_asm = 1; + for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ ) + { + int w = (plane_specs[i].w + 1) >> 1; + int h = plane_specs[i].h; + intptr_t dst_stride = ALIGN( w, 32/sizeof(pixel) ); + intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1; + intptr_t offv = dst_stride*h; + pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * 
(h-1); + memset( pbuf3, 0, 0x1000 ); + memset( pbuf4, 0, 0x1000 ); + /* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */ + call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h ); + call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h ); + for( int y = 0; y < h; y++ ) + if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) || + memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(pixel) ) ) + { + fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride ); + break; + } + } + } + if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb ) { set_func_name( "plane_copy_deinterleave_rgb" ); @@ -1565,7 +1608,7 @@ static int check_mc( int cpu_ref, int cpu_new ) { int w = (plane_specs[i].w + 1) >> 1; int h = plane_specs[i].h; - intptr_t dst_stride = ALIGN( w, 16 ); + intptr_t dst_stride = ALIGN( w, 32 ); intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t); intptr_t offv = dst_stride*h + 32; memset( pbuf3, 0, 0x1000 ); @@ -1703,7 +1746,7 @@ static int check_mc( int cpu_ref, int cpu_new ) { ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4; if( !ok ) - fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] ); + fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] ); } } } @@ -1722,15 +1765,16 @@ static int check_mc( int cpu_ref, int cpu_new ) h.mb.i_mb_width = width; h.mb.i_mb_height = height; - uint16_t *ref_costsc = (uint16_t*)buf3; - uint16_t *ref_costsa = (uint16_t*)buf4; - int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size); + uint16_t *ref_costsc = (uint16_t*)buf3 + width; + uint16_t *ref_costsa = (uint16_t*)buf4 + width; + int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size); int16_t *propagate_amount = (int16_t*)(mvs + width); uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width); - h.scratch_buffer2 = (uint8_t*)(ref_costsa + size); + h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size); int bipred_weight = (rand()%63)+1; + int mb_y = rand()&3; int list = i&1; - for( int j = 0; j < size; j++ ) + for( int j = -width; j < size+width; j++ ) ref_costsc[j] = ref_costsa[j] = rand()&32767; for( int j = 0; j < width; j++ ) { @@ -1741,18 +1785,18 @@ static int check_mc( int cpu_ref, int cpu_new ) lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT; } - call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); - call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); + call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); + call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); - for( int j = 0; j < size && ok; j++ ) + for( int j = -width; j < size+width && ok; j++ ) { ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1; if( !ok ) fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] ); } - call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list ); - call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, 
bipred_weight, 0, width, list ); + call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); + call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list ); } } @@ -1815,12 +1859,14 @@ static int check_mc( int cpu_ref, int cpu_new ) { set_func_name( "memcpy_aligned" ); ok = 1; used_asm = 1; - for( size_t size = 16; size < 256; size += 16 ) + for( size_t size = 16; size < 512; size += 16 ) { - memset( buf4, 0xAA, size + 1 ); + for( int i = 0; i < size; i++ ) + buf1[i] = rand(); + memset( buf4-1, 0xAA, size + 2 ); call_c( mc_c.memcpy_aligned, buf3, buf1, size ); call_a( mc_a.memcpy_aligned, buf4, buf1, size ); - if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA ) + if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA ) { ok = 0; fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size ); @@ -1836,10 +1882,10 @@ static int check_mc( int cpu_ref, int cpu_new ) ok = 1; used_asm = 1; for( size_t size = 128; size < 1024; size += 128 ) { - memset( buf4, 0xAA, size + 1 ); + memset( buf4-1, 0xAA, size + 2 ); call_c( mc_c.memzero_aligned, buf3, size ); call_a( mc_a.memzero_aligned, buf4, size ); - if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA ) + if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA ) { ok = 0; fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size ); @@ -1919,12 +1965,15 @@ static int check_deblock( int cpu_ref, int cpu_new ) if( db_a.deblock_strength != db_ref.deblock_strength ) { + set_func_name( "deblock_strength" ); + used_asm = 1; for( int i = 0; i < 100; i++ ) { - ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] ); + ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] ); + uint8_t *nnz = &nnz_buf[8]; ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] ); ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] ); - ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] ); + ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] ); memset( bs, 99, sizeof(uint8_t)*2*4*8*2 ); for( int j = 0; j < X264_SCAN8_SIZE; j++ ) nnz[j] = ((rand()&7) == 7) * rand() & 0xf; @@ -1933,9 +1982,8 @@ static int check_deblock( int cpu_ref, int cpu_new ) { ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2; for( int l = 0; l < 2; l++ ) - mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512; + mv[j][k][l] = ((rand()&7) != 7) ? 
(rand()&7) - 3 : (rand()&16383) - 8192; } - set_func_name( "deblock_strength" ); call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) ); call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) ); if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) ) @@ -1968,11 +2016,11 @@ static int check_quant( int cpu_ref, int cpu_new ) x264_quant_function_t qf_c; x264_quant_function_t qf_ref; x264_quant_function_t qf_a; - ALIGNED_ARRAY_N( dctcoef, dct1,[64] ); - ALIGNED_ARRAY_N( dctcoef, dct2,[64] ); - ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] ); - ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] ); - ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct1,[64] ); + ALIGNED_ARRAY_64( dctcoef, dct2,[64] ); + ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] ); + ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] ); + ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] ); int ret = 0, ok, used_asm; int oks[3] = {1,1,1}, used_asms[3] = {0,0,0}; x264_t h_buf; @@ -2213,7 +2261,7 @@ static int check_quant( int cpu_ref, int cpu_new ) int max = X264_MIN( i, PIXEL_MAX*16 ); \ for( int j = 0; j < size; j++ ) \ dct1[j] = rand()%(max*2+1) - max; \ - for( int j = 0; i <= size; j += 4 ) \ + for( int j = 0; j <= size; j += 4 ) \ qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \ memcpy( dct2, dct1, size*sizeof(dctcoef) ); \ res_c = call_c1( qf_c.optname, dct1, dmf ); \ @@ -2560,9 +2608,6 @@ DECL_CABAC(asm) #endif extern const uint8_t x264_count_cat_m1[14]; -void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); -void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); -void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l ); static int check_cabac( int cpu_ref, int cpu_new ) { @@ -2577,6 +2622,11 @@ static int check_cabac( int cpu_ref, int cpu_new ) x264_quant_init( &h, cpu_new, &h.quantf ); h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4; +/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. 
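The memcpy_aligned and memzero_aligned checks above now poison one byte on each side of the destination instead of only the byte past the end. A stand-alone sketch of that guard-byte idea, with a hypothetical function standing in for the routine under test:

    #include <assert.h>
    #include <string.h>

    static void copy_under_test( void *dst, const void *src, size_t n )
    {
        memcpy( dst, src, n );   /* stand-in for the asm routine being checked */
    }

    int main( void )
    {
        unsigned char src[64], dst[1 + 64 + 1];
        memset( src, 0x55, sizeof(src) );
        memset( dst, 0xAA, sizeof(dst) );            /* guard bytes at dst[0] and dst[65] */
        copy_under_test( dst + 1, src, 64 );
        assert( !memcmp( dst + 1, src, 64 ) );       /* payload copied correctly */
        assert( dst[0] == 0xAA && dst[65] == 0xAA ); /* neither guard byte was touched */
        return 0;
    }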
*/ +#define GET_CB( i ) (\ + x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\ + cb[i].f8_bits_encoded = 0, &cb[i] ) + #define CABAC_RESIDUAL(name, start, end, rd)\ {\ if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\ @@ -2589,7 +2639,7 @@ static int check_cabac( int cpu_ref, int cpu_new ) {\ for( int j = 0; j < 256; j++ )\ {\ - ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\ + ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\ uint8_t bitstream[2][1<<16];\ static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\ int ac = ctx_ac[ctx_block_cat];\ @@ -2612,13 +2662,9 @@ static int check_cabac( int cpu_ref, int cpu_new ) x264_cabac_t cb[2];\ x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\ x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\ - x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\ - x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\ - cb[0].f8_bits_encoded = 0;\ - cb[1].f8_bits_encoded = 0;\ if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\ - call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ - call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\ + call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\ ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\ if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\ if( !ok )\ @@ -2631,8 +2677,8 @@ static int check_cabac( int cpu_ref, int cpu_new ) }\ if( (j&15) == 0 )\ {\ - call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\ - call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\ + call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\ + call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\ }\ }\ }\ @@ -2759,6 +2805,14 @@ static int check_all_flags( void ) int ret = 0; int cpu0 = 0, cpu1 = 0; uint32_t cpu_detect = x264_cpu_detect(); +#if ARCH_X86 || ARCH_X86_64 + if( cpu_detect & X264_CPU_AVX512 ) + simd_warmup_func = x264_checkasm_warmup_avx512; + else if( cpu_detect & X264_CPU_AVX ) + simd_warmup_func = x264_checkasm_warmup_avx; +#endif + simd_warmup(); + #if HAVE_MMX if( cpu_detect & X264_CPU_MMX2 ) { @@ -2769,13 +2823,6 @@ static int check_all_flags( void ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" ); cpu1 &= ~X264_CPU_CACHELINE_32; #endif - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" ); - cpu1 &= ~X264_CPU_SLOW_CTZ; } if( cpu_detect & X264_CPU_SSE ) ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" ); @@ -2787,13 +2834,11 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" ); cpu1 &= ~X264_CPU_SLOW_SHUFFLE; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" ); - cpu1 &= ~X264_CPU_SLOW_CTZ; - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } + } + if( cpu_detect & X264_CPU_LZCNT ) + { + ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" ); + cpu1 &= ~X264_CPU_LZCNT; } if( cpu_detect & X264_CPU_SSE3 ) { 
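The simd_warmup hook added above is a nullable function pointer, chosen once from the detected CPU flags and invoked before every timed call so the YMM/ZMM units are already powered up when measurement starts. A stand-alone sketch of the pattern with illustrative names (the real warmup bodies are the tiny AVX/AVX-512 stubs added to checkasm-a.asm in this patch):

    #include <stdint.h>

    static void warmup_avx( void )    { /* would touch a YMM register */ }
    static void warmup_avx512( void ) { /* would touch a ZMM register */ }

    static void (*warmup_fn)( void );   /* stays NULL when no warmup is needed */
    #define WARMUP() do { if( warmup_fn ) warmup_fn(); } while( 0 )

    static void select_warmup( uint32_t cpu, uint32_t flag_avx, uint32_t flag_avx512 )
    {
        if( cpu & flag_avx512 )
            warmup_fn = warmup_avx512;
        else if( cpu & flag_avx )
            warmup_fn = warmup_avx;
    }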
@@ -2807,8 +2852,6 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_CACHELINE_64; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" ); cpu1 &= ~X264_CPU_SLOW_SHUFFLE; - ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" ); - cpu1 &= ~X264_CPU_SLOW_CTZ; ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" ); ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" ); cpu1 &= ~X264_CPU_CACHELINE_64; @@ -2833,29 +2876,15 @@ static int check_all_flags( void ) cpu1 &= ~X264_CPU_FMA4; } if( cpu_detect & X264_CPU_FMA3 ) - { ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" ); - cpu1 &= ~X264_CPU_FMA3; - } - if( cpu_detect & X264_CPU_AVX2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" ); - if( cpu_detect & X264_CPU_LZCNT ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" ); - cpu1 &= ~X264_CPU_LZCNT; - } - } if( cpu_detect & X264_CPU_BMI1 ) - { ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" ); - cpu1 &= ~X264_CPU_BMI1; - } if( cpu_detect & X264_CPU_BMI2 ) - { - ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" ); - cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2); - } + ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" ); + if( cpu_detect & X264_CPU_AVX2 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" ); + if( cpu_detect & X264_CPU_AVX512 ) + ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" ); #elif ARCH_PPC if( cpu_detect & X264_CPU_ALTIVEC ) { @@ -2885,8 +2914,6 @@ static int check_all_flags( void ) int main(int argc, char *argv[]) { - int ret = 0; - #ifdef _WIN32 /* Disable the Windows Error Reporting dialog */ SetErrorMode( SEM_NOGPFAULTERRORBOX ); @@ -2912,8 +2939,8 @@ int main(int argc, char *argv[]) fprintf( stderr, "x264: using random seed %u\n", seed ); srand( seed ); - buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS ); - pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS ); + buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) ); + pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) ); if( !buf1 || !pbuf1 ) { fprintf( stderr, "malloc failed, unable to initiate tests!\n" ); @@ -2934,21 +2961,7 @@ int main(int argc, char *argv[]) } memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) ); - /* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */ - if( do_bench ) - for( int i = 0; i < BENCH_ALIGNS && !ret; i++ ) - { - INIT_POINTER_OFFSETS; - ret |= x264_stack_pagealign( check_all_flags, i*32 ); - buf1 += 32; - pbuf1 += 32; - quiet = 1; - fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS ); - } - else - ret = x264_stack_pagealign( check_all_flags, 0 ); - - if( ret ) + if( x264_stack_pagealign( check_all_flags, 0 ) ) { fprintf( stderr, "x264: at least one test has failed. 
Go and fix that Right Now!\n" ); return -1; diff --git a/library/src/main/cpp/libx264/tools/gas-preprocessor.pl b/library/src/main/cpp/libx264/tools/gas-preprocessor.pl index cb5e3c5..afdfc9e 100755 --- a/library/src/main/cpp/libx264/tools/gas-preprocessor.pl +++ b/library/src/main/cpp/libx264/tools/gas-preprocessor.pl @@ -63,7 +63,7 @@ while (@ARGV) { $force_thumb = 1; } elsif ($opt eq "-arch") { $arch = shift; - die "unknown arch: '$arch'\n" if not exists $comments{$arch}; + die "unknown arch: '$arch'\n" if not exists $canonical_arch{$arch}; } elsif ($opt eq "-as-type") { $as_type = shift; die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang)|armasm)$/; @@ -429,7 +429,7 @@ sub parse_line { sub handle_set { my $line = $_[0]; - if ($line =~ /\.set\s+(.*),\s*(.*)/) { + if ($line =~ /\.(?:set|equ)\s+(\S*)\s*,\s*(.*)/) { $symbols{$1} = eval_expr($2); return 1; } @@ -874,7 +874,7 @@ sub handle_serialized_line { # Don't interpret e.g. bic as b with ic as conditional code if ($cond !~ /|$arm_cond_codes/) { # Not actually a branch - } elsif ($target =~ /(\d+)([bf])/) { + } elsif ($target =~ /^(\d+)([bf])$/) { # The target is a local label $line = handle_local_label($line, $1, $2); $line =~ s/\b$instr\b/$&.w/ if $width eq ""; @@ -888,12 +888,12 @@ sub handle_serialized_line { } # ALIGN in armasm syntax is the actual number of bytes - if ($line =~ /\.align\s+(\d+)/) { + if ($line =~ /\.(?:p2)?align\s+(\d+)/) { my $align = 1 << $1; - $line =~ s/\.align\s(\d+)/ALIGN $align/; + $line =~ s/\.(?:p2)?align\s(\d+)/ALIGN $align/; } # Convert gas style [r0, :128] into armasm [r0@128] alignment specification - $line =~ s/\[([^\[]+),\s*:(\d+)\]/[$1\@$2]/g; + $line =~ s/\[([^\[,]+),?\s*:(\d+)\]/[$1\@$2]/g; # armasm treats logical values {TRUE} and {FALSE} separately from # numeric values - logical operators and values can't be intermixed @@ -930,7 +930,7 @@ sub handle_serialized_line { # Misc bugs/deficiencies: # armasm seems unable to parse e.g. "vmov s0, s1" without a type # qualifier, thus add .f32. - $line =~ s/^(\s+(?:vmov|vadd))(\s+s)/$1.f32$2/; + $line =~ s/^(\s+(?:vmov|vadd))(\s+s\d+\s*,\s*s\d+)/$1.f32$2/; # armasm is unable to parse &0x - add spacing $line =~ s/&0x/& 0x/g; } @@ -939,16 +939,31 @@ sub handle_serialized_line { # Convert register post indexing to a separate add instruction. # This converts e.g. "ldr r0, [r1], r2" into "ldr r0, [r1]", # "add r1, r1, r2". - $line =~ s/(ldr|str)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g; + $line =~ s/((?:ldr|str)[bh]?)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g; # Convert "mov pc, lr" into "bx lr", since the former only works # for switching from arm to thumb (and only in armv7), but not # from thumb to arm. s/mov\s*pc\s*,\s*lr/bx lr/g; - # Convert stmdb/ldmia with only one register into a plain str/ldr with post-increment/decrement - $line =~ s/stmdb\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g; - $line =~ s/ldmia\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g; + # Convert stmdb/ldmia/stmfd/ldmfd/ldm with only one register into a plain str/ldr with post-increment/decrement. + # Wide thumb2 encoding requires at least two registers in register list while all other encodings support one register too. 
+ $line =~ s/stm(?:db|fd)\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g; + $line =~ s/ldm(?:ia|fd)?\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g; + + # Convert muls into mul+cmp + $line =~ s/muls\s+(\w+),\s*(\w+)\,\s*(\w+)/mul $1, $2, $3\n\tcmp $1, #0/g; + + # Convert "and r0, sp, #xx" into "mov r0, sp", "and r0, r0, #xx" + $line =~ s/and\s+(\w+),\s*(sp|r13)\,\s*#(\w+)/mov $1, $2\n\tand $1, $1, #$3/g; + + # Convert "ldr r0, [r0, r1, lsl #6]" where the shift is >3 (which + # can't be handled in thumb) into "add r0, r0, r1, lsl #6", + # "ldr r0, [r0]", for the special case where the same address is + # used as base and target for the ldr. + if ($line =~ /(ldr[bh]?)\s+(\w+),\s*\[\2,\s*(\w+),\s*lsl\s*#(\w+)\]/ and $4 > 3) { + $line =~ s/(ldr[bh]?)\s+(\w+),\s*\[\2,\s*(\w+),\s*lsl\s*#(\w+)\]/add $2, $2, $3, lsl #$4\n\t$1 $2, [$2]/; + } $line =~ s/\.arm/.thumb/x; } @@ -978,6 +993,9 @@ sub handle_serialized_line { $line =~ s/\.int/.long/x; $line =~ s/\.float/.single/x; } + if ($as_type eq "apple-gas") { + $line =~ s/vmrs\s+APSR_nzcv/fmrx r15/x; + } if ($as_type eq "armasm") { $line =~ s/\.global/EXPORT/x; $line =~ s/\.int/dcd/x; @@ -986,11 +1004,15 @@ sub handle_serialized_line { $line =~ s/\.word/dcd/x; $line =~ s/\.short/dcw/x; $line =~ s/\.byte/dcb/x; + $line =~ s/\.quad/dcq/x; + $line =~ s/\.ascii/dcb/x; + $line =~ s/\.asciz(.*)$/dcb\1,0/x; $line =~ s/\.thumb/THUMB/x; $line =~ s/\.arm/ARM/x; # The alignment in AREA is the power of two, just as .align in gas - $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=2, CODEALIGN/; + $line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=4, CODEALIGN/; $line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, ALIGN=5/; + $line =~ s/\.data/AREA |.data|, DATA, ALIGN=5/; $line =~ s/fmxr/vmsr/; $line =~ s/fmrx/vmrs/; diff --git a/library/src/main/cpp/libx264/tools/msvsdepend.sh b/library/src/main/cpp/libx264/tools/msvsdepend.sh index 568f611..5d267ab 100755 --- a/library/src/main/cpp/libx264/tools/msvsdepend.sh +++ b/library/src/main/cpp/libx264/tools/msvsdepend.sh @@ -23,6 +23,12 @@ if command -v cygpath >/dev/null 2>&1 ; then IFS=' ' deps="$(cygpath -u -- $deps)" +elif grep -q 'Microsoft' /proc/sys/kernel/osrelease 2>/dev/null ; then + # Running under WSL. We don't have access to cygpath but since the Windows + # file system resides under "/mnt//" we can simply replace + # "C:" with "/mnt/c". This command uses a GNU extension to sed but that's + # available on WSL so we don't need to limit ourselves by what POSIX says. 
+ deps="$(printf '%s' "$deps" | sed 's/^\([a-zA-Z]\):/\/mnt\/\L\1/')" fi # Escape characters as required to create valid Makefile file names diff --git a/library/src/main/cpp/libx264/x264.c b/library/src/main/cpp/libx264/x264.c index 41f94e4..93f2d58 100644 --- a/library/src/main/cpp/libx264/x264.c +++ b/library/src/main/cpp/libx264/x264.c @@ -420,47 +420,47 @@ static char *stringify_names( char *buf, const char * const names[] ) return buf; } +#define INDENT " " +#define INDENT_LEN 32 // strlen( INDENT ) +#define SEPARATOR ", " +#define SEPARATOR_LEN 2 // strlen( SEPARATOR ) + +static void print_csp_name_internal( const char *name, size_t *line_len, int last ) +{ + if( name ) + { + size_t name_len = strlen( name ); + if( *line_len + name_len > (80 - SEPARATOR_LEN) ) + { + printf( "\n" INDENT ); + *line_len = INDENT_LEN; + } + printf( "%s", name ); + *line_len += name_len; + if( !last ) + { + printf( SEPARATOR ); + *line_len += SEPARATOR_LEN; + } + } +} + static void print_csp_names( int longhelp ) { if( longhelp < 2 ) return; -# define INDENT " " printf( " - valid csps for `raw' demuxer:\n" ); printf( INDENT ); + size_t line_len = INDENT_LEN; for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ ) - { - if( x264_cli_csps[i].name ) - { - printf( "%s", x264_cli_csps[i].name ); - if( i+1 < X264_CSP_CLI_MAX ) - printf( ", " ); - } - } + print_csp_name_internal( x264_cli_csps[i].name, &line_len, i == X264_CSP_CLI_MAX-1 ); #if HAVE_LAVF printf( "\n" ); printf( " - valid csps for `lavf' demuxer:\n" ); printf( INDENT ); - size_t line_len = strlen( INDENT ); + line_len = INDENT_LEN; for( enum AVPixelFormat i = AV_PIX_FMT_NONE+1; i < AV_PIX_FMT_NB; i++ ) - { - const char *pfname = av_get_pix_fmt_name( i ); - if( pfname ) - { - size_t name_len = strlen( pfname ); - if( line_len + name_len > (80 - strlen( ", " )) ) - { - printf( "\n" INDENT ); - line_len = strlen( INDENT ); - } - printf( "%s", pfname ); - line_len += name_len; - if( i+1 < AV_PIX_FMT_NB ) - { - printf( ", " ); - line_len += 2; - } - } - } + print_csp_name_internal( av_get_pix_fmt_name( i ), &line_len, i == AV_PIX_FMT_NB-1 ); #endif printf( "\n" ); } @@ -636,7 +636,7 @@ static void help( x264_param_t *defaults, int longhelp ) " - grain (psy tuning):\n" " --aq-strength 0.5 --no-dct-decimate\n" " --deadzone-inter 6 --deadzone-intra 6\n" - " --deblock -2:-2 --ipratio 1.1 \n" + " --deblock -2:-2 --ipratio 1.1\n" " --pbratio 1.1 --psy-rd :0.25\n" " --qcomp 0.8\n" " - stillimage (psy tuning):\n" diff --git a/library/src/main/cpp/libx264/x264.h b/library/src/main/cpp/libx264/x264.h index 18f5796..0f34067 100644 --- a/library/src/main/cpp/libx264/x264.h +++ b/library/src/main/cpp/libx264/x264.h @@ -45,7 +45,7 @@ extern "C" { #include "x264_config.h" -#define X264_BUILD 148 +#define X264_BUILD 152 /* Application developers planning to link against a shared library version of * libx264 from a Microsoft Visual Studio or similar development environment @@ -119,39 +119,38 @@ typedef struct x264_nal_t /* CPU flags */ /* x86 */ -#define X264_CPU_CMOV 0x0000001 -#define X264_CPU_MMX 0x0000002 -#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */ -#define X264_CPU_MMXEXT X264_CPU_MMX2 -#define X264_CPU_SSE 0x0000008 -#define X264_CPU_SSE2 0x0000010 -#define X264_CPU_SSE3 0x0000020 -#define X264_CPU_SSSE3 0x0000040 -#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */ -#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */ -#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. 
*/ -#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */ -#define X264_CPU_XOP 0x0000800 /* AMD XOP */ -#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */ -#define X264_CPU_FMA3 0x0002000 /* FMA3 */ -#define X264_CPU_AVX2 0x0004000 /* AVX2 */ -#define X264_CPU_BMI1 0x0008000 /* BMI1 */ -#define X264_CPU_BMI2 0x0010000 /* BMI2 */ +#define X264_CPU_MMX (1<<0) +#define X264_CPU_MMX2 (1<<1) /* MMX2 aka MMXEXT aka ISSE */ +#define X264_CPU_MMXEXT X264_CPU_MMX2 +#define X264_CPU_SSE (1<<2) +#define X264_CPU_SSE2 (1<<3) +#define X264_CPU_LZCNT (1<<4) +#define X264_CPU_SSE3 (1<<5) +#define X264_CPU_SSSE3 (1<<6) +#define X264_CPU_SSE4 (1<<7) /* SSE4.1 */ +#define X264_CPU_SSE42 (1<<8) /* SSE4.2 */ +#define X264_CPU_AVX (1<<9) /* Requires OS support even if YMM registers aren't used */ +#define X264_CPU_XOP (1<<10) /* AMD XOP */ +#define X264_CPU_FMA4 (1<<11) /* AMD FMA4 */ +#define X264_CPU_FMA3 (1<<12) +#define X264_CPU_BMI1 (1<<13) +#define X264_CPU_BMI2 (1<<14) +#define X264_CPU_AVX2 (1<<15) +#define X264_CPU_AVX512 (1<<16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */ /* x86 modifiers */ -#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */ -#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */ -#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */ -#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */ -#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ -#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */ -#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */ -#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow +#define X264_CPU_CACHELINE_32 (1<<17) /* avoid memory loads that span the border between two cachelines */ +#define X264_CPU_CACHELINE_64 (1<<18) /* 32/64 is the size of a cacheline in bytes */ +#define X264_CPU_SSE2_IS_SLOW (1<<19) /* avoid most SSE2 functions on Athlon64 */ +#define X264_CPU_SSE2_IS_FAST (1<<20) /* a few functions are only faster on Core2 and Phenom */ +#define X264_CPU_SLOW_SHUFFLE (1<<21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */ +#define X264_CPU_STACK_MOD4 (1<<22) /* if stack is only mod4 and not mod16 */ +#define X264_CPU_SLOW_ATOM (1<<23) /* The Atom is terrible: slow SSE unaligned loads, slow * SIMD multiplies, slow SIMD variable shifts, slow pshufb, * cacheline split penalties -- gather everything here that * isn't shared by other CPUs to avoid making half a dozen * new SLOW flags. 
*/ -#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */ -#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */ +#define X264_CPU_SLOW_PSHUFB (1<<24) /* such as on the Intel Atom */ +#define X264_CPU_SLOW_PALIGNR (1<<25) /* such as on the AMD Bobcat */ /* PowerPC */ #define X264_CPU_ALTIVEC 0x0000001 @@ -227,13 +226,15 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 }; #define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */ #define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */ #define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */ -#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */ -#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */ -#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */ -#define X264_CSP_BGR 0x000b /* packed bgr 24bits */ -#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */ -#define X264_CSP_RGB 0x000d /* packed rgb 24bits */ -#define X264_CSP_MAX 0x000e /* end of list */ +#define X264_CSP_YUYV 0x0008 /* yuyv 4:2:2 packed */ +#define X264_CSP_UYVY 0x0009 /* uyvy 4:2:2 packed */ +#define X264_CSP_V210 0x000a /* 10-bit yuv 4:2:2 packed in 32 */ +#define X264_CSP_I444 0x000b /* yuv 4:4:4 planar */ +#define X264_CSP_YV24 0x000c /* yvu 4:4:4 planar */ +#define X264_CSP_BGR 0x000d /* packed bgr 24bits */ +#define X264_CSP_BGRA 0x000e /* packed bgr 32bits */ +#define X264_CSP_RGB 0x000f /* packed rgb 24bits */ +#define X264_CSP_MAX 0x0010 /* end of list */ #define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */ #define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */ @@ -563,19 +564,19 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal ); typedef struct x264_level_t { - int level_idc; - int mbps; /* max macroblock processing rate (macroblocks/sec) */ - int frame_size; /* max frame size (macroblocks) */ - int dpb; /* max decoded picture buffer (mbs) */ - int bitrate; /* max bitrate (kbit/sec) */ - int cpb; /* max vbv buffer (kbit) */ - int mv_range; /* max vertical mv component range (pixels) */ - int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */ - int slice_rate; /* ?? */ - int mincr; /* min compression ratio */ - int bipred8x8; /* limit bipred to >=8x8 */ - int direct8x8; /* limit b_direct to >=8x8 */ - int frame_only; /* forbid interlacing */ + uint8_t level_idc; + uint32_t mbps; /* max macroblock processing rate (macroblocks/sec) */ + uint32_t frame_size; /* max frame size (macroblocks) */ + uint32_t dpb; /* max decoded picture buffer (mbs) */ + uint32_t bitrate; /* max bitrate (kbit/sec) */ + uint32_t cpb; /* max vbv buffer (kbit) */ + uint16_t mv_range; /* max vertical mv component range (pixels) */ + uint8_t mvs_per_2mb; /* max mvs per 2 consecutive mbs. */ + uint8_t slice_rate; /* ?? */ + uint8_t mincr; /* min compression ratio */ + uint8_t bipred8x8; /* limit bipred to >=8x8 */ + uint8_t direct8x8; /* limit b_direct to >=8x8 */ + uint8_t frame_only; /* forbid interlacing */ } x264_level_t; /* all of the levels defined in the standard, terminated by .level_idc=0 */ diff --git a/library/src/main/cpp/libx264/x264_config.h b/library/src/main/cpp/libx264/x264_config.h index 1e0d377..d1b569f 100644 --- a/library/src/main/cpp/libx264/x264_config.h +++ b/library/src/main/cpp/libx264/x264_config.h @@ -3,4 +3,4 @@ #define X264_INTERLACED 1 #define X264_CHROMA_FORMAT 0 #define X264_VERSION "" -#define X264_POINTVER "0.148.x" +#define X264_POINTVER "0.152.x"
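
In the x264.c hunk above, the two copies of the 80-column wrapping loop (the raw and lavf colourspace lists) are folded into print_csp_name_internal(), which carries the running line length through *line_len. Below is a self-contained sketch of the same wrapping logic, assuming a made-up name list that stands in for x264_cli_csps[] / av_get_pix_fmt_name(); it is an illustration, not code from the patch.

#include <stdio.h>
#include <string.h>

#define INDENT_LEN    32   /* same width as the INDENT string in x264.c */
#define SEPARATOR     ", "
#define SEPARATOR_LEN 2

/* Same idea as print_csp_name_internal(): print a name, and start a new
 * indented line whenever the next name would run past column 80. */
static void print_name( const char *name, size_t *line_len, int last )
{
    size_t name_len = strlen( name );
    if( *line_len + name_len > (80 - SEPARATOR_LEN) )
    {
        printf( "\n%*s", INDENT_LEN, "" );
        *line_len = INDENT_LEN;
    }
    printf( "%s", name );
    *line_len += name_len;
    if( !last )
    {
        printf( SEPARATOR );
        *line_len += SEPARATOR_LEN;
    }
}

int main( void )
{
    /* Made-up stand-in for the CLI/lavf colourspace name sources. */
    static const char *names[] = { "i420", "yv12", "nv12", "nv21", "i422", "yv16", "nv16",
                                   "yuyv", "uyvy", "v210", "i444", "yv24", "bgr", "bgra", "rgb" };
    int n = sizeof(names) / sizeof(names[0]);
    size_t line_len = INDENT_LEN;
    printf( "%*s", INDENT_LEN, "" );
    for( int i = 0; i < n; i++ )
        print_name( names[i], &line_len, i == n-1 );
    printf( "\n" );
    return 0;
}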
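
The x86 CPU feature defines in the x264.h hunk move from hand-maintained hex literals to (1<<n) shifts, which makes the bit positions explicit and leaves room for the new X264_CPU_AVX512 bit. A minimal sketch of how such a mask is typically tested, assuming a hypothetical cpu_has() helper and a pretend detection result (neither is part of x264's API):

#include <stdint.h>
#include <stdio.h>

/* Bit positions copied from the x264.h hunk above. */
#define X264_CPU_AVX2   (1<<15)
#define X264_CPU_AVX512 (1<<16) /* AVX-512 {F, CD, BW, DQ, VL} */

/* Hypothetical helper: true only if every requested feature bit is set. */
static int cpu_has( uint32_t detected, uint32_t wanted )
{
    return (detected & wanted) == wanted;
}

int main( void )
{
    uint32_t detected = X264_CPU_AVX2;   /* pretend runtime detection found AVX2 only */
    printf( "AVX2:    %d\n", cpu_has( detected, X264_CPU_AVX2 ) );   /* prints 1 */
    printf( "AVX-512: %d\n", cpu_has( detected, X264_CPU_AVX512 ) ); /* prints 0 */
    return 0;
}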
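
The x264_level_t hunk narrows each field from int to a smaller fixed-width type sized to the values in the level table. A quick way to see the effect on the table's footprint, using field-for-field copies of the before/after layouts (the exact sizes printed depend on the ABI's padding rules):

#include <stdint.h>
#include <stdio.h>

/* Old layout: every field was a plain int. */
typedef struct
{
    int level_idc, mbps, frame_size, dpb, bitrate, cpb, mv_range,
        mvs_per_2mb, slice_rate, mincr, bipred8x8, direct8x8, frame_only;
} level_old_t;

/* New layout from the hunk above. */
typedef struct
{
    uint8_t  level_idc;
    uint32_t mbps, frame_size, dpb, bitrate, cpb;
    uint16_t mv_range;
    uint8_t  mvs_per_2mb, slice_rate, mincr, bipred8x8, direct8x8, frame_only;
} level_new_t;

int main( void )
{
    /* On common 32/64-bit ABIs this prints 52 vs 32 bytes per table entry. */
    printf( "old: %zu bytes, new: %zu bytes\n", sizeof(level_old_t), sizeof(level_new_t) );
    return 0;
}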