Upgrade libx264

Signed-off-by: Leo Ma <begeekmyfriend@gmail.com>
Branch: camera2
Author: Leo Ma, 8 years ago
Parent: 0984dd5907
Commit: d8de3cd6cd

@ -278,7 +278,8 @@ clean:
rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock *.pgd *.pgc
distclean: clean
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def conftest*
rm -f config.mak x264_config.h config.h config.log x264.pc x264.def
rm -rf conftest*
install-cli: cli
$(INSTALL) -d $(DESTDIR)$(bindir)

@ -569,57 +569,65 @@ endfunc
.macro pixel_var2_8 h
function x264_pixel_var2_8x\h\()_neon, export=1
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
mov x5, \h - 4
usubl v6.8h, v16.8b, v18.8b
usubl v7.8h, v17.8b, v19.8b
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
smull v2.4s, v6.4h, v6.4h
smull2 v3.4s, v6.8h, v6.8h
add v0.8h, v6.8h, v7.8h
smlal v2.4s, v7.4h, v7.4h
smlal2 v3.4s, v7.8h, v7.8h
mov x3, #16
ld1 {v16.8b}, [x0], #8
ld1 {v18.8b}, [x1], x3
ld1 {v17.8b}, [x0], #8
ld1 {v19.8b}, [x1], x3
mov x5, \h - 2
usubl v0.8h, v16.8b, v18.8b
usubl v1.8h, v17.8b, v19.8b
ld1 {v16.8b}, [x0], #8
ld1 {v18.8b}, [x1], x3
smull v2.4s, v0.4h, v0.4h
smull2 v3.4s, v0.8h, v0.8h
smull v4.4s, v1.4h, v1.4h
smull2 v5.4s, v1.8h, v1.8h
usubl v6.8h, v16.8b, v18.8b
1: subs x5, x5, #2
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
1: subs x5, x5, #1
ld1 {v17.8b}, [x0], #8
ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
ld1 {v16.8b}, [x0], x1
ld1 {v18.8b}, [x2], x3
smlal v2.4s, v7.4h, v7.4h
smlal2 v3.4s, v7.8h, v7.8h
ld1 {v16.8b}, [x0], #8
ld1 {v18.8b}, [x1], x3
smlal v4.4s, v7.4h, v7.4h
smlal2 v5.4s, v7.8h, v7.8h
usubl v6.8h, v16.8b, v18.8b
add v0.8h, v0.8h, v7.8h
add v1.8h, v1.8h, v7.8h
b.gt 1b
ld1 {v17.8b}, [x0], x1
ld1 {v19.8b}, [x2], x3
ld1 {v17.8b}, [x0], #8
ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
smlal v2.4s, v7.4h, v7.4h
add v0.8h, v0.8h, v7.8h
smlal2 v3.4s, v7.8h, v7.8h
smlal v4.4s, v7.4h, v7.4h
add v1.8h, v1.8h, v7.8h
smlal2 v5.4s, v7.8h, v7.8h
saddlv s0, v0.8h
saddlv s1, v1.8h
add v2.4s, v2.4s, v3.4s
add v4.4s, v4.4s, v5.4s
mov w0, v0.s[0]
addv s1, v2.4s
sxtw x0, w0
mov w1, v1.s[0]
mul x0, x0, x0
str w1, [x4]
sub x0, x1, x0, lsr # 6 + (\h >> 4)
addv s2, v2.4s
addv s4, v4.4s
mul w0, w0, w0
mul w1, w1, w1
mov w3, v2.s[0]
mov w4, v4.s[0]
sub w0, w3, w0, lsr # 6 + (\h >> 4)
sub w1, w4, w1, lsr # 6 + (\h >> 4)
str w3, [x2]
add w0, w0, w1
str w4, [x2, #4]
ret
endfunc

@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );

@ -28,15 +28,10 @@
.syntax unified
#if HAVE_NEON
.arch armv7-a
#elif HAVE_ARMV6T2
.arch armv6t2
#elif HAVE_ARMV6
.arch armv6
#endif
#ifndef __APPLE__
.arch armv7-a
.fpu neon
#endif
#ifdef PREFIX
# define EXTERN_ASM _
@ -50,6 +45,14 @@
# define ELF @
#endif
#ifdef __MACH__
# define MACH
# define NONMACH @
#else
# define MACH @
# define NONMACH
#endif
#if HAVE_AS_FUNC
# define FUNC
#else
@ -76,6 +79,7 @@ ELF .size \name, . - \name
FUNC .endfunc
.purgem endfunc
.endm
.text
.align 2
.if \export == 1
.global EXTERN_ASM\name
@ -99,7 +103,8 @@ ELF .size \name, . - \name
.if HAVE_SECTION_DATA_REL_RO && \relocate
.section .data.rel.ro
.else
.section .rodata
NONMACH .section .rodata
MACH .const_data
.endif
.align \align
\name:

@ -26,14 +26,12 @@
#include "asm.S"
.section .rodata
.align 4
scan4x4_frame:
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
.byte 2,3, 8,9, 16,17, 10,11
.byte 12,13, 6,7, 14,15, 20,21
.byte 10,11, 12,13, 6,7, 14,15
endconst
.text

@ -28,10 +28,9 @@
#include "asm.S"
.section .rodata
.align 4
pw_0to15:
const pw_0to15, align=4
.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
endconst
.text
@ -140,7 +139,7 @@ MEMCPY_ALIGNED 16, 8
MEMCPY_ALIGNED 8, 16
MEMCPY_ALIGNED 8, 8
const memcpy_table align=2, relocate=1
const memcpy_table, align=2, relocate=1
.word memcpy_aligned_16_16_neon
.word memcpy_aligned_16_8_neon
.word memcpy_aligned_8_16_neon

@ -26,9 +26,7 @@
#include "asm.S"
.section .rodata
.align 4
const mask_array, align=4
.rept 16
.byte 0xff
.endr
@ -36,11 +34,14 @@ mask_ff:
.rept 16
.byte 0
.endr
endconst
mask_ac4:
const mask_ac4, align=4
.short 0, -1, -1, -1, 0, -1, -1, -1
mask_ac8:
endconst
const mask_ac8, align=4
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
.text
@ -718,13 +719,24 @@ function x264_var_end, export=0
bx lr
endfunc
.macro DIFF_SUM diff da db lastdiff
vld1.64 {\da}, [r0,:64], r1
vld1.64 {\db}, [r2,:64], r3
.ifnb \lastdiff
vadd.s16 q0, q0, \lastdiff
.macro DIFF_SUM diff1 diff2 da1 db1 da2 db2 lastdiff1 lastdiff2 acc1 acc2
vld1.64 {\da1}, [r0,:64]!
vld1.64 {\db1}, [r1,:64], r3
.ifnb \lastdiff1
vadd.s16 \acc1, \acc1, \lastdiff1
vadd.s16 \acc2, \acc2, \lastdiff2
.endif
vsubl.u8 \diff, \da, \db
vld1.64 {\da2}, [r0,:64]!
vld1.64 {\db2}, [r1,:64], r3
vsubl.u8 \diff1, \da1, \db1
vsubl.u8 \diff2, \da2, \db2
.endm
.macro SQR_ACC_DOUBLE acc1 acc2 d0 d1 d2 d3 vmlal=vmlal.s16
\vmlal \acc1, \d0, \d0
vmlal.s16 \acc1, \d1, \d1
\vmlal \acc2, \d2, \d2
vmlal.s16 \acc2, \d3, \d3
.endm
.macro SQR_ACC acc d0 d1 vmlal=vmlal.s16
@ -733,77 +745,89 @@ endfunc
.endm
function x264_pixel_var2_8x8_neon
DIFF_SUM q0, d0, d1
DIFF_SUM q8, d16, d17
SQR_ACC q1, d0, d1, vmull.s16
DIFF_SUM q9, d18, d19, q8
SQR_ACC q2, d16, d17, vmull.s16
mov r3, #16
DIFF_SUM q0, q10, d0, d1, d20, d21
DIFF_SUM q8, q11, d16, d17, d22, d23
SQR_ACC_DOUBLE q1, q13, d0, d1, d20, d21, vmull.s16
DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10
SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23, vmull.s16
.rept 2
DIFF_SUM q8, d16, d17, q9
SQR_ACC q1, d18, d19
DIFF_SUM q9, d18, d19, q8
SQR_ACC q2, d16, d17
DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10
SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25
DIFF_SUM q9, q12, d18, d19, d24, d25, q8, q11, q0, q10
SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23
.endr
DIFF_SUM q8, d16, d17, q9
SQR_ACC q1, d18, d19
DIFF_SUM q8, q11, d16, d17, d22, d23, q9, q12, q0, q10
SQR_ACC_DOUBLE q1, q13, d18, d19, d24, d25
vadd.s16 q0, q0, q8
SQR_ACC q2, d16, d17
vadd.s16 q10, q10, q11
SQR_ACC_DOUBLE q2, q14, d16, d17, d22, d23
ldr ip, [sp]
vadd.s16 d0, d0, d1
vadd.s16 d20, d20, d21
vadd.s32 q1, q1, q2
vadd.s32 q13, q13, q14
vpaddl.s16 d0, d0
vpaddl.s16 d20, d20
vadd.s32 d1, d2, d3
vpadd.s32 d0, d0, d1
vadd.s32 d26, d26, d27
vpadd.s32 d0, d0, d20 @ sum
vpadd.s32 d1, d1, d26 @ sqr
vmul.s32 d0, d0, d0 @ sum*sum
vshr.s32 d0, d0, #6
vsub.s32 d0, d1, d0
vpadd.s32 d0, d0, d0
vmov r0, r1, d0
vst1.32 {d0[1]}, [ip,:32]
mul r0, r0, r0
sub r0, r1, r0, lsr #6
vst1.32 {d1}, [r2,:64]
bx lr
endfunc
function x264_pixel_var2_8x16_neon
vld1.64 {d16}, [r0,:64], r1
vld1.64 {d17}, [r2,:64], r3
vld1.64 {d18}, [r0,:64], r1
vld1.64 {d19}, [r2,:64], r3
vsubl.u8 q10, d16, d17
vsubl.u8 q11, d18, d19
SQR_ACC q1, d20, d21, vmull.s16
vld1.64 {d16}, [r0,:64], r1
vadd.s16 q0, q10, q11
vld1.64 {d17}, [r2,:64], r3
SQR_ACC q2, d22, d23, vmull.s16
mov ip, #14
1: subs ip, ip, #2
vld1.64 {d18}, [r0,:64], r1
mov r3, #16
vld1.64 {d16}, [r0,:64]!
vld1.64 {d17}, [r1,:64], r3
vld1.64 {d18}, [r0,:64]!
vld1.64 {d19}, [r1,:64], r3
vsubl.u8 q0, d16, d17
vsubl.u8 q3, d18, d19
SQR_ACC q1, d0, d1, vmull.s16
vld1.64 {d16}, [r0,:64]!
mov ip, #15
vld1.64 {d17}, [r1,:64], r3
SQR_ACC q2, d6, d7, vmull.s16
1: subs ip, ip, #1
vld1.64 {d18}, [r0,:64]!
vsubl.u8 q10, d16, d17
vld1.64 {d19}, [r2,:64], r3
vld1.64 {d19}, [r1,:64], r3
vadd.s16 q0, q0, q10
SQR_ACC q1, d20, d21
vsubl.u8 q11, d18, d19
beq 2f
vld1.64 {d16}, [r0,:64], r1
vadd.s16 q0, q0, q11
vld1.64 {d17}, [r2,:64], r3
vld1.64 {d16}, [r0,:64]!
vadd.s16 q3, q3, q11
vld1.64 {d17}, [r1,:64], r3
SQR_ACC q2, d22, d23
b 1b
2:
vadd.s16 q0, q0, q11
vadd.s16 q3, q3, q11
SQR_ACC q2, d22, d23
ldr ip, [sp]
vadd.s16 d0, d0, d1
vadd.s32 q1, q1, q2
vadd.s16 d6, d6, d7
vpaddl.s16 d0, d0
vadd.s32 d1, d2, d3
vpadd.s32 d0, d0, d1
vpaddl.s16 d6, d6
vadd.s32 d2, d2, d3
vadd.s32 d4, d4, d5
vpadd.s32 d0, d0, d6 @ sum
vpadd.s32 d2, d2, d4 @ sqr
vmul.s32 d0, d0, d0 @ sum*sum
vshr.s32 d0, d0, #7
vsub.s32 d0, d2, d0
vpadd.s32 d0, d0, d0
vmov r0, r1, d0
vst1.32 {d0[1]}, [ip,:32]
mul r0, r0, r0
sub r0, r1, r0, lsr #7
vst1.32 {d2}, [r2,:64]
bx lr
endfunc

@ -63,8 +63,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );

@ -27,10 +27,9 @@
#include "asm.S"
.section .rodata
.align 4
p16weight: .short 1,2,3,4,5,6,7,8
const p16weight, align=4
.short 1,2,3,4,5,6,7,8
endconst
.text

@ -26,19 +26,20 @@
#include "asm.S"
.section .rodata
.align 4
pmovmskb_byte:
const pmovmskb_byte, align=4
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
endconst
mask_2bit:
const mask_2bit, align=4
.byte 3,12,48,192,3,12,48,192
.byte 3,12,48,192,3,12,48,192
endconst
mask_1bit:
const mask_1bit, align=4
.byte 128,64,32,16,8,4,2,1
.byte 128,64,32,16,8,4,2,1
endconst
.text

@ -43,16 +43,19 @@ uint8_t *x264_nal_escape_mmx2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_sse2( uint8_t *dst, uint8_t *src, uint8_t *end );
uint8_t *x264_nal_escape_avx2( uint8_t *dst, uint8_t *src, uint8_t *end );
void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_8x8_rd_internal_avx512 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_sse2_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx2_bmi2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx2 ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
@ -116,7 +119,7 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->nal_escape = x264_nal_escape_c;
#if HAVE_MMX
#if ARCH_X86_64
#if ARCH_X86_64 && !defined( __MACH__ )
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
@ -126,18 +129,17 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
pf->nal_escape = x264_nal_escape_mmx2;
if( cpu&X264_CPU_SSE2 )
{
#if ARCH_X86_64
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2_lzcnt;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2_lzcnt;
}
#endif
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
#if ARCH_X86_64
#if ARCH_X86_64 && !defined( __MACH__ )
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_lzcnt;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
{
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_ssse3;
@ -152,8 +154,14 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
if( cpu&X264_CPU_AVX2 )
{
pf->nal_escape = x264_nal_escape_avx2;
if( cpu&X264_CPU_BMI2 )
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2_bmi2;
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
}
#endif
#endif

@ -42,7 +42,7 @@ typedef struct
uint8_t *p_end;
/* aligned for memcpy_aligned starting here */
ALIGNED_16( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
ALIGNED_64( int f8_bits_encoded ); // only if using x264_cabac_size_decision()
/* context */
uint8_t state[1024];

@ -669,7 +669,7 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
{
if( !strcmp(value, "1b") )
p->i_level_idc = 9;
else if( atof(value) < 6 )
else if( atof(value) < 7 )
p->i_level_idc = (int)(10*atof(value)+.5);
else
p->i_level_idc = atoi(value);
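The level bound widens from 6 to 7 so the new 6/6.1/6.2 levels are still parsed as "major.minor" strings. A small standalone sketch of the parsing rule (illustrative function name, not part of the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Mirrors the logic above: "1b" is special-cased, values below 7 are
 * treated as "major.minor" and scaled by 10, anything else is taken as
 * an already-scaled level_idc. */
static int parse_level( const char *value )
{
    if( !strcmp( value, "1b" ) )
        return 9;
    else if( atof( value ) < 7 )
        return (int)(10*atof( value )+.5);
    else
        return atoi( value );
}

int main( void )
{
    printf( "%d %d %d\n", parse_level( "5.1" ),  /* 51 */
                          parse_level( "6.2" ),  /* 62; the old "< 6" bound fell through to atoi() and gave 6 */
                          parse_level( "62" ) ); /* 62 */
    return 0;
}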
@ -1143,6 +1143,8 @@ int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_heigh
[X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, },
[X264_CSP_YUYV] = { 1, { 256*2 }, { 256*1 }, },
[X264_CSP_UYVY] = { 1, { 256*2 }, { 256*1 }, },
[X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, },

@ -635,11 +635,11 @@ struct x264_t
/* Current MB DCT coeffs */
struct
{
ALIGNED_N( dctcoef luma16x16_dc[3][16] );
ALIGNED_64( dctcoef luma16x16_dc[3][16] );
ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
ALIGNED_N( dctcoef luma8x8[12][64] );
ALIGNED_N( dctcoef luma4x4[16*3][16] );
ALIGNED_64( dctcoef luma8x8[12][64] );
ALIGNED_64( dctcoef luma4x4[16*3][16] );
} dct;
/* MB table and cache for current frame/mb */
@ -729,7 +729,7 @@ struct x264_t
int8_t *type; /* mb type */
uint8_t *partition; /* mb partition */
int8_t *qp; /* mb qp */
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x0200 and 0x0400: chroma dc (all set for PCM)*/
int16_t *cbp; /* mb cbp: 0x0?: luma, 0x?0: chroma, 0x100: luma dc, 0x200 and 0x400: chroma dc, 0x1000 PCM (all set for PCM) */
int8_t (*intra4x4_pred_mode)[8]; /* intra4x4 pred mode. for non I4x4 set to I_PRED_4x4_DC(2) */
/* actually has only 7 entries; set to 8 for write-combining optimizations */
uint8_t (*non_zero_count)[16*3]; /* nzc. for I_PCM set to 16 */
@ -740,8 +740,7 @@ struct x264_t
int16_t (*mvr[2][X264_REF_MAX*2])[2];/* 16x16 mv for each possible ref */
int8_t *skipbp; /* block pattern for SKIP or DIRECT (sub)mbs. B-frames + cabac only */
int8_t *mb_transform_size; /* transform_size_8x8_flag of each mb */
uint16_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of
* NOTE: this will fail on resolutions above 2^16 MBs... */
uint32_t *slice_table; /* sh->first_mb of the slice that the indexed mb is part of */
uint8_t *field;
/* buffer for weighted versions of the reference frames */
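The slice_table entries widen from uint16_t to uint32_t because frame sizes permitted by the newer levels exceed the 2^16-macroblock ceiling that the removed comment warned about. A quick standalone check of the counts involved (resolutions chosen for illustration, not part of the patch):

#include <stdio.h>

int main( void )
{
    /* Macroblocks are 16x16; a 16-bit index overflows once the count passes 65535. */
    const int res[][2] = { { 3840, 2160 }, { 4096, 4096 }, { 8192, 4320 } };
    for( int i = 0; i < 3; i++ )
    {
        int mbs = ((res[i][0]+15)/16) * ((res[i][1]+15)/16);
        printf( "%dx%d -> %d MBs (%s 65535)\n", res[i][0], res[i][1], mbs, mbs > 65535 ? ">" : "<=" );
    }
    return 0;
}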
@ -778,26 +777,27 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_64( pixel fdec_buf[54*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
ALIGNED_16( pixel i8x8_fdec_buf[16*16] );
ALIGNED_16( dctcoef i8x8_dct_buf[3][64] );
ALIGNED_16( dctcoef i4x4_dct_buf[15][16] );
ALIGNED_64( dctcoef i8x8_dct_buf[3][64] );
ALIGNED_64( dctcoef i4x4_dct_buf[15][16] );
uint32_t i4x4_nnz_buf[4];
uint32_t i8x8_nnz_buf[4];
int i4x4_cbp;
int i8x8_cbp;
/* Psy trellis DCT data */
ALIGNED_16( dctcoef fenc_dct8[4][64] );
ALIGNED_16( dctcoef fenc_dct4[16][16] );
/* Psy RD SATD/SA8D scores cache */
ALIGNED_N( uint64_t fenc_hadamard_cache[9] );
ALIGNED_N( uint32_t fenc_satd_cache[32] );
ALIGNED_64( uint32_t fenc_satd_cache[32] );
ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
int i4x4_cbp;
int i8x8_cbp;
/* pointer over mb of the frame to be compressed */
pixel *p_fenc[3]; /* y,u,v */
@ -822,10 +822,10 @@ struct x264_t
struct
{
/* real intra4x4_pred_mode if I_4X4 or I_8X8, I_PRED_4x4_DC if mb available, -1 if not */
ALIGNED_8( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
ALIGNED_16( int8_t intra4x4_pred_mode[X264_SCAN8_LUMA_SIZE] );
/* i_non_zero_count if available else 0x80 */
ALIGNED_16( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* i_non_zero_count if available else 0x80. intentionally misaligned by 8 for asm */
ALIGNED_8( uint8_t non_zero_count[X264_SCAN8_SIZE] );
/* -1 if unused, -2 if unavailable */
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
@ -930,8 +930,8 @@ struct x264_t
uint32_t (*nr_residual_sum)[64];
uint32_t *nr_count;
ALIGNED_N( udctcoef nr_offset_denoise[4][64] );
ALIGNED_N( uint32_t nr_residual_sum_buf[2][4][64] );
ALIGNED_32( udctcoef nr_offset_denoise[4][64] );
ALIGNED_32( uint32_t nr_residual_sum_buf[2][4][64] );
uint32_t nr_count_buf[2][4];
uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */

@ -47,8 +47,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{
#if HAVE_MMX
// {"MMX", X264_CPU_MMX}, // we don't support asm on mmx1 cpus anymore
// {"CMOV", X264_CPU_CMOV}, // we require this unconditionally, so don't print it
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2|X264_CPU_CMOV
#define MMX2 X264_CPU_MMX|X264_CPU_MMX2
{"MMX2", MMX2},
{"MMXEXT", MMX2},
{"SSE", MMX2|X264_CPU_SSE},
@ -56,6 +55,7 @@ const x264_cpu_name_t x264_cpu_names[] =
{"SSE2Slow", SSE2|X264_CPU_SSE2_IS_SLOW},
{"SSE2", SSE2},
{"SSE2Fast", SSE2|X264_CPU_SSE2_IS_FAST},
{"LZCNT", SSE2|X264_CPU_LZCNT},
{"SSE3", SSE2|X264_CPU_SSE3},
{"SSSE3", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3},
{"SSE4.1", SSE2|X264_CPU_SSE3|X264_CPU_SSSE3|X264_CPU_SSE4},
@ -66,16 +66,17 @@ const x264_cpu_name_t x264_cpu_names[] =
{"XOP", AVX|X264_CPU_XOP},
{"FMA4", AVX|X264_CPU_FMA4},
{"FMA3", AVX|X264_CPU_FMA3},
{"AVX2", AVX|X264_CPU_FMA3|X264_CPU_AVX2},
{"BMI1", AVX|X264_CPU_LZCNT|X264_CPU_BMI1},
{"BMI2", AVX|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2},
#define AVX2 AVX|X264_CPU_FMA3|X264_CPU_LZCNT|X264_CPU_BMI1|X264_CPU_BMI2|X264_CPU_AVX2
{"AVX2", AVX2},
{"AVX512", AVX2|X264_CPU_AVX512},
#undef AVX2
#undef AVX
#undef SSE2
#undef MMX2
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"LZCNT", X264_CPU_LZCNT},
{"BMI1", X264_CPU_BMI1},
{"BMI2", X264_CPU_BMI1|X264_CPU_BMI2},
{"SlowCTZ", X264_CPU_SLOW_CTZ},
{"SlowAtom", X264_CPU_SLOW_ATOM},
{"SlowPshufb", X264_CPU_SLOW_PSHUFB},
{"SlowPalignr", X264_CPU_SLOW_PALIGNR},
@ -118,7 +119,7 @@ static void sigill_handler( int sig )
#if HAVE_MMX
int x264_cpu_cpuid_test( void );
void x264_cpu_cpuid( uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx );
void x264_cpu_xgetbv( uint32_t op, uint32_t *eax, uint32_t *edx );
uint64_t x264_cpu_xgetbv( int xcr );
uint32_t x264_cpu_detect( void )
{
@ -126,15 +127,14 @@ uint32_t x264_cpu_detect( void )
uint32_t eax, ebx, ecx, edx;
uint32_t vendor[4] = {0};
uint32_t max_extended_cap, max_basic_cap;
int cache;
uint64_t xcr0 = 0;
#if !ARCH_X86_64
if( !x264_cpu_cpuid_test() )
return 0;
#endif
x264_cpu_cpuid( 0, &eax, vendor+0, vendor+2, vendor+1 );
max_basic_cap = eax;
x264_cpu_cpuid( 0, &max_basic_cap, vendor+0, vendor+2, vendor+1 );
if( max_basic_cap == 0 )
return 0;
@ -145,27 +145,23 @@ uint32_t x264_cpu_detect( void )
return cpu;
if( edx&0x02000000 )
cpu |= X264_CPU_MMX2|X264_CPU_SSE;
if( edx&0x00008000 )
cpu |= X264_CPU_CMOV;
else
return cpu;
if( edx&0x04000000 )
cpu |= X264_CPU_SSE2;
if( ecx&0x00000001 )
cpu |= X264_CPU_SSE3;
if( ecx&0x00000200 )
cpu |= X264_CPU_SSSE3;
cpu |= X264_CPU_SSSE3|X264_CPU_SSE2_IS_FAST;
if( ecx&0x00080000 )
cpu |= X264_CPU_SSE4;
if( ecx&0x00100000 )
cpu |= X264_CPU_SSE42;
/* Check OSXSAVE and AVX bits */
if( (ecx&0x18000000) == 0x18000000 )
if( ecx&0x08000000 ) /* XGETBV supported and XSAVE enabled by OS */
{
/* Check for OS support */
x264_cpu_xgetbv( 0, &eax, &edx );
if( (eax&0x6) == 0x6 )
xcr0 = x264_cpu_xgetbv( 0 );
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
if( ecx&0x10000000 )
cpu |= X264_CPU_AVX;
if( ecx&0x00001000 )
cpu |= X264_CPU_FMA3;
@ -175,19 +171,24 @@ uint32_t x264_cpu_detect( void )
if( max_basic_cap >= 7 )
{
x264_cpu_cpuid( 7, &eax, &ebx, &ecx, &edx );
/* AVX2 requires OS support, but BMI1/2 don't. */
if( (cpu&X264_CPU_AVX) && (ebx&0x00000020) )
cpu |= X264_CPU_AVX2;
if( ebx&0x00000008 )
{
cpu |= X264_CPU_BMI1;
if( ebx&0x00000100 )
cpu |= X264_CPU_BMI2;
if( (xcr0&0x6) == 0x6 ) /* XMM/YMM state */
{
if( ebx&0x00000020 )
cpu |= X264_CPU_AVX2;
if( (xcr0&0xE0) == 0xE0 ) /* OPMASK/ZMM state */
{
if( (ebx&0xD0030000) == 0xD0030000 )
cpu |= X264_CPU_AVX512;
}
}
}
if( cpu & X264_CPU_SSSE3 )
cpu |= X264_CPU_SSE2_IS_FAST;
x264_cpu_cpuid( 0x80000000, &eax, &ebx, &ecx, &edx );
max_extended_cap = eax;
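Condensing the detection flow spread across the hunks above: XGETBV is issued only when CPUID leaf 1 reports OSXSAVE, AVX/AVX2 additionally require the OS to save XMM/YMM state (XCR0 bits 1-2), and AVX-512 further requires opmask/ZMM state (XCR0 bits 5-7) plus the F/DQ/CD/BW/VL feature bits. A standalone sketch of those checks, assuming GCC or Clang on x86 built with -mxsave (not part of the patch):

#include <cpuid.h>
#include <immintrin.h>
#include <stdio.h>

int main( void )
{
    unsigned eax, ebx, ecx, edx, max_basic_cap;
    __cpuid( 0, max_basic_cap, ebx, ecx, edx );
    __cpuid( 1, eax, ebx, ecx, edx );
    if( !(ecx & (1u<<27)) )                     /* no OSXSAVE -> XGETBV unusable */
        return 0;
    unsigned long long xcr0 = _xgetbv( 0 );
    int avx = (xcr0 & 0x6) == 0x6 && (ecx & (1u<<28));          /* OS saves XMM/YMM */
    int avx2 = 0, avx512 = 0;
    if( max_basic_cap >= 7 )
    {
        __cpuid_count( 7, 0, eax, ebx, ecx, edx );
        avx2   = (xcr0 & 0x6) == 0x6 && (ebx & (1u<<5));
        avx512 = (xcr0 & 0x6) == 0x6 && (xcr0 & 0xE0) == 0xE0    /* plus opmask/ZMM */
                 && (ebx & 0xD0030000u) == 0xD0030000u;          /* F/DQ/CD/BW/VL */
    }
    printf( "AVX:%d AVX2:%d AVX512:%d\n", avx, avx2, avx512 );
    return 0;
}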
@ -228,8 +229,6 @@ uint32_t x264_cpu_detect( void )
{
if( edx&0x00400000 )
cpu |= X264_CPU_MMX2;
if( !(cpu&X264_CPU_LZCNT) )
cpu |= X264_CPU_SLOW_CTZ;
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_FAST) )
cpu |= X264_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */
}
@ -254,7 +253,6 @@ uint32_t x264_cpu_detect( void )
else if( model == 28 )
{
cpu |= X264_CPU_SLOW_ATOM;
cpu |= X264_CPU_SLOW_CTZ;
cpu |= X264_CPU_SLOW_PSHUFB;
}
/* Conroe has a slow shuffle unit. Check the model number to make sure not
@ -268,7 +266,7 @@ uint32_t x264_cpu_detect( void )
{
/* cacheline size is specified in 3 places, any of which may be missing */
x264_cpu_cpuid( 1, &eax, &ebx, &ecx, &edx );
cache = (ebx&0xff00)>>5; // clflush size
int cache = (ebx&0xff00)>>5; // clflush size
if( !cache && max_extended_cap >= 0x80000006 )
{
x264_cpu_cpuid( 0x80000006, &eax, &ebx, &ecx, &edx );

@ -56,7 +56,7 @@ void x264_cpu_sfence( void );
* alignment between functions (osdep.h handles manual alignment of arrays
* if it doesn't).
*/
#if (ARCH_X86 || STACK_ALIGNMENT > 16) && HAVE_MMX
#if HAVE_MMX && (STACK_ALIGNMENT > 16 || (ARCH_X86 && STACK_ALIGNMENT > 4))
intptr_t x264_stack_align( void (*func)(), ... );
#define x264_stack_align(func,...) x264_stack_align((void (*)())func, __VA_ARGS__)
#else
@ -65,7 +65,7 @@ intptr_t x264_stack_align( void (*func)(), ... );
typedef struct
{
const char name[16];
const char *name;
uint32_t flags;
} x264_cpu_name_t;
extern const x264_cpu_name_t x264_cpu_names[];

@ -711,6 +711,16 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
dctf->sub4x4_dct = x264_sub4x4_dct_avx512;
dctf->sub8x8_dct = x264_sub8x8_dct_avx512;
dctf->sub16x16_dct = x264_sub16x16_dct_avx512;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_avx512;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx512;
dctf->add8x8_idct = x264_add8x8_idct_avx512;
}
#endif //HAVE_MMX
#if HAVE_ALTIVEC
@ -986,6 +996,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
}
#endif // ARCH_X86_64
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
}
#endif // HAVE_MMX
#else
#if HAVE_MMX
@ -1026,6 +1043,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
}
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_avx512;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx512;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
}
#endif // HAVE_MMX
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
@ -1068,6 +1092,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
}
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
}
#else
if( cpu&X264_CPU_MMX )
{
@ -1091,6 +1120,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx512;
}
#endif // HIGH_BIT_DEPTH
#endif
#if !HIGH_BIT_DEPTH

@ -75,7 +75,6 @@ typedef struct
} x264_zigzag_function_t;
void x264_dct_init( int cpu, x264_dct_function_t *dctf );
void x264_dct_init_weights( void );
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced );
#endif

@ -676,13 +676,10 @@ void x264_deblock_h_chroma_intra_avx ( pixel *pix, intptr_t stride, int alpha, i
void x264_deblock_h_chroma_422_intra_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_avx ( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_strength_mmx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_sse2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
void x264_deblock_strength_ssse3 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
@ -691,6 +688,9 @@ void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2
void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_strength_avx512( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@ -803,7 +803,6 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
#if !HIGH_BIT_DEPTH
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_mmx2;
#endif
pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
{
pf->deblock_strength = x264_deblock_strength_sse2;
@ -852,6 +851,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
{
pf->deblock_strength = x264_deblock_strength_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->deblock_strength = x264_deblock_strength_avx512;
}
}
#endif

@ -54,6 +54,8 @@ static int x264_frame_internal_csp( int external_csp )
case X264_CSP_NV16:
case X264_CSP_I422:
case X264_CSP_YV16:
case X264_CSP_YUYV:
case X264_CSP_UYVY:
case X264_CSP_V210:
return X264_CSP_NV16;
case X264_CSP_I444:
@ -76,7 +78,7 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_padv = PADV << PARAM_INTERLACED;
int align = 16;
#if ARCH_X86 || ARCH_X86_64
if( h->param.cpu&X264_CPU_CACHELINE_64 )
if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
align = 64;
else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
align = 32;
@ -221,11 +223,13 @@ static x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
PREALLOC( frame->lowres_mvs[j][i], 2*h->mb.i_mb_count*sizeof(int16_t) );
PREALLOC( frame->lowres_mv_costs[j][i], h->mb.i_mb_count*sizeof(int) );
}
PREALLOC( frame->i_propagate_cost, (i_mb_count+7) * sizeof(uint16_t) );
PREALLOC( frame->i_propagate_cost, i_mb_count * sizeof(uint16_t) );
for( int j = 0; j <= h->param.i_bframe+1; j++ )
for( int i = 0; i <= h->param.i_bframe+1; i++ )
PREALLOC( frame->lowres_costs[j][i], (i_mb_count+3) * sizeof(uint16_t) );
PREALLOC( frame->lowres_costs[j][i], i_mb_count * sizeof(uint16_t) );
/* mbtree asm can overread the input buffers, make sure we don't read outside of allocated memory. */
prealloc_size += NATIVE_ALIGN;
}
if( h->param.rc.i_aq_mode )
{
@ -408,7 +412,13 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
uint8_t *pix[3];
int stride[3];
if( i_csp == X264_CSP_V210 )
if( i_csp == X264_CSP_YUYV || i_csp == X264_CSP_UYVY )
{
int p = i_csp == X264_CSP_UYVY;
h->mc.plane_copy_deinterleave_yuyv( dst->plane[p], dst->i_stride[p], dst->plane[p^1], dst->i_stride[p^1],
(pixel*)src->img.plane[0], src->img.i_stride[0], h->param.i_width, h->param.i_height );
}
else if( i_csp == X264_CSP_V210 )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
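A minimal C sketch (illustrative data, not part of the patch) of why the single index flip p = (i_csp == X264_CSP_UYVY) above lets one deinterleave routine serve both packed 4:2:2 byte orders:

#include <stdio.h>
#include <stdint.h>

/* YUYV stores Y0 U0 Y1 V0 ..., UYVY stores U0 Y0 V0 Y1 ...  Sending even
 * bytes to dst[p] and odd bytes to dst[p^1] lands luma in dst[0] and
 * interleaved chroma in dst[1] for either order. */
static void deinterleave( uint8_t *dsta, uint8_t *dstb, const uint8_t *src, int w )
{
    for( int x = 0; x < w; x++ )
    {
        dsta[x] = src[2*x];
        dstb[x] = src[2*x+1];
    }
}

int main( void )
{
    const uint8_t yuyv[8] = { 'Y','U','Y','V','Y','U','Y','V' };
    const uint8_t uyvy[8] = { 'U','Y','V','Y','U','Y','V','Y' };
    uint8_t dst[2][4];
    int p = 0;                                                  /* YUYV */
    deinterleave( dst[p], dst[p^1], yuyv, 4 );
    printf( "%.4s %.4s\n", (char*)dst[0], (char*)dst[1] );      /* YYYY UVUV */
    p = 1;                                                      /* UYVY */
    deinterleave( dst[p], dst[p^1], uyvy, 4 );
    printf( "%.4s %.4s\n", (char*)dst[0], (char*)dst[1] );      /* YYYY UVUV */
    return 0;
}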

@ -121,8 +121,8 @@ static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int
int mvy1 = x264_clip3( h->mb.cache.mv[1][i8][1], h->mb.mv_min[1], h->mb.mv_max[1] ) + 4*4*y;
int i_mode = x264_size2pixel[height][width];
intptr_t i_stride0 = 16, i_stride1 = 16;
ALIGNED_ARRAY_N( pixel, tmp0,[16*16] );
ALIGNED_ARRAY_N( pixel, tmp1,[16*16] );
ALIGNED_ARRAY_32( pixel, tmp0,[16*16] );
ALIGNED_ARRAY_32( pixel, tmp1,[16*16] );
pixel *src0, *src1;
MC_LUMA_BI( 0 );
@ -260,7 +260,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
PREALLOC( h->mb.qp, i_mb_count * sizeof(int8_t) );
PREALLOC( h->mb.cbp, i_mb_count * sizeof(int16_t) );
PREALLOC( h->mb.mb_transform_size, i_mb_count * sizeof(int8_t) );
PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint16_t) );
PREALLOC( h->mb.slice_table, i_mb_count * sizeof(uint32_t) );
/* 0 -> 3 top(4), 4 -> 6 : left(3) */
PREALLOC( h->mb.intra4x4_pred_mode, i_mb_count * 8 * sizeof(int8_t) );
@ -326,7 +326,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
PREALLOC_END( h->mb.base );
memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint16_t) );
memset( h->mb.slice_table, -1, i_mb_count * sizeof(uint32_t) );
for( int i = 0; i < 2; i++ )
{
@ -388,7 +388,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
((me_range*2+24) * sizeof(int16_t) + (me_range+4) * (me_range+1) * 4 * sizeof(mvsad_t));
scratch_size = X264_MAX3( buf_hpel, buf_ssim, buf_tesa );
}
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+7)&~7) * sizeof(int16_t);
int buf_mbtree = h->param.rc.b_mb_tree * ((h->mb.i_mb_width+15)&~15) * sizeof(int16_t);
scratch_size = X264_MAX( scratch_size, buf_mbtree );
if( scratch_size )
CHECKED_MALLOC( h->scratch_buffer, scratch_size );
@ -532,16 +532,16 @@ void x264_macroblock_thread_init( x264_t *h )
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
if( CHROMA444 )
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 38*FDEC_STRIDE;
}
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
}
}
@ -1738,7 +1738,7 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.i_last_dqp = 0;
h->mb.i_cbp_chroma = CHROMA444 ? 0 : 2;
h->mb.i_cbp_luma = 0xf;
h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x700;
h->mb.cbp[i_mb_xy] = (h->mb.i_cbp_chroma << 4) | h->mb.i_cbp_luma | 0x1700;
h->mb.b_transform_8x8 = 0;
for( int i = 0; i < 48; i++ )
h->mb.cache.non_zero_count[x264_scan8[i]] = h->param.b_cabac ? 1 : 16;

@ -325,15 +325,14 @@ void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
}
}
static void x264_plane_copy_deinterleave_c( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h )
{
for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
for( int y=0; y<h; y++, dsta+=i_dsta, dstb+=i_dstb, src+=i_src )
for( int x=0; x<w; x++ )
{
dstu[x] = src[2*x];
dstv[x] = src[2*x+1];
dsta[x] = src[2*x];
dstb[x] = src[2*x+1];
}
}
@ -362,7 +361,7 @@ static ALWAYS_INLINE uint32_t v210_endian_fix32( uint32_t x )
#define v210_endian_fix32(x) (x)
#endif
void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
static void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
pixel *dstc, intptr_t i_dstc,
uint32_t *src, intptr_t i_src, int w, int h )
{
@ -649,6 +648,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
pf->plane_copy_swap = x264_plane_copy_swap_c;
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;

@ -160,6 +160,39 @@ static void x264_plane_copy_swap_##cpu( pixel *dst, intptr_t i_dst, pixel *src,
x264_plane_copy_swap_c( dst, i_dst, src, i_src, w, h );\
}
void x264_plane_copy_deinterleave_c( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
/* We can utilize existing plane_copy_deinterleave() functions for YUYV/UYVY
* input with the additional constraint that we cannot overread src. */
#define PLANE_COPY_YUYV(align, cpu)\
static void x264_plane_copy_deinterleave_yuyv_##cpu( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,\
pixel *src, intptr_t i_src, int w, int h )\
{\
int c_w = (align>>1) / sizeof(pixel) - 1;\
if( !(w&c_w) )\
x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
else if( w > c_w )\
{\
if( --h > 0 )\
{\
if( i_src > 0 )\
{\
x264_plane_copy_deinterleave_##cpu( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
dsta += i_dsta * h;\
dstb += i_dstb * h;\
src += i_src * h;\
}\
else\
x264_plane_copy_deinterleave_##cpu( dsta+i_dsta, i_dsta, dstb+i_dstb, i_dstb,\
src+i_src, i_src, w, h );\
}\
x264_plane_copy_deinterleave_c( dsta, 0, dstb, 0, src, 0, w, 1 );\
}\
else\
x264_plane_copy_deinterleave_c( dsta, i_dsta, dstb, i_dstb, src, i_src, w, h );\
}
void x264_plane_copy_interleave_c( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
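The PLANE_COPY_YUYV wrapper above is what makes the reuse safe: when the width is not a multiple of the vector chunk it runs the SIMD deinterleave on every row except the one at the end of the allocation (chosen by the sign of i_src), then finishes that single row with the C version (stride 0, height 1), so any within-row overread by the asm stays inside memory the caller owns; widths at or below the chunk size fall back to C entirely.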
@ -260,6 +293,8 @@ typedef struct
/* may write up to 15 pixels off the end of each plane */
void (*plane_copy_deinterleave)( pixel *dstu, intptr_t i_dstu, pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_yuyv)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,

@ -108,10 +108,10 @@ int x264_is_pipe( const char *path );
#else
#define DECLARE_ALIGNED( var, n ) var __attribute__((aligned(n)))
#endif
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_4( var ) DECLARE_ALIGNED( var, 4 )
#define ALIGNED_8( var ) DECLARE_ALIGNED( var, 8 )
#define ALIGNED_16( var ) DECLARE_ALIGNED( var, 16 )
// ARM compilers don't reliably align stack variables
// - EABI requires only 8 byte stack alignment to be maintained
@ -125,39 +125,39 @@ int x264_is_pipe( const char *path );
type (*name) __VA_ARGS__ = (void*)((intptr_t)(name##_u+mask) & ~mask)
#if ARCH_ARM && SYS_MACOSX
#define ALIGNED_ARRAY_8( ... ) ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ )
#define ALIGNED_ARRAY_8( ... ) EXPAND( ALIGNED_ARRAY_EMU( 7, __VA_ARGS__ ) )
#else
#define ALIGNED_ARRAY_8( type, name, sub1, ... )\
ALIGNED_8( type name sub1 __VA_ARGS__ )
#define ALIGNED_ARRAY_8( type, name, sub1, ... ) ALIGNED_8( type name sub1 __VA_ARGS__ )
#endif
#if ARCH_ARM
#define ALIGNED_ARRAY_16( ... ) ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ )
#define ALIGNED_ARRAY_16( ... ) EXPAND( ALIGNED_ARRAY_EMU( 15, __VA_ARGS__ ) )
#else
#define ALIGNED_ARRAY_16( type, name, sub1, ... )\
ALIGNED_16( type name sub1 __VA_ARGS__ )
#define ALIGNED_ARRAY_16( type, name, sub1, ... ) ALIGNED_16( type name sub1 __VA_ARGS__ )
#endif
#define EXPAND(x) x
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN 64
#define ALIGNED_32( var ) DECLARE_ALIGNED( var, 32 )
#define ALIGNED_64( var ) DECLARE_ALIGNED( var, 64 )
#if STACK_ALIGNMENT >= 32
#define ALIGNED_ARRAY_32( type, name, sub1, ... )\
ALIGNED_32( type name sub1 __VA_ARGS__ )
#define ALIGNED_ARRAY_32( type, name, sub1, ... ) ALIGNED_32( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_32( ... ) EXPAND( ALIGNED_ARRAY_EMU( 31, __VA_ARGS__ ) )
#endif
#if STACK_ALIGNMENT >= 64
#define ALIGNED_ARRAY_64( type, name, sub1, ... ) ALIGNED_64( type name sub1 __VA_ARGS__ )
#else
#define ALIGNED_ARRAY_64( ... ) EXPAND( ALIGNED_ARRAY_EMU( 63, __VA_ARGS__ ) )
/* For AVX2 */
#if ARCH_X86 || ARCH_X86_64
#define NATIVE_ALIGN 32
#define ALIGNED_N ALIGNED_32
#define ALIGNED_ARRAY_N ALIGNED_ARRAY_32
#endif
#else
#define NATIVE_ALIGN 16
#define ALIGNED_N ALIGNED_16
#define ALIGNED_ARRAY_N ALIGNED_ARRAY_16
#define ALIGNED_32 ALIGNED_16
#define ALIGNED_64 ALIGNED_16
#define ALIGNED_ARRAY_32 ALIGNED_ARRAY_16
#define ALIGNED_ARRAY_64 ALIGNED_ARRAY_16
#endif
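All of the emulated ALIGNED_ARRAY_* forms above reduce to the same over-allocate-and-round trick from the top of the hunk. A standalone sketch of that idea, simplified to a flat array with illustrative names (not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Reserve `mask` extra bytes and round the pointer up, so the array is
 * (mask+1)-byte aligned even when the stack guarantee is weaker. */
#define ALIGNED_ARRAY_EMU_SKETCH( mask, type, name, size ) \
    uint8_t name##_u[sizeof(type)*(size) + (mask)];        \
    type *name = (type*)(((uintptr_t)name##_u + (mask)) & ~(uintptr_t)(mask))

int main( void )
{
    ALIGNED_ARRAY_EMU_SKETCH( 63, int16_t, dct, 64 ); /* e.g. one 8x8 coefficient block */
    dct[0] = 1;
    printf( "64-byte aligned: %d\n", ((uintptr_t)dct & 63) == 0 );
    return 0;
}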
#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)

@ -201,28 +201,32 @@ PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 )
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
#define PIXEL_VAR2_C( name, w, h, shift ) \
static int name( pixel *pix1, intptr_t i_stride1, pixel *pix2, intptr_t i_stride2, int *ssd ) \
#define PIXEL_VAR2_C( name, h, shift ) \
static int name( pixel *fenc, pixel *fdec, int ssd[2] ) \
{ \
int var = 0, sum = 0, sqr = 0; \
int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0; \
for( int y = 0; y < h; y++ ) \
{ \
for( int x = 0; x < w; x++ ) \
for( int x = 0; x < 8; x++ ) \
{ \
int diff = pix1[x] - pix2[x]; \
sum += diff; \
sqr += diff * diff; \
int diff_u = fenc[x] - fdec[x]; \
int diff_v = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2]; \
sum_u += diff_u; \
sum_v += diff_v; \
sqr_u += diff_u * diff_u; \
sqr_v += diff_v * diff_v; \
} \
pix1 += i_stride1; \
pix2 += i_stride2; \
fenc += FENC_STRIDE; \
fdec += FDEC_STRIDE; \
} \
var = sqr - ((int64_t)sum * sum >> shift); \
*ssd = sqr; \
return var; \
ssd[0] = sqr_u; \
ssd[1] = sqr_v; \
return sqr_u - ((int64_t)sum_u * sum_u >> shift) + \
sqr_v - ((int64_t)sum_v * sum_v >> shift); \
}
PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 8, 6 )
PIXEL_VAR2_C( x264_pixel_var2_8x16, 16, 7 )
PIXEL_VAR2_C( x264_pixel_var2_8x8, 8, 6 )
#if BIT_DEPTH > 8
typedef uint32_t sum_t;
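The PIXEL_VAR2_C rewrite above folds U and V into one pass over the fixed-stride encode/decode buffers: U occupies the first 8 columns, V starts at FENC_STRIDE/2 in fenc and FDEC_STRIDE/2 in fdec, ssd[2] returns the per-plane sums of squared differences, and the return value is the two variances summed (shift 6 for the 64 pixels of 8x8, 7 for the 128 of 8x16). A standalone sketch of that contract (stride values as defined in common.h; fill pattern illustrative, not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define FENC_STRIDE 16
#define FDEC_STRIDE 32

static int var2_8x8( const uint8_t *fenc, const uint8_t *fdec, int ssd[2] )
{
    int sum_u = 0, sum_v = 0, sqr_u = 0, sqr_v = 0;
    for( int y = 0; y < 8; y++, fenc += FENC_STRIDE, fdec += FDEC_STRIDE )
        for( int x = 0; x < 8; x++ )
        {
            int du = fenc[x] - fdec[x];                              /* U plane */
            int dv = fenc[x+FENC_STRIDE/2] - fdec[x+FDEC_STRIDE/2];  /* V plane */
            sum_u += du; sqr_u += du*du;
            sum_v += dv; sqr_v += dv*dv;
        }
    ssd[0] = sqr_u;
    ssd[1] = sqr_v;
    return sqr_u - (int)(((int64_t)sum_u*sum_u) >> 6) +
           sqr_v - (int)(((int64_t)sum_v*sum_v) >> 6);
}

int main( void )
{
    uint8_t fenc[8*FENC_STRIDE], fdec[8*FDEC_STRIDE];
    int ssd[2];
    for( int i = 0; i < 8*FENC_STRIDE; i++ ) fenc[i] = i & 0xff;
    for( int i = 0; i < 8*FDEC_STRIDE; i++ ) fdec[i] = 128;
    printf( "var=%d ssd_u=%d ssd_v=%d\n", var2_8x8( fenc, fdec, ssd ), ssd[0], ssd[1] );
    return 0;
}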
@ -885,13 +889,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
#endif
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_mmx2;
@ -962,7 +959,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad, _ssse3 );
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
#endif
INIT6( satd, _ssse3 );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
@ -1003,7 +1002,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs don't benefit from an aligned version */
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
#endif
INIT6( satd, _avx );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
@ -1028,8 +1029,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( sad_x3, _xop );
INIT5( sad_x4, _xop );
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->vsad = x264_pixel_vsad_xop;
pixf->asd8 = x264_pixel_asd8_xop;
#if ARCH_X86_64
@ -1044,10 +1043,19 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
pixf->vsad = x264_pixel_vsad_avx2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@ -1067,16 +1075,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT_ADS( _mmx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
@ -1197,7 +1200,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
}
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
#endif
if( cpu&X264_CPU_SLOW_ATOM )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
@ -1280,7 +1285,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
#endif
INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
@ -1321,11 +1328,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
#if ARCH_X86_64
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_xop;
#endif
@ -1338,7 +1340,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
INIT2( hadamard_ac, _avx2 );
#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx2 );
#endif
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
@ -1351,6 +1355,21 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
INIT8( sad, _avx512 );
INIT8_NAME( sad_aligned, sad, _avx512 );
INIT7( sad_x3, _avx512 );
INIT7( sad_x4, _avx512 );
INIT8( satd, _avx512 );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif //HAVE_MMX
#if HAVE_ARMV6
@ -1480,8 +1499,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_msa;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_msa;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_msa;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
//pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_msa;
//pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_msa;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8;
}

@ -93,8 +93,7 @@ typedef struct
uint64_t (*sa8d_satd[1])( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t (*var[4])( pixel *pix, intptr_t stride );
int (*var2[4])( pixel *pix1, intptr_t stride1,
pixel *pix2, intptr_t stride2, int *ssd );
int (*var2[4])( pixel *fenc, pixel *fdec, int ssd[2] );
uint64_t (*hadamard_ac[4])( pixel *pix, intptr_t stride );
void (*ssd_nv12_core)( pixel *pixuv1, intptr_t stride1,

@ -293,12 +293,8 @@ void x264_sub16x16_dct8_altivec( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix
vec_vsx_st( dcvsum8, 0, dest ); \
}
static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 )
static void idct8_dc_altivec( uint8_t *dst, vec_s16_t dcv )
{
dc1 = (dc1 + 32) >> 6;
dc2 = (dc2 + 32) >> 6;
vec_s16_t dcv = { dc1, dc1, dc1, dc1, dc2, dc2, dc2, dc2 };
LOAD_ZERO;
ALTIVEC_STORE8_DC_SUM_CLIP( &dst[0*FDEC_STRIDE], dcv );
ALTIVEC_STORE8_DC_SUM_CLIP( &dst[1*FDEC_STRIDE], dcv );
@ -308,8 +304,18 @@ static void idct8_dc_altivec( uint8_t *dst, int16_t dc1, int16_t dc2 )
void x264_add8x8_idct_dc_altivec( uint8_t *p_dst, int16_t dct[4] )
{
idct8_dc_altivec( &p_dst[0], dct[0], dct[1] );
idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dct[2], dct[3] );
vec_s16_t dcv;
vec_s16_t v32 = vec_sl( vec_splat_s16( 8 ), vec_splat_u16( 2 ) );
vec_u16_t v6 = vec_splat_u16( 6 );
vec_s16_t dctv = vec_vsx_ld( 0, dct );
dctv = vec_sra( vec_add( dctv, v32 ), v6 );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 0 ), (vec_s32_t)vec_splat( dctv, 1 ) );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
idct8_dc_altivec( &p_dst[0], dcv );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)vec_splat( dctv, 2 ), (vec_s32_t)vec_splat( dctv, 3 ) );
dcv = (vec_s16_t)vec_mergeh( (vec_s32_t)dcv, (vec_s32_t)dcv );
idct8_dc_altivec( &p_dst[4*FDEC_STRIDE+0], dcv );
}
#define IDCT_1D_ALTIVEC(s0, s1, s2, s3, d0, d1, d2, d3) \

@ -32,19 +32,6 @@
typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
uint8_t *dst, intptr_t i_dst, int i_height );
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
pix[1*i_pix_next]) - 5*pix[ 2*i_pix_next] +
pix[ 3*i_pix_next];
}
static inline int x264_tapfilter1( uint8_t *pix )
{
return pix[-2] - 5*pix[-1] + 20*(pix[0] + pix[1]) - 5*pix[ 2] +
pix[ 3];
}
static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst,
uint8_t *src1, intptr_t i_src1,
uint8_t *src2, int i_height )

@ -460,9 +460,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
#if ARCH_X86
pf->denoise_dct = x264_denoise_dct_mmx;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
@ -473,8 +470,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif
pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
}
if( cpu&X264_CPU_SSE2 )
{
@ -499,17 +494,18 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run8 = x264_coeff_level_run8_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
}
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last8 = x264_coeff_last8_sse2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
pf->coeff_last4 = x264_coeff_last4_lzcnt;
pf->coeff_last8 = x264_coeff_last8_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
{
@ -557,8 +553,20 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->dequant_8x8 = x264_dequant_8x8_avx2;
pf->dequant_4x4_dc = x264_dequant_4x4dc_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->dequant_4x4 = x264_dequant_4x4_avx512;
pf->dequant_8x8 = x264_dequant_8x8_avx512;
pf->decimate_score15 = x264_decimate_score15_avx512;
pf->decimate_score16 = x264_decimate_score16_avx512;
pf->decimate_score64 = x264_decimate_score64_avx512;
pf->coeff_last4 = x264_coeff_last4_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
@ -586,9 +594,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_mmx2;
pf->quant_8x8 = x264_quant_8x8_mmx2;
pf->quant_4x4_dc = x264_quant_4x4_dc_mmx2;
pf->decimate_score15 = x264_decimate_score15_mmx2;
pf->decimate_score16 = x264_decimate_score16_mmx2;
pf->decimate_score64 = x264_decimate_score64_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
@ -599,13 +604,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last8 = x264_coeff_last8_mmx2;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last8 = x264_coeff_last8_mmx2_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_mmx2_lzcnt;
}
}
if( cpu&X264_CPU_SSE2 )
@ -634,14 +632,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
}
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
}
pf->coeff_last4 = x264_coeff_last4_lzcnt;
pf->coeff_last8 = x264_coeff_last8_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_lzcnt;
pf->coeff_level_run4 = x264_coeff_level_run4_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_lzcnt;
}
if( cpu&X264_CPU_SSSE3 )
@ -657,17 +660,19 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3;
if( cpu&X264_CPU_LZCNT )
{
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3_lzcnt;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
}
#endif
}
if( cpu&X264_CPU_SSE4 )
@ -717,12 +722,28 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
}
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
if( cpu&X264_CPU_LZCNT )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
#endif
}
if( cpu&X264_CPU_AVX512 )
{
if( h->param.i_cqm_preset == X264_CQM_FLAT )
pf->dequant_8x8 = x264_dequant_8x8_flat16_avx512;
else
{
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2_lzcnt;
pf->dequant_4x4 = x264_dequant_4x4_avx512;
pf->dequant_8x8 = x264_dequant_8x8_avx512;
}
pf->decimate_score15 = x264_decimate_score15_avx512;
pf->decimate_score16 = x264_decimate_score16_avx512;
pf->decimate_score64 = x264_decimate_score64_avx512;
pf->coeff_last8 = x264_coeff_last8_avx512;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_avx512;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_avx512;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx512;
}
#endif // HAVE_MMX

@ -53,21 +53,32 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
%endmacro
cextern coeff_last4_mmx2
cextern coeff_last4_mmx2_lzcnt
cextern coeff_last4_lzcnt
%if HIGH_BIT_DEPTH
cextern coeff_last4_avx512
%endif
cextern coeff_last15_sse2
cextern coeff_last15_sse2_lzcnt
cextern coeff_last15_lzcnt
cextern coeff_last15_avx512
cextern coeff_last16_sse2
cextern coeff_last16_sse2_lzcnt
cextern coeff_last16_lzcnt
cextern coeff_last16_avx512
cextern coeff_last64_sse2
cextern coeff_last64_sse2_lzcnt
cextern coeff_last64_avx2_lzcnt
cextern coeff_last64_lzcnt
cextern coeff_last64_avx2
cextern coeff_last64_avx512
%ifdef PIC
SECTION .data
%endif
coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_lzcnt: COEFF_LAST_TABLE lzcnt, lzcnt, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
coeff_last_avx2: COEFF_LAST_TABLE lzcnt, avx2, lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%if HIGH_BIT_DEPTH
coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%else
coeff_last_avx512: COEFF_LAST_TABLE lzcnt, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
%endif
%endif
SECTION .text
@ -100,7 +111,7 @@ struc cb
.start: pointer 1
.p: pointer 1
.end: pointer 1
align 16, resb 1
align 64, resb 1
.bits_encoded: resd 1
.state: resb 1024
endstruc
@ -352,25 +363,33 @@ CABAC bmi2
%endmacro
%macro ABS_DCTCOEFS 2
%assign i 0
%rep %2/16
%if HIGH_BIT_DEPTH
ABSD m0, [%1+ 0+i*64], m4
ABSD m1, [%1+16+i*64], m5
ABSD m2, [%1+32+i*64], m4
ABSD m3, [%1+48+i*64], m5
mova [rsp+ 0+i*64], m0
mova [rsp+16+i*64], m1
mova [rsp+32+i*64], m2
mova [rsp+48+i*64], m3
%define %%abs ABSD
%else
ABSW m0, [%1+ 0+i*32], m2
ABSW m1, [%1+16+i*32], m3
mova [rsp+ 0+i*32], m0
mova [rsp+16+i*32], m1
%endif
%define %%abs ABSW
%endif
%if mmsize == %2*SIZEOF_DCTCOEF
%%abs m0, [%1], m1
mova [rsp], m0
%elif mmsize == %2*SIZEOF_DCTCOEF/2
%%abs m0, [%1+0*mmsize], m2
%%abs m1, [%1+1*mmsize], m3
mova [rsp+0*mmsize], m0
mova [rsp+1*mmsize], m1
%else
%assign i 0
%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
%%abs m0, [%1+(4*i+0)*mmsize], m4
%%abs m1, [%1+(4*i+1)*mmsize], m5
%%abs m2, [%1+(4*i+2)*mmsize], m4
%%abs m3, [%1+(4*i+3)*mmsize], m5
mova [rsp+(4*i+0)*mmsize], m0
mova [rsp+(4*i+1)*mmsize], m1
mova [rsp+(4*i+2)*mmsize], m2
mova [rsp+(4*i+3)*mmsize], m3
%assign i i+1
%endrep
%endif
%endmacro
%macro SIG_OFFSET 1
@ -403,16 +422,14 @@ CABAC bmi2
%endif
%ifdef PIC
cglobal func, 4,13
cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
lea r12, [$$]
%define GLOBAL +r12-$$
%else
cglobal func, 4,12
cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
%define GLOBAL
%endif
%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
SUB rsp, pad
shl r1d, 4 ; MB_INTERLACED*16
%if %1
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
@ -429,15 +446,13 @@ CABAC bmi2
ABS_DCTCOEFS r0, 64
%else
mov r4, r0 ; r4 = dct
mov r6, ~SIZEOF_DCTCOEF
and r6, r4 ; handle AC coefficient case
ABS_DCTCOEFS r6, 16
sub r4, r6 ; calculate our new dct pointer
and r4, ~SIZEOF_DCTCOEF ; handle AC coefficient case
ABS_DCTCOEFS r4, 16
xor r4, r0 ; calculate our new dct pointer
add r4, rsp ; restore AC coefficient offset
%endif
mov r1, [%2+gprsize*r2 GLOBAL]
; for improved OOE performance, run coeff_last on the original coefficients.
call r1 ; coeff_last[ctx_block_cat]( dct )
call [%2+gprsize*r2 GLOBAL] ; coeff_last[ctx_block_cat]( dct )
; we know on 64-bit that the SSE2 versions of this function only
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
; don't need r2 in 8x8 mode.
@ -521,7 +536,6 @@ CABAC bmi2
jge .coeff_loop
.end:
mov [r3+cb.bits_encoded-cb.state], r0d
ADD rsp, pad
RET
%endmacro
@ -529,15 +543,23 @@ CABAC bmi2
INIT_XMM sse2
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
INIT_XMM lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
INIT_XMM ssse3
CABAC_RESIDUAL_RD 0, coeff_last_sse2
CABAC_RESIDUAL_RD 1, coeff_last_sse2
INIT_XMM ssse3,lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx512
%endif
CABAC_RESIDUAL_RD 0, coeff_last_avx512
INIT_ZMM avx512
CABAC_RESIDUAL_RD 1, coeff_last_avx512
%endif
;-----------------------------------------------------------------------------
@ -615,7 +637,7 @@ CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
%endmacro
%macro CABAC_RESIDUAL 1
cglobal cabac_block_residual_internal, 4,15
cglobal cabac_block_residual_internal, 4,15,0,-4*64
%ifdef PIC
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
lea r7, [$$]
@ -625,8 +647,6 @@ cglobal cabac_block_residual_internal, 4,15
%define lastm r7d
%define GLOBAL
%endif
%assign pad gprsize+4*2+4*64-(stack_offset&15)
SUB rsp, pad
shl r1d, 4
%define sigoffq r8
@ -653,8 +673,7 @@ cglobal cabac_block_residual_internal, 4,15
mov dct, r0
mov leveloffm, leveloffd
mov r1, [%1+gprsize*r2 GLOBAL]
call r1
call [%1+gprsize*r2 GLOBAL]
mov lastm, eax
; put cabac in r0; needed for cabac_encode_decision
mov r0, r3
@ -742,15 +761,16 @@ cglobal cabac_block_residual_internal, 4,15
%endif
dec coeffidxd
jge .level_loop
ADD rsp, pad
RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse2
CABAC_RESIDUAL coeff_last_sse2
INIT_XMM sse2,lzcnt
CABAC_RESIDUAL coeff_last_sse2_lzcnt
INIT_XMM avx2,bmi2
CABAC_RESIDUAL coeff_last_avx2_lzcnt
INIT_XMM lzcnt
CABAC_RESIDUAL coeff_last_lzcnt
INIT_XMM avx2
CABAC_RESIDUAL coeff_last_avx2
INIT_XMM avx512
CABAC_RESIDUAL coeff_last_avx512
%endif

@ -53,18 +53,16 @@ cglobal cpu_cpuid, 5,7
RET
;-----------------------------------------------------------------------------
; void cpu_xgetbv( int op, int *eax, int *edx )
; uint64_t cpu_xgetbv( int xcr )
;-----------------------------------------------------------------------------
cglobal cpu_xgetbv, 3,7
push r2
push r1
mov ecx, r0d
cglobal cpu_xgetbv
movifnidn ecx, r0m
xgetbv
pop r4
mov [r4], eax
pop r4
mov [r4], edx
RET
%if ARCH_X86_64
shl rdx, 32
or rax, rdx
%endif
ret
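Note: the helper now returns the full 64-bit XCR value (edx:eax) instead of writing two out-parameters. A minimal sketch of how a caller could consume it, assuming the prototype given in the comment above; the bit test is illustrative, not the exact logic in cpu.c (XCR0 bits 1-2 cover XMM/YMM state, bits 5-7 cover opmask/ZMM state):

#include <stdint.h>

uint64_t x264_cpu_xgetbv( int xcr ); /* returns edx:eax for the requested XCR */

static int os_saves_avx512_state( void )
{
    /* bits 1,2,5,6,7 of XCR0 -> 0xe6 */
    return ( x264_cpu_xgetbv( 0 ) & 0xe6 ) == 0xe6;
}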
%if ARCH_X86_64
@ -77,7 +75,7 @@ cglobal stack_align
%if WIN64
sub rsp, 32 ; shadow space
%endif
and rsp, ~31
and rsp, ~(STACK_ALIGNMENT-1)
mov rax, r0
mov r0, r1
mov r1, r2
@ -118,7 +116,7 @@ cglobal stack_align
push ebp
mov ebp, esp
sub esp, 12
and esp, ~31
and esp, ~(STACK_ALIGNMENT-1)
mov ecx, [ebp+8]
mov edx, [ebp+12]
mov [esp], edx

@ -30,7 +30,41 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
SECTION_RODATA 64
; AVX-512 permutation indices are bit-packed to save cache
%if HIGH_BIT_DEPTH
scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3: 4x4_frame
dd 0x00a3ca95, 0x00dd8d08, 0x00e75b8c, 0x00a92919 ; bits 4-8: 8x8_frame1
dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13: 8x8_frame2
dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
; bits 19-23: 8x8_frame4
scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4: 8x8_field1
dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9: 8x8_field2
dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
cavlc_shuf_avx512: dd 0x00018820, 0x000398a4, 0x0005a928, 0x0007b9ac ; bits 0-4: interleave1
dd 0x0009ca30, 0x000bdab4, 0x000deb38, 0x000ffbbc ; bits 5-9: interleave2
dd 0x00010c01, 0x00031c85, 0x00052d09, 0x00073d8d ; bits 10-14: interleave3
dd 0x00094e11, 0x000b5e95, 0x000d6f19, 0x000f7f9d ; bits 15-19: interleave4
%else
dct_avx512: dd 0x10000000, 0x00021104, 0x3206314c, 0x60042048 ; bits 0-4: dct8x8_fenc bits 5-9: dct8x8_fdec
dd 0x98008a10, 0x20029b14, 0xba06bb5c, 0x4004aa58 ; bits 10-13: dct16x16_fenc bits 14-18: dct16x16_fdec
dd 0x54004421, 0x80025525, 0x7606756d, 0xe0046469 ; bits(e) 24-27: idct8x8_idct1 bits(e) 28-31: idct8x8_idct2
dd 0xdc00ce31, 0xa002df35, 0xfe06ff7d, 0xc004ee79 ; bits(o) 24-31: idct8x8_gather
scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3: 4x4_frame
dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9: 8x8_frame1
dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5: 8x8_field1
dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11: 8x8_field2
dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
cavlc_shuf_avx512: dw 0x0080, 0x0184, 0x0288, 0x038c, 0x0490, 0x0594, 0x0698, 0x079c ; bits 0-5: interleave1
dw 0x08a0, 0x09a4, 0x0aa8, 0x0bac, 0x0cb0, 0x0db4, 0x0eb8, 0x0fbc ; bits 6-11: interleave2
dw 0x00c1, 0x01c5, 0x02c9, 0x03cd, 0x04d1, 0x05d5, 0x06d9, 0x07dd
dw 0x08e1, 0x09e5, 0x0ae9, 0x0bed, 0x0cf1, 0x0df5, 0x0ef9, 0x0ffd
%endif
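Note on the bit-packed tables above: each permutation index set is stored at a different bit offset within the same element, and the code only ever shifts the next field down (psrld/psrlw) without masking, because the AVX-512 permutes ignore the upper index bits (single-source vpermd/vpermw read 4-5 bits per element, the two-source vpermi2d/vpermi2w forms read 5-6). A hypothetical helper, not present in the source, just to spell out the layout:

#include <stdint.h>

/* Extract one packed permutation field; the asm skips the masking step because the
 * permute hardware only looks at the low index bits anyway. */
static uint32_t packed_perm_index( uint32_t element, int shift, int bits )
{
    return ( element >> shift ) & ( ( 1u << bits ) - 1 );
}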
pw_ppmmmmpp: dw 1,1,-1,-1,-1,-1,1,1
pb_sub4frame: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
pb_sub4field: db 0,4,1,8,12,5,9,13,2,6,10,14,3,7,11,15
@ -580,6 +614,217 @@ cglobal sub16x16_dct, 3,3,6
DCT4_1D 0, 1, 2, 3, 4
STORE16_DCT_AVX2 0, 1, 2, 3, 4
ret
%macro DCT4x4_AVX512 0
psubw m0, m2 ; 0 1
psubw m1, m3 ; 3 2
SUMSUB_BA w, 1, 0, 2
SBUTTERFLY wd, 1, 0, 2
paddw m2, m1, m0
psubw m3, m1, m0
paddw m2 {k1}, m1 ; 0+1+2+3 0<<1+1-2-3<<1
psubw m3 {k1}, m0 ; 0-1-2+3 0-1<<1+2<<1-3
shufps m1, m2, m3, q2323 ; a3 b3 a2 b2 c3 d3 c2 d2
punpcklqdq m2, m3 ; a0 b0 a1 b1 c0 d0 c1 d1
SUMSUB_BA w, 1, 2, 3
shufps m1, m2, q3131 ; a1+a2 b1+b2 c1+c2 d1+d2 a1-a2 b1-b2 c1-c2 d1-d2
shufps m1, m2, q2020 ; a0+a3 b0+b3 c0+c3 d0+d3 a0-a3 b0-b3 c0-c3 d0-d3
paddw m2, m1, m3
psubw m0, m1, m3
paddw m2 {k2}, m1 ; 0'+1'+2'+3' 0'<<1+1'-2'-3'<<1
psubw m0 {k2}, m3 ; 0'-1'-2'+3' 0'-1'<<1+2'<<1-3'
%endmacro
INIT_XMM avx512
cglobal sub4x4_dct
mov eax, 0xf0aa
kmovw k1, eax
PROLOGUE 3,3
movd m0, [r1+0*FENC_STRIDE]
movd m2, [r2+0*FDEC_STRIDE]
vpbroadcastd m0 {k1}, [r1+1*FENC_STRIDE]
vpbroadcastd m2 {k1}, [r2+1*FDEC_STRIDE]
movd m1, [r1+3*FENC_STRIDE]
movd m3, [r2+3*FDEC_STRIDE]
vpbroadcastd m1 {k1}, [r1+2*FENC_STRIDE]
vpbroadcastd m3 {k1}, [r2+2*FDEC_STRIDE]
kshiftrw k2, k1, 8
pxor m4, m4
punpcklbw m0, m4
punpcklbw m2, m4
punpcklbw m1, m4
punpcklbw m3, m4
DCT4x4_AVX512
mova [r0], m2
mova [r0+16], m0
RET
INIT_ZMM avx512
cglobal dct4x4x4_internal
punpcklbw m0, m1, m4
punpcklbw m2, m3, m4
punpckhbw m1, m4
punpckhbw m3, m4
DCT4x4_AVX512
mova m1, m2
vshufi32x4 m2 {k2}, m0, m0, q2200 ; m0
vshufi32x4 m0 {k3}, m1, m1, q3311 ; m1
ret
%macro DCT8x8_LOAD_FENC_AVX512 4 ; dst, perm, row1, row2
movu %1, [r1+%3*FENC_STRIDE]
vpermt2d %1, %2, [r1+%4*FENC_STRIDE]
%endmacro
%macro DCT8x8_LOAD_FDEC_AVX512 5 ; dst, perm, tmp, row1, row2
movu %1, [r2+(%4 )*FDEC_STRIDE]
vmovddup %1 {k1}, [r2+(%4+2)*FDEC_STRIDE]
movu %3, [r2+(%5 )*FDEC_STRIDE]
vmovddup %3 {k1}, [r2+(%5+2)*FDEC_STRIDE]
vpermt2d %1, %2, %3
%endmacro
cglobal sub8x8_dct, 3,3
mova m0, [dct_avx512]
DCT8x8_LOAD_FENC_AVX512 m1, m0, 0, 4 ; 0 2 1 3
mov r1d, 0xaaaaaaaa
kmovd k1, r1d
psrld m0, 5
DCT8x8_LOAD_FDEC_AVX512 m3, m0, m2, 0, 4
mov r1d, 0xf0f0f0f0
kmovd k2, r1d
pxor xm4, xm4
knotw k3, k2
call dct4x4x4_internal_avx512
mova [r0], m0
mova [r0+64], m1
RET
%macro SUB4x16_DCT_AVX512 2 ; dst, src
vpermd m1, m5, [r1+1*%2*64]
mova m3, [r2+2*%2*64]
vpermt2d m3, m6, [r2+2*%2*64+64]
call dct4x4x4_internal_avx512
mova [r0+%1*64 ], m0
mova [r0+%1*64+128], m1
%endmacro
cglobal sub16x16_dct
psrld m5, [dct_avx512], 10
mov eax, 0xaaaaaaaa
kmovd k1, eax
mov eax, 0xf0f0f0f0
kmovd k2, eax
PROLOGUE 3,3
pxor xm4, xm4
knotw k3, k2
psrld m6, m5, 4
SUB4x16_DCT_AVX512 0, 0
SUB4x16_DCT_AVX512 1, 1
SUB4x16_DCT_AVX512 4, 2
SUB4x16_DCT_AVX512 5, 3
RET
cglobal sub8x8_dct_dc, 3,3
mova m3, [dct_avx512]
DCT8x8_LOAD_FENC_AVX512 m0, m3, 0, 4 ; 0 2 1 3
mov r1d, 0xaa
kmovb k1, r1d
psrld m3, 5
DCT8x8_LOAD_FDEC_AVX512 m1, m3, m2, 0, 4
pxor xm3, xm3
psadbw m0, m3
psadbw m1, m3
psubw m0, m1
vpmovqw xmm0, m0
vprold xmm1, xmm0, 16
paddw xmm0, xmm1 ; 0 0 2 2 1 1 3 3
punpckhqdq xmm2, xmm0, xmm0
psubw xmm1, xmm0, xmm2 ; 0-1 0-1 2-3 2-3
paddw xmm0, xmm2 ; 0+1 0+1 2+3 2+3
punpckldq xmm0, xmm1 ; 0+1 0+1 0-1 0-1 2+3 2+3 2-3 2-3
punpcklqdq xmm1, xmm0, xmm0
psubw xmm0 {k1}, xm3, xmm0
paddw xmm0, xmm1 ; 0+1+2+3 0+1-2-3 0-1+2-3 0-1-2+3
movhps [r0], xmm0
RET
cglobal sub8x16_dct_dc, 3,3
mova m5, [dct_avx512]
DCT8x8_LOAD_FENC_AVX512 m0, m5, 0, 8 ; 0 4 1 5
DCT8x8_LOAD_FENC_AVX512 m1, m5, 4, 12 ; 2 6 3 7
mov r1d, 0xaa
kmovb k1, r1d
psrld m5, 5
DCT8x8_LOAD_FDEC_AVX512 m2, m5, m4, 0, 8
DCT8x8_LOAD_FDEC_AVX512 m3, m5, m4, 4, 12
pxor xm4, xm4
psadbw m0, m4
psadbw m1, m4
psadbw m2, m4
psadbw m3, m4
psubw m0, m2
psubw m1, m3
SBUTTERFLY qdq, 0, 1, 2
paddw m0, m1
vpmovqw xmm0, m0 ; 0 2 4 6 1 3 5 7
psrlq xmm2, xmm0, 32
psubw xmm1, xmm0, xmm2 ; 0-4 2-6 1-5 3-7
paddw xmm0, xmm2 ; 0+4 2+6 1+5 3+7
punpckhdq xmm2, xmm0, xmm1
punpckldq xmm0, xmm1
psubw xmm1, xmm0, xmm2 ; 0-1+4-5 2-3+6-7 0-1-4+5 2-3-6+7
paddw xmm0, xmm2 ; 0+1+4+5 2+3+6+7 0+1-4-5 2+3-6-7
punpcklwd xmm0, xmm1
psrlq xmm2, xmm0, 32
psubw xmm1, xmm0, xmm2 ; 0+1-2-3+4+5-6-7 0-1-2+3+4-5-6+7 0+1-2-3-4-5+6+7 0-1-2+3-4+5+6-7
paddw xmm0, xmm2 ; 0+1+2+3+4+5+6+7 0-1+2-3+4-5+6-7 0+1+2+3-4-5-6-7 0-1+2-3-4+5-6+7
shufps xmm0, xmm1, q0220
mova [r0], xmm0
RET
%macro SARSUMSUB 3 ; a, b, tmp
mova m%3, m%1
vpsraw m%1 {k1}, 1
psubw m%1, m%2 ; 0-2 1>>1-3
vpsraw m%2 {k1}, 1
paddw m%2, m%3 ; 0+2 1+3>>1
%endmacro
cglobal add8x8_idct, 2,2
mova m1, [r1]
mova m2, [r1+64]
mova m3, [dct_avx512]
vbroadcasti32x4 m4, [pw_32]
mov r1d, 0xf0f0f0f0
kxnorb k2, k2, k2
kmovd k1, r1d
kmovb k3, k2
vshufi32x4 m0, m1, m2, q2020 ; 0 1 4 5 8 9 c d
vshufi32x4 m1, m2, q3131 ; 2 3 6 7 a b e f
psrlq m5, m3, 56 ; {0, 3, 1, 2, 4, 7, 5, 6} * FDEC_STRIDE
vpgatherqq m6 {k2}, [r0+m5]
SARSUMSUB 0, 1, 2
SBUTTERFLY wd, 1, 0, 2
psrlq m7, m3, 28
SUMSUB_BA w, 0, 1, 2 ; 0+1+2+3>>1 0+1>>1-2-3
vprold m1, 16 ; 0-1>>1-2+3 0-1+2-3>>1
SBUTTERFLY dq, 0, 1, 2
psrlq m3, 24
SARSUMSUB 0, 1, 2
vpermi2q m3, m1, m0
vpermt2q m1, m7, m0
paddw m3, m4 ; += 32
SUMSUB_BA w, 1, 3, 0
psraw m1, 6 ; 0'+1'+2'+3'>>1 0'+1'>>1-2'-3'
psraw m3, 6 ; 0'-1'+2'-3'>>1 0'-1'>>1-2'+3'
pxor xm0, xm0
SBUTTERFLY bw, 6, 0, 2
paddsw m1, m6
paddsw m3, m0
packuswb m1, m3
vpscatterqq [r0+m5] {k3}, m1
RET
%endif ; HIGH_BIT_DEPTH
INIT_MMX
@ -1883,3 +2128,161 @@ cglobal zigzag_interleave_8x8_cavlc, 3,3,6
mov [r2+8], r0w
RET
%endif ; !HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
mova m0, [scan_frame_avx512]
vpermd m0, m0, [r1]
mova [r0], m0
RET
cglobal zigzag_scan_4x4_field, 2,2
mova m0, [r1]
pshufd xmm1, [r1+8], q3102
mova [r0], m0
movu [r0+8], xmm1
RET
cglobal zigzag_scan_8x8_frame, 2,2
psrld m0, [scan_frame_avx512], 4
mova m1, [r1+0*64]
mova m2, [r1+1*64]
mova m3, [r1+2*64]
mova m4, [r1+3*64]
mov r1d, 0x01fe7f80
kmovd k1, r1d
kshiftrd k2, k1, 16
vpermd m5, m0, m3 ; __ __ __ __ __ __ __ __ __ __ __ __ __ __ 32 40
psrld m6, m0, 5
vpermi2d m0, m1, m2 ; 0 8 1 2 9 16 24 17 10 3 4 11 18 25 __ __
vmovdqa64 m0 {k1}, m5
mova [r0+0*64], m0
mova m5, m1
vpermt2d m1, m6, m2 ; __ 26 19 12 5 6 13 20 27 __ __ __ __ __ __ __
psrld m0, m6, 5
vpermi2d m6, m3, m4 ; 33 __ __ __ __ __ __ __ __ 34 41 48 56 49 42 35
vmovdqa32 m6 {k2}, m1
mova [r0+1*64], m6
vpermt2d m5, m0, m2 ; 28 21 14 7 15 22 29 __ __ __ __ __ __ __ __ 30
psrld m1, m0, 5
vpermi2d m0, m3, m4 ; __ __ __ __ __ __ __ 36 43 50 57 58 51 44 37 __
vmovdqa32 m5 {k1}, m0
mova [r0+2*64], m5
vpermt2d m3, m1, m4 ; __ __ 38 45 52 59 60 53 46 39 47 54 61 62 55 63
vpermd m2, m1, m2 ; 23 31 __ __ __ __ __ __ __ __ __ __ __ __ __ __
vmovdqa64 m2 {k2}, m3
mova [r0+3*64], m2
RET
cglobal zigzag_scan_8x8_field, 2,2
mova m0, [scan_field_avx512]
mova m1, [r1+0*64]
mova m2, [r1+1*64]
mova m3, [r1+2*64]
mova m4, [r1+3*64]
mov r1d, 0x3f
kmovb k1, r1d
psrld m5, m0, 5
vpermi2d m0, m1, m2
vmovdqa64 m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15
vpermt2d m1, m5, m2
psrld m5, 5
vmovdqa64 m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31
vpermt2d m2, m5, m3
psrld m5, 5
vpermt2d m3, m5, m4
mova [r0+0*64], m0
mova [r0+1*64], m1
mova [r0+2*64], m2
mova [r0+3*64], m3
RET
cglobal zigzag_interleave_8x8_cavlc, 3,3
mova m0, [cavlc_shuf_avx512]
mova m1, [r1+0*64]
mova m2, [r1+1*64]
mova m3, [r1+2*64]
mova m4, [r1+3*64]
kxnorb k1, k1, k1
por m7, m1, m2
psrld m5, m0, 5
vpermi2d m0, m1, m2 ; a0 a1 b0 b1
vpternlogd m7, m3, m4, 0xfe ; m1|m2|m3|m4
psrld m6, m5, 5
vpermi2d m5, m3, m4 ; b2 b3 a2 a3
vptestmd k0, m7, m7
vpermt2d m1, m6, m2 ; c0 c1 d0 d1
psrld m6, 5
vpermt2d m3, m6, m4 ; d2 d3 c2 c3
vshufi32x4 m2, m0, m5, q1032 ; b0 b1 b2 b3
vmovdqa32 m5 {k1}, m0 ; a0 a1 a2 a3
vshufi32x4 m4, m1, m3, q1032 ; d0 d1 d2 d3
vmovdqa32 m3 {k1}, m1 ; c0 c1 c2 c3
mova [r0+0*64], m5
mova [r0+1*64], m2
mova [r0+2*64], m3
mova [r0+3*64], m4
kmovw r1d, k0
test r1d, 0x1111
setnz [r2]
test r1d, 0x2222
setnz [r2+1]
test r1d, 0x4444
setnz [r2+8]
test r1d, 0x8888
setnz [r2+9]
RET
%else ; !HIGH_BIT_DEPTH
INIT_YMM avx512
cglobal zigzag_scan_4x4_frame, 2,2
mova m0, [scan_frame_avx512]
vpermw m0, m0, [r1]
mova [r0], m0
RET
cglobal zigzag_scan_4x4_field, 2,2
mova m0, [r1]
pshuflw xmm1, [r1+4], q3102
mova [r0], m0
movq [r0+4], xmm1
RET
INIT_ZMM avx512
cglobal zigzag_scan_8x8_frame, 2,2
psrlw m0, [scan_frame_avx512], 4
scan8_avx512:
mova m1, [r1]
mova m2, [r1+64]
psrlw m3, m0, 6
vpermi2w m0, m1, m2
vpermt2w m1, m3, m2
mova [r0], m0
mova [r0+64], m1
RET
cglobal zigzag_scan_8x8_field, 2,2
mova m0, [scan_field_avx512]
jmp scan8_avx512
cglobal zigzag_interleave_8x8_cavlc, 3,3
mova m0, [cavlc_shuf_avx512]
mova m1, [r1]
mova m2, [r1+64]
psrlw m3, m0, 6
vpermi2w m0, m1, m2
vpermt2w m1, m3, m2
kxnorb k2, k2, k2
vptestmd k0, m0, m0
vptestmd k1, m1, m1
mova [r0], m0
mova [r0+64], m1
ktestw k2, k0
setnz [r2]
setnc [r2+1]
ktestw k2, k1
setnz [r2+8]
setnc [r2+9]
RET
%endif ; !HIGH_BIT_DEPTH

@ -34,6 +34,7 @@ void x264_sub16x16_dct_mmx ( dctcoef dct[16][16], pixel *pix1, pixel *pix2
void x264_sub8x8_dct_sse2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_sse2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_ssse3 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub4x4_dct_avx512 ( int16_t dct [16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_ssse3 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_ssse3( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
@ -41,12 +42,16 @@ void x264_sub16x16_dct_avx ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2
void x264_sub8x8_dct_xop ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_xop ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx2 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_avx512 ( int16_t dct[ 4][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct_avx2 ( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub16x16_dct_avx512( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_mmx2 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct_dc_sse2 ( dctcoef dct [ 4], pixel *pix1, pixel *pix2 );
void x264_sub8x8_dct_dc_avx512 ( int16_t dct [ 4], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_sse2 ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_ssse3 ( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x16_dct_dc_avx ( dctcoef dct [ 8], pixel *pix1, pixel *pix2 );
void x264_sub8x16_dct_dc_avx512( int16_t dct [ 8], uint8_t *pix1, uint8_t *pix2 );
void x264_add4x4_idct_mmx ( uint8_t *p_dst, int16_t dct [16] );
void x264_add4x4_idct_sse2 ( uint16_t *p_dst, int32_t dct [16] );
@ -59,6 +64,7 @@ void x264_add16x16_idct_dc_mmx2 ( uint8_t *p_dst, int16_t dct [16] );
void x264_add8x8_idct_sse2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx2 ( pixel *p_dst, dctcoef dct[ 4][16] );
void x264_add8x8_idct_avx512 ( uint8_t *p_dst, int16_t dct[ 4][16] );
void x264_add16x16_idct_sse2 ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx ( pixel *p_dst, dctcoef dct[16][16] );
void x264_add16x16_idct_avx2 ( pixel *p_dst, dctcoef dct[16][16] );
@ -101,22 +107,26 @@ void x264_add16x16_idct8_sse2( pixel *dst, dctcoef dct[4][64] );
void x264_add8x8_idct8_avx ( pixel *dst, dctcoef dct [64] );
void x264_add16x16_idct8_avx ( pixel *dst, dctcoef dct[4][64] );
void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_8x8_frame_sse2 ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_ssse3 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_avx ( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_8x8_frame_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_frame_avx512( dctcoef level[64], dctcoef dct[64] );
void x264_zigzag_scan_4x4_frame_mmx ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_frame_ssse3 ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_4x4_frame_avx ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_xop ( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_4x4_field_sse ( int16_t level[16], int16_t dct[16] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_4x4_field_sse2 ( int32_t level[16], int32_t dct[16] );
void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] );
void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_avx ( int32_t level[64], int32_t dct[64] );
void x264_zigzag_scan_8x8_field_xop ( int16_t level[64], int16_t dct[64] );
void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] );
int x264_zigzag_sub_4x4_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4_frame_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_frame_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
@ -126,8 +136,9 @@ int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, u
int x264_zigzag_sub_4x4ac_field_avx ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_sse2 ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx ( dctcoef *dst, dctcoef *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx2( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx2 ( int16_t *dst, int16_t *src, uint8_t *nnz );
void x264_zigzag_interleave_8x8_cavlc_avx512( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif

@ -28,10 +28,14 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
insert_top_shuf: dd 0,1,4,5,7,2,3,6
SECTION_RODATA 64
load_bytes_zmm_shuf: dd 0x50404032, 0x70606053, 0xd0c0c0b4, 0xf0e0e0d5
dd 0x50404036, 0x70606057, 0xd0c0c0b8, 0xf0e0e0d9
dd 0x50104001, 0x70306023, 0xd090c083, 0xf0b0e0a5
dd 0x50104005, 0x70306027, 0xd090c087, 0xf0b0e0a9
load_bytes_ymm_shuf: dd 0x06050403, 0x0e0d0c1b, 0x07060544, 0x0f0e0d5c
dd 0x06050473, 0x0e0d0c2b, 0x07060534, 0x0f0e0d6c
transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
SECTION .text
@ -906,9 +910,8 @@ DEBLOCK_LUMA_INTRA
movq m3, %4
punpcklwd m0, m2
punpcklwd m1, m3
mova m2, m0
punpckhdq m2, m0, m1
punpckldq m0, m1
punpckhdq m2, m1
movq m4, %5
movq m6, %6
@ -916,9 +919,8 @@ DEBLOCK_LUMA_INTRA
movq m7, %8
punpcklwd m4, m6
punpcklwd m5, m7
mova m6, m4
punpckhdq m6, m4, m5
punpckldq m4, m5
punpckhdq m6, m5
punpckhqdq m1, m0, m4
punpckhqdq m3, m2, m6
@ -2278,13 +2280,10 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
RET
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
; uint8_t bs[2][4][4], int mvy_limit, int bframe )
;-----------------------------------------------------------------------------
%define scan8start (4+1*8)
%define nnz r0+scan8start
%define ref r1+scan8start
@ -2292,145 +2291,54 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
%define bs0 r3
%define bs1 r3+32
%macro LOAD_BYTES_MMX 1
movd m2, [%1+8*0-1]
movd m0, [%1+8*0]
movd m3, [%1+8*2-1]
movd m1, [%1+8*2]
punpckldq m2, [%1+8*1-1]
punpckldq m0, [%1+8*1]
punpckldq m3, [%1+8*3-1]
punpckldq m1, [%1+8*3]
%endmacro
%macro DEBLOCK_STRENGTH_REFS_MMX 0
LOAD_BYTES_MMX ref
pxor m2, m0
pxor m3, m1
por m2, [bs0+0]
por m3, [bs0+8]
movq [bs0+0], m2
movq [bs0+8], m3
movd m2, [ref-8*1]
movd m3, [ref+8*1]
punpckldq m2, m0 ; row -1, row 0
punpckldq m3, m1 ; row 1, row 2
pxor m0, m2
pxor m1, m3
por m0, [bs1+0]
por m1, [bs1+8]
movq [bs1+0], m0
movq [bs1+8], m1
%endmacro
%macro DEBLOCK_STRENGTH_MVS_MMX 2
mova m0, [mv-%2]
mova m1, [mv-%2+8]
psubw m0, [mv]
psubw m1, [mv+8]
packsswb m0, m1
ABSB m0, m1
psubusb m0, m7
packsswb m0, m0
por m0, [%1]
movd [%1], m0
%endmacro
%macro DEBLOCK_STRENGTH_NNZ_MMX 1
por m2, m0
por m3, m1
mova m4, [%1]
mova m5, [%1+8]
pminub m2, m6
pminub m3, m6
pminub m4, m6 ; mv ? 1 : 0
pminub m5, m6
paddb m2, m2 ; nnz ? 2 : 0
paddb m3, m3
pmaxub m2, m4
pmaxub m3, m5
%endmacro
%macro LOAD_BYTES_XMM 1
movu m2, [%1-4] ; FIXME could be aligned if we changed nnz's allocation
%macro LOAD_BYTES_XMM 2 ; src, aligned
%if %2
mova m2, [%1-4]
mova m1, [%1+12]
%else
movu m2, [%1-4]
movu m1, [%1+12]
pslldq m0, m2, 1
%endif
psllq m0, m2, 8
shufps m2, m1, q3131 ; cur nnz, all rows
pslldq m1, 1
psllq m1, 8
shufps m0, m1, q3131 ; left neighbors
%if cpuflag(avx) || (%2 && cpuflag(ssse3))
palignr m1, m2, [%1-20], 12
%else
pslldq m1, m2, 4
movd m3, [%1-8] ; could be palignr if nnz was aligned
movd m3, [%1-8]
por m1, m3 ; top neighbors
%endif
%endmacro
INIT_MMX mmx2
cglobal deblock_strength, 6,6
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd m7, r4d
SPLATW m7, m7
mova m6, [pb_1]
pxor m0, m0
mova [bs0+0], m0
mova [bs0+8], m0
mova [bs1+0], m0
mova [bs1+8], m0
.lists:
DEBLOCK_STRENGTH_REFS_MMX
mov r4d, 4
.mvs:
DEBLOCK_STRENGTH_MVS_MMX bs0, 4
DEBLOCK_STRENGTH_MVS_MMX bs1, 4*8
add r2, 4*8
add r3, 4
dec r4d
jg .mvs
add r1, 40
add r2, 4*8
sub r3, 16
dec r5d
jge .lists
; Check nnz
LOAD_BYTES_MMX nnz
DEBLOCK_STRENGTH_NNZ_MMX bs0
; Transpose column output
SBUTTERFLY bw, 2, 3, 4
SBUTTERFLY bw, 2, 3, 4
mova [bs0+0], m2
mova [bs0+8], m3
movd m2, [nnz-8*1]
movd m3, [nnz+8*1]
punpckldq m2, m0 ; row -1, row 0
punpckldq m3, m1 ; row 1, row 2
DEBLOCK_STRENGTH_NNZ_MMX bs1
mova [bs1+0], m2
mova [bs1+8], m3
RET
%if UNIX64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 4
%endif
%macro DEBLOCK_STRENGTH_XMM 0
cglobal deblock_strength, 6,6,7
cglobal deblock_strength, 5,5,7
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd m6, r4d
movifnidn t0d, r5m
SPLATW m6, m6
pxor m4, m4 ; bs0
pxor m5, m5 ; bs1
.lists:
; Check refs
LOAD_BYTES_XMM ref
LOAD_BYTES_XMM ref, 0
pxor m0, m2
pxor m1, m2
por m4, m0
por m5, m1
; Check mvs
%if cpuflag(ssse3)
%if cpuflag(ssse3) && notcpuflag(avx)
mova m0, [mv+4*8*0]
mova m1, [mv+4*8*1]
palignr m3, m0, [mv+4*8*0-16], 12
@ -2483,11 +2391,11 @@ cglobal deblock_strength, 6,6,7
por m5, m0
add r1, 40
add r2, 4*8*5
dec r5d
dec t0d
jge .lists
; Check nnz
LOAD_BYTES_XMM nnz
LOAD_BYTES_XMM nnz, 1
por m0, m2
por m1, m2
mova m6, [pb_1]
@ -2521,67 +2429,120 @@ DEBLOCK_STRENGTH_XMM
%macro LOAD_BYTES_YMM 1
movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
mova m2, [insert_top_shuf]
pshufb m0, m6 ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
vpbroadcastd m2, [%1-8] ; ABCD ....
vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
vpblendd m0, m0, m2, 0x80
vpermd m0, m7, m0 ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
%endmacro
INIT_YMM avx2
cglobal deblock_strength, 6,6,7
cglobal deblock_strength, 5,5,8
mova m6, [load_bytes_ymm_shuf]
; Prepare mv comparison register
shl r4d, 8
add r4d, 3 - (1<<8)
movd xm6, r4d
vpbroadcastw m6, xm6
pxor m5, m5 ; bs0,bs1
movd xm5, r4d
movifnidn t0d, r5m
vpbroadcastw m5, xm5
psrld m7, m6, 4
pxor m4, m4 ; bs0,bs1
.lists:
; Check refs
LOAD_BYTES_YMM ref
pxor m0, m1
por m5, m0
por m4, m0
; Check mvs
movu xm0, [mv-4+4*8*0]
vinserti128 m0, m0, [mv+4*8*-1], 1
vbroadcasti128 m2, [mv+4*8* 0]
vinserti128 m1, m2, [mv-4+4*8*1], 0
vbroadcasti128 m3, [mv+4*8* 1]
movu xm0, [mv+0*4*8-4]
vinserti128 m0, m0, [mv-1*4*8 ], 1
vbroadcasti128 m2, [mv+0*4*8 ]
vinserti128 m1, m2, [mv+1*4*8-4], 0
psubw m0, m2
psubw m1, m3
vinserti128 m2, m3, [mv-4+4*8*2], 0
vbroadcasti128 m4, [mv+4*8* 2]
vinserti128 m3, m4, [mv-4+4*8*3], 0
psubw m2, m4
vbroadcasti128 m4, [mv+4*8* 3]
psubw m3, m4
vbroadcasti128 m2, [mv+1*4*8 ]
psubw m1, m2
packsswb m0, m1
packsswb m2, m3
vinserti128 m1, m2, [mv+2*4*8-4], 0
vbroadcasti128 m3, [mv+2*4*8 ]
vinserti128 m2, m3, [mv+3*4*8-4], 0
psubw m1, m3
vbroadcasti128 m3, [mv+3*4*8 ]
psubw m2, m3
packsswb m1, m2
pabsb m0, m0
pabsb m2, m2
psubusb m0, m6
psubusb m2, m6
packsswb m0, m2
por m5, m0
pabsb m1, m1
psubusb m0, m5
psubusb m1, m5
packsswb m0, m1
por m4, m0
add r1, 40
add r2, 4*8*5
dec r5d
dec t0d
jge .lists
; Check nnz
LOAD_BYTES_YMM nnz
mova m2, [pb_1]
por m0, m1
mova m6, [pb_1]
pminub m0, m6
pminub m5, m6 ; mv ? 1 : 0
pminub m0, m2
pminub m4, m2 ; mv ? 1 : 0
paddb m0, m0 ; nnz ? 2 : 0
pmaxub m5, m0
vextracti128 [bs1], m5, 1
pshufb xm5, [transpose_shuf]
mova [bs0], xm5
pmaxub m0, m4
vextracti128 [bs1], m0, 1
pshufb xm0, [transpose_shuf]
mova [bs0], xm0
RET
%macro LOAD_BYTES_ZMM 1
vpermd m1, m6, [%1-12]
pshufb m1, m7 ; EF FG GH HI JK KL LM MN OP PQ QR RS TU UV VW WX
%endmacro ; AF BG CH DI FK GL HM IN KP LQ MR NS PU QV RW SX
INIT_ZMM avx512
cglobal deblock_strength, 5,5
mova m6, [load_bytes_zmm_shuf]
shl r4d, 8
add r4d, 3 - (1<<8)
vpbroadcastw m5, r4d
mov r4d, 0x34cc34cc ; {1,-1} * 11001100b
kmovb k1, r4d
vpbroadcastd m4, r4d
movifnidn t0d, r5m
psrld m7, m6, 4
pxor xm3, xm3
.lists:
vbroadcasti64x2 m2, [mv+32]
vinserti64x2 m0, m2, [mv-32], 2
vbroadcasti64x2 m1, [mv+ 0]
vinserti64x2 m0, m0, [mv- 4], 0
vbroadcasti64x2 m1 {k1}, [mv+64]
vinserti64x2 m0, m0, [mv+60], 1
psubw m0, m1
vinserti64x2 m1, m1, [mv+28], 0
vbroadcasti64x2 m2 {k1}, [mv+96]
vinserti64x2 m1, m1, [mv+92], 1
psubw m1, m2
packsswb m0, m1
pabsb m0, m0
psubusb m0, m5
LOAD_BYTES_ZMM ref
pmaddubsw m1, m4 ; E-F F-G G-H H-I ...
vpternlogd m3, m0, m1, 0xfe ; m3 | m0 | m1
add r1, 40
add r2, 4*8*5
dec t0d
jge .lists
LOAD_BYTES_ZMM nnz
mova ym2, [pb_1]
vptestmw k1, m1, m1
vptestmw k2, m3, m3
vpaddb ym0 {k1}{z}, ym2, ym2 ; nnz ? 2 : 0
vpmaxub ym0 {k2}, ym2 ; mv ? 1 : 0
vextracti128 [bs1], ym0, 1
pshufb xm0, [transpose_shuf]
mova [bs0], xm0
RET

@ -83,11 +83,11 @@ cextern deinterleave_shufd
%endmacro
%endif
%macro AVG_END 0
lea t4, [t4+t5*2*SIZEOF_PIXEL]
%macro AVG_END 0-1 2 ; rows
lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
sub eax, 2
sub eax, %1
jg .height_loop
RET
%endmacro
@ -148,17 +148,24 @@ cextern deinterleave_shufd
%macro BIWEIGHT_START_SSSE3 0
movzx t6d, byte r6m ; FIXME x86_64
mov t7d, 64
sub t7d, t6d
shl t7d, 8
add t6d, t7d
%if mmsize > 16
vbroadcasti128 m4, [pw_512]
%else
mova m4, [pw_512]
movd xm3, t6d
%endif
lea t7d, [t6+(64<<8)]
shl t6d, 8
sub t7d, t6d
%if cpuflag(avx512)
vpbroadcastw m3, t7d
%else
movd xm3, t7d
%if cpuflag(avx2)
vpbroadcastw m3, xm3
%else
SPLATW m3, m3 ; weight_dst,src
%endif
%endif
%endmacro
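The rewritten BIWEIGHT_START packs the two weights as ((64-w)<<8)|w so that pmaddubsw on interleaved pixel bytes yields the weighted sum directly, and pmulhrsw against pw_512 performs the rounded shift. A scalar sketch of the identity being used, with the operand/weight pairing taken from the comments rather than traced through every caller:

#include <stdint.h>

/* pmulhrsw(x, 512) == (x*512 + 0x4000) >> 15 == (x + 32) >> 6 for non-negative x */
static uint8_t biweight_px( uint8_t a, uint8_t b, int w ) /* 0 <= w <= 64 */
{
    int sum = a*w + b*(64 - w);             /* pmaddubsw on the interleaved (a,b) byte pair */
    return (uint8_t)( ( sum + 32 ) >> 6 );  /* pmulhrsw with pw_512, then packuswb */
}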
%if HIGH_BIT_DEPTH
@ -268,6 +275,66 @@ cglobal pixel_avg_weight_w16
mova [t0], xm0
vextracti128 [t0+t1], m0, 1
AVG_END
INIT_YMM avx512
cglobal pixel_avg_weight_w8
BIWEIGHT_START
kxnorb k1, k1, k1
kaddb k1, k1, k1
AVG_START 5
.height_loop:
movq xm0, [t2]
movq xm2, [t4]
movq xm1, [t2+t3]
movq xm5, [t4+t5]
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
vpbroadcastq m0 {k1}, [t2]
vpbroadcastq m2 {k1}, [t4]
vpbroadcastq m1 {k1}, [t2+t3]
vpbroadcastq m5 {k1}, [t4+t5]
punpcklbw m0, m2
punpcklbw m1, m5
pmaddubsw m0, m3
pmaddubsw m1, m3
pmulhrsw m0, m4
pmulhrsw m1, m4
packuswb m0, m1
vextracti128 xmm1, m0, 1
movq [t0], xm0
movhps [t0+t1], xm0
lea t0, [t0+t1*2]
movq [t0], xmm1
movhps [t0+t1], xmm1
AVG_END 4
INIT_ZMM avx512
cglobal pixel_avg_weight_w16
BIWEIGHT_START
AVG_START 5
.height_loop:
movu xm0, [t2]
movu xm1, [t4]
vinserti128 ym0, [t2+t3], 1
vinserti128 ym1, [t4+t5], 1
lea t2, [t2+t3*2]
lea t4, [t4+t5*2]
vinserti32x4 m0, [t2], 2
vinserti32x4 m1, [t4], 2
vinserti32x4 m0, [t2+t3], 3
vinserti32x4 m1, [t4+t5], 3
SBUTTERFLY bw, 0, 1, 2
pmaddubsw m0, m3
pmaddubsw m1, m3
pmulhrsw m0, m4
pmulhrsw m1, m4
packuswb m0, m1
mova [t0], xm0
vextracti128 [t0+t1], ym0, 1
lea t0, [t0+t1*2]
vextracti32x4 [t0], m0, 2
vextracti32x4 [t0+t1], m0, 3
AVG_END 4
%endif ;HIGH_BIT_DEPTH
;=============================================================================
@ -738,6 +805,12 @@ INIT_XMM avx2
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 16
AVGH 16, 8
INIT_XMM avx512
AVGH 16, 16
AVGH 16, 8
AVGH 8, 16
AVGH 8, 8
AVGH 8, 4
%endif ;HIGH_BIT_DEPTH
@ -2125,7 +2198,7 @@ INIT_XMM sse2
MC_CHROMA
INIT_XMM ssse3
MC_CHROMA_SSSE3
INIT_XMM ssse3, cache64
INIT_XMM cache64, ssse3
MC_CHROMA_SSSE3
INIT_XMM avx
MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64

@ -30,18 +30,15 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
pw_1024: times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
SECTION_RODATA 64
%if HIGH_BIT_DEPTH
v210_mask: times 4 dq 0xc00ffc003ff003ff
v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14
v210_shuf_avx512: db 0, 0,34, 1,35,34, 4, 4,38, 5,39,38, 8, 8,42, 9, ; luma, chroma
db 43,42,12,12,46,13,47,46,16,16,50,17,51,50,20,20,
db 54,21,55,54,24,24,58,25,59,58,28,28,62,29,63,62
v210_mask: dd 0x3ff003ff, 0xc00ffc00, 0x3ff003ff, 0xc00ffc00
v210_luma_shuf: db 1, 2, 4, 5, 6, 7, 9,10,12,13,14,15,12,13,14,15
v210_chroma_shuf: db 0, 1, 2, 3, 5, 6, 8, 9,10,11,13,14,10,11,13,14
; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800
dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
@ -58,6 +55,13 @@ deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
%endif ; !HIGH_BIT_DEPTH
pw_1024: times 16 dw 1024
filt_mul20: times 32 db 20
filt_mul15: times 16 db 1, -5
filt_mul51: times 16 db -5, 1
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
mbtree_prop_list_avx512_shuf: dw 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7
mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
mbtree_fix8_pack_shuf: db 1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
@ -1044,8 +1048,8 @@ PLANE_COPY_CORE 1
%endif ; HIGH_BIT_DEPTH
%endmacro
%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
mova m0, [%3]
%macro DEINTERLEAVE 6 ; dsta, dstb, src, dsta==dstb+8, shuffle constant, is aligned
mov%6 m0, [%3]
%if mmsize == 32
pshufb m0, %5
vpermq m0, m0, q3120
@ -1056,7 +1060,7 @@ PLANE_COPY_CORE 1
vextracti128 [%2], m0, 1
%endif
%elif HIGH_BIT_DEPTH
mova m1, [%3+mmsize]
mov%6 m1, [%3+mmsize]
psrld m2, m0, 16
psrld m3, m1, 16
pand m0, %5
@ -1181,8 +1185,8 @@ cglobal store_interleave_chroma, 5,5
%macro PLANE_DEINTERLEAVE 0
;-----------------------------------------------------------------------------
; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
; pixel *dstv, intptr_t i_dstv,
; void plane_copy_deinterleave( pixel *dsta, intptr_t i_dsta,
; pixel *dstb, intptr_t i_dstb,
; pixel *src, intptr_t i_src, int w, int h )
;-----------------------------------------------------------------------------
%if ARCH_X86_64
@ -1407,17 +1411,37 @@ cglobal plane_copy_deinterleave_v210, 7,7,7
neg r6
mov src, r4
mov org_w, r6
%if cpuflag(avx512)
vpbroadcastd m2, [v210_mask]
vpbroadcastd m3, [v210_shuf_avx512]
psrlw m3, 6 ; dw 0, 4
mova m4, [v210_shuf_avx512] ; luma
psrlw m5, m4, 8 ; chroma
%else
%if mmsize == 32
vbroadcasti128 m2, [v210_mask]
vbroadcasti128 m3, [v210_luma_shuf]
vbroadcasti128 m4, [v210_chroma_shuf]
%else
mova m2, [v210_mask]
mova m3, [v210_luma_shuf]
mova m4, [v210_chroma_shuf]
%endif
mova m5, [v210_mult] ; also functions as vpermd index for avx2
pshufd m6, m5, q1102
%endif
ALIGN 16
.loop:
movu m1, [r4]
pandn m0, m2, m1
pand m1, m2
%if cpuflag(avx512)
psrld m0, 10
vpsrlvw m1, m3
mova m6, m0
vpermt2w m0, m4, m1
vpermt2w m1, m5, m6
%else
pshufb m0, m3
pshufb m1, m4
pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
@ -1425,11 +1449,12 @@ ALIGN 16
%if mmsize == 32
vpermd m0, m5, m0
vpermd m1, m5, m1
%endif
%endif
movu [r0+r6], m0
movu [r2+r6], m1
add r4, mmsize
add r6, 3*mmsize/4
add r6, mmsize*3/4
jl .loop
add r0, r1
add r2, r3
@ -1461,6 +1486,8 @@ PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
INIT_ZMM avx512
PLANE_DEINTERLEAVE_V210
%else
INIT_XMM sse2
PLANE_DEINTERLEAVE_RGB
@ -1473,82 +1500,85 @@ LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
; These functions are not general-use; not only do they require aligned input, but memcpy
; requires size to be a multiple of 16 and memzero requires size to be a multiple of 128.
;-----------------------------------------------------------------------------
; void *memcpy_aligned( void *dst, const void *src, size_t n );
;-----------------------------------------------------------------------------
%macro MEMCPY 0
cglobal memcpy_aligned, 3,3
%if mmsize == 16
%if mmsize == 32
test r2d, 16
jz .copy2
mova m0, [r1+r2-16]
mova [r0+r2-16], m0
jz .copy32
mova xm0, [r1+r2-16]
mova [r0+r2-16], xm0
sub r2d, 16
.copy2:
%endif
test r2d, 2*mmsize
jz .copy4start
jle .ret
.copy32:
%endif
test r2d, mmsize
jz .loop
mova m0, [r1+r2-mmsize]
mova [r0+r2-mmsize], m0
sub r2d, mmsize
jle .ret
.loop:
mova m0, [r1+r2-1*mmsize]
mova m1, [r1+r2-2*mmsize]
mova [r0+r2-1*mmsize], m0
mova [r0+r2-2*mmsize], m1
sub r2d, 2*mmsize
.copy4start:
test r2d, r2d
jz .ret
.copy4:
mova m0, [r1+r2-1*mmsize]
mova m1, [r1+r2-2*mmsize]
mova m2, [r1+r2-3*mmsize]
mova m3, [r1+r2-4*mmsize]
mova [r0+r2-1*mmsize], m0
mova [r0+r2-2*mmsize], m1
mova [r0+r2-3*mmsize], m2
mova [r0+r2-4*mmsize], m3
sub r2d, 4*mmsize
jg .copy4
jg .loop
.ret:
REP_RET
RET
%endmacro
INIT_MMX mmx
MEMCPY
INIT_XMM sse
MEMCPY
;-----------------------------------------------------------------------------
; void *memzero_aligned( void *dst, size_t n );
;-----------------------------------------------------------------------------
%macro MEMZERO 1
%macro MEMZERO 0
cglobal memzero_aligned, 2,2
add r0, r1
neg r1
%if mmsize == 8
pxor m0, m0
%else
xorps m0, m0
%endif
.loop:
%assign i 0
%rep %1
mova [r0 + r1 + i], m0
%assign i i+mmsize
%assign %%i mmsize
%rep 128 / mmsize
movaps [r0 + r1 - %%i], m0
%assign %%i %%i+mmsize
%endrep
add r1, mmsize*%1
jl .loop
sub r1d, 128
jg .loop
RET
%endmacro
INIT_MMX mmx
MEMZERO 8
INIT_XMM sse
MEMZERO 8
MEMCPY
MEMZERO
INIT_YMM avx
MEMZERO 4
MEMCPY
MEMZERO
INIT_ZMM avx512
MEMZERO
cglobal memcpy_aligned, 3,4
dec r2d ; offset of the last byte
rorx r3d, r2d, 2
and r2d, ~63
and r3d, 15 ; n = number of dwords minus one to copy in the tail
mova m0, [r1+r2]
not r3d ; bits 0-4: (n^15)+16, bits 16-31: 0xffff
shrx r3d, r3d, r3d ; 0xffff >> (n^15)
kmovw k1, r3d ; (1 << (n+1)) - 1
vmovdqa32 [r0+r2] {k1}, m0
sub r2d, 64
jl .ret
.loop:
mova m0, [r1+r2]
mova [r0+r2], m0
sub r2d, 64
jge .loop
.ret:
RET
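The AVX-512 memcpy stores the (possibly partial) tail block first through a k-mask built with the shrx trick commented above. A scalar re-derivation of that mask, assuming n > 0 and a multiple of 16 as the comment at the top of this block requires:

#include <stddef.h>
#include <stdint.h>

static uint16_t memcpy_tail_mask( size_t n )
{
    uint32_t ndw = ( (uint32_t)( n - 1 ) >> 2 ) & 15; /* dwords-1 to copy in the tail */
    uint32_t r   = ~ndw;                  /* bits 0-4: (ndw^15)+16, bits 16-31: 0xffff */
    return (uint16_t)( r >> ( r & 31 ) ); /* 0xffff >> (ndw^15) == (1 << (ndw+1)) - 1  */
}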
%if HIGH_BIT_DEPTH == 0
;-----------------------------------------------------------------------------
@ -2147,13 +2177,13 @@ MBTREE
cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
vbroadcastss m5, [r5]
mov r5d, r6m
lea r0, [r0+r5*2]
lea r2, [r2+r5*2]
add r5d, r5d
add r1, r5
add r2, r5
add r3, r5
add r4, r5
neg r5
sub r1, r5
sub r3, r5
sub r0, r5
mova xm4, [pw_3fff]
%if notcpuflag(avx2)
pxor xm7, xm7
@ -2165,9 +2195,8 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
pmovzxwd m2, [r1+r5] ; prop
pand xm3, xm4, [r3+r5] ; inter
pmovzxwd m3, xm3
pminsd m3, m0
pmaddwd m1, m0
psubd m3, m0, m3
psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
@ -2184,7 +2213,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
movu xm1, [r4+r5]
movu xm2, [r1+r5]
pand xm3, xm4, [r3+r5]
pminsw xm3, xm0
psubusw xm3, xm0, xm3
INT16_UNPACK 0
INT16_UNPACK 1
INT16_UNPACK 2
@ -2194,7 +2223,6 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m1, m0
subps m3, m0, m3
mulps m1, m5 ; intra*invq*fps_factor>>8
addps m1, m2 ; prop + (intra*invq*fps_factor>>8)
rcpps m2, m0 ; 1 / intra 1st approximation
@ -2205,7 +2233,7 @@ cglobal mbtree_propagate_cost, 6,6,8-2*cpuflag(avx2)
subps m2, m0 ; 2nd approximation for 1/intra
mulps m1, m2 ; / intra
%endif
vcvtps2dq m1, m1
cvtps2dq m1, m1
vextractf128 xm2, m1, 1
packssdw xm1, xm2
mova [r0+r5], xm1
@ -2219,6 +2247,39 @@ MBTREE_AVX
INIT_YMM avx2
MBTREE_AVX
INIT_ZMM avx512
cglobal mbtree_propagate_cost, 6,6
vbroadcastss m5, [r5]
mov r5d, 0x3fff3fff
vpbroadcastd ym4, r5d
mov r5d, r6m
lea r2, [r2+r5*2]
add r5d, r5d
add r1, r5
neg r5
sub r4, r5
sub r3, r5
sub r0, r5
.loop:
pmovzxwd m0, [r2+r5] ; intra
pmovzxwd m1, [r1+r5] ; prop
pmovzxwd m2, [r4+r5] ; invq
pand ym3, ym4, [r3+r5] ; inter
pmovzxwd m3, ym3
psubusw m3, m0, m3
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
vdivps m1, m0, {rn-sae}
fmaddps m1, m2, m5, m1
mulps m1, m3
cvtps2dq m1, m1
vpmovsdw [r0+r5], m1
add r5, 32
jl .loop
RET
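All three propagate_cost kernels vectorize the same per-macroblock expression; the clamp of the inter cost to the intra cost is now a single saturating subtraction (psubusw) instead of pminsd/psubd, and the AVX-512 path divides with vdivps {rn-sae} where AVX/AVX2 use an rcpps Newton step. A hedged scalar model reconstructed from the comments in this file (rounding differs slightly from the vector paths; intra costs are assumed nonzero):

#include <stdint.h>

static void propagate_cost_model( int16_t *dst, const uint16_t *prop, const uint16_t *intra,
                                  const uint16_t *inter, const uint16_t *invq,
                                  float fps_factor, int len )
{
    for( int i = 0; i < len; i++ )
    {
        uint16_t ic = inter[i] & 0x3fff;                              /* pand with pw_3fff        */
        float num = intra[i] > ic ? (float)(intra[i] - ic) : 0.0f;    /* psubusw                  */
        float amt = prop[i] + (float)intra[i] * invq[i] * fps_factor; /* prop + intra*invq*fps>>8 */
        dst[i] = intra[i] ? (int16_t)( amt * num / intra[i] ) : 0;    /* guard is ours, not x264's */
    }
}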
%macro MBTREE_PROPAGATE_LIST 0
;-----------------------------------------------------------------------------
; void mbtree_propagate_list_internal( int16_t (*mvs)[2], int16_t *propagate_amount, uint16_t *lowres_costs,
@ -2372,6 +2433,112 @@ cglobal mbtree_propagate_list_internal, 4+2*UNIX64,5+UNIX64,8
jl .loop
RET
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
; uint16_t *lowres_costs, int bipred_weight, int mb_y,
; int width, int height, int stride, int list_mask );
;-----------------------------------------------------------------------------
INIT_ZMM avx512
cglobal mbtree_propagate_list_internal, 5,7,21
mova xm16, [pw_0xc000]
vpbroadcastw xm17, r5m ; bipred_weight << 9
vpbroadcastw ym18, r10m ; 1 << (list+LOWRES_COST_SHIFT)
vbroadcasti32x8 m5, [mbtree_prop_list_avx512_shuf]
vbroadcasti32x8 m6, [pd_0123]
vpord m6, r6m {1to16} ; 0 y 1 y 2 y 3 y 4 y 5 y 6 y 7 y
vbroadcasti128 m7, [pd_8]
vbroadcasti128 m8, [pw_31]
vbroadcasti128 m9, [pw_32]
psllw m10, m9, 4
pcmpeqw ym19, ym19 ; pw_m1
vpbroadcastw ym20, r7m ; width
psrld m11, m7, 3 ; pd_1
psrld m12, m8, 16 ; pd_31
vpbroadcastd m13, r8m ; height
vpbroadcastd m14, r9m ; stride
pslld m15, m14, 16
por m15, m11 ; {1, stride, 1, stride} ...
lea r4, [r4+2*r0] ; lowres_costs
lea r3, [r3+2*r0] ; propagate_amount
lea r2, [r2+4*r0] ; mvs
neg r0
mov r6d, 0x5555ffff
kmovd k4, r6d
kshiftrd k5, k4, 16 ; 0x5555
kshiftlw k6, k4, 8 ; 0xff00
.loop:
vbroadcasti128 ym1, [r4+2*r0]
mova xm4, [r3+2*r0]
vpcmpuw k1, xm1, xm16, 5 ; if (lists_used == 3)
vpmulhrsw xm4 {k1}, xm17 ; propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
vptestmw k1, ym1, ym18
vpermw m4, m5, m4
vbroadcasti32x8 m3, [r2+4*r0] ; {mvx, mvy}
psraw m0, m3, 5
paddw m0, m6 ; {mbx, mby} = ({x, y} >> 5) + {h->mb.i_mb_x, h->mb.i_mb_y}
paddd m6, m7 ; i_mb_x += 8
pand m3, m8 ; {x, y}
vprold m1, m3, 20 ; {y, x} << 4
psubw m3 {k4}, m9, m3 ; {32-x, 32-y}, {32-x, y}
psubw m1 {k5}, m10, m1 ; ({32-y, x}, {y, x}) << 4
pmullw m3, m1
paddsw m3, m3 ; prevent signed overflow in idx0 (32*32<<5 == 0x8000)
pmulhrsw m2, m3, m4 ; idx01weight idx23weightp
pslld ym1, ym0, 16
psubw ym1, ym19
vmovdqu16 ym1 {k5}, ym0
vpcmpuw k2, ym1, ym20, 1 ; {mbx, mbx+1} < width
kunpckwd k2, k2, k2
psrad m1, m0, 16
paddd m1 {k6}, m11
vpcmpud k1 {k1}, m1, m13, 1 ; mby < height | mby+1 < height
pmaddwd m0, m15
paddd m0 {k6}, m14 ; idx0 | idx2
vmovdqu16 m2 {k2}{z}, m2 ; idx01weight | idx23weight
vptestmd k1 {k1}, m2, m2 ; mask out offsets with no changes
; We're handling dwords, but the offsets are in words so there may be partial overlaps.
; We can work around this by handling dword-aligned and -unaligned offsets separately.
vptestmd k0, m0, m11
kandnw k2, k0, k1 ; dword-aligned offsets
kmovw k3, k2
vpgatherdd m3 {k2}, [r1+2*m0]
; If there are conflicts in the offsets we have to handle them before storing the results.
; By creating a permutation index using vplzcntd we can resolve all conflicts in parallel
; in ceil(log2(n)) iterations where n is the largest number of duplicate offsets.
vpconflictd m4, m0
vpbroadcastmw2d m1, k1
vptestmd k2, m1, m4
ktestw k2, k2
jz .no_conflicts
pand m1, m4 ; mask away unused offsets to avoid false positives
vplzcntd m1, m1
pxor m1, m12 ; lzcnt gives us the distance from the msb, we want it from the lsb
.conflict_loop:
vpermd m4 {k2}{z}, m1, m2
vpermd m1 {k2}, m1, m1 ; shift the index one step forward
paddsw m2, m4 ; add the weights of conflicting offsets
vpcmpd k2, m1, m12, 2
ktestw k2, k2
jnz .conflict_loop
.no_conflicts:
paddsw m3, m2
vpscatterdd [r1+2*m0] {k3}, m3
kandw k1, k0, k1 ; dword-unaligned offsets
kmovw k2, k1
vpgatherdd m1 {k1}, [r1+2*m0]
paddsw m1, m2 ; all conflicts have already been resolved
vpscatterdd [r1+2*m0] {k2}, m1
add r0, 8
jl .loop
RET
%endif
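The gather/add/scatter above cannot rely on vpscatterdd to accumulate: lanes that target the same ref_costs offset would simply overwrite each other, which is why duplicate offsets are first merged with vpconflictd/vplzcntd in ceil(log2(n)) permute steps. A serial model of the intended end result (saturating paddsw behaviour omitted for brevity):

#include <stdint.h>

static void scatter_add_ref_costs( uint16_t *ref_costs, const int32_t *offsets,
                                   const int16_t *weights, int lanes )
{
    /* duplicate offsets accumulate; later lanes must not clobber earlier ones */
    for( int i = 0; i < lanes; i++ )
        ref_costs[ offsets[i] ] = (uint16_t)( ref_costs[ offsets[i] ] + weights[i] );
}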
%macro MBTREE_FIX8 0
;-----------------------------------------------------------------------------
; void mbtree_fix8_pack( uint16_t *dst, float *src, int count )

@ -32,7 +32,8 @@
void func##_mmx2 args;\
void func##_sse2 args;\
void func##_ssse3 args;\
void func##_avx2 args;
void func##_avx2 args;\
void func##_avx512 args;
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
@ -99,17 +100,17 @@ void x264_plane_copy_interleave_core_sse2( pixel *dst, intptr_t i_dst,
void x264_plane_copy_interleave_core_avx( pixel *dst, intptr_t i_dst,
pixel *srcu, intptr_t i_srcu,
pixel *srcv, intptr_t i_srcv, int w, int h );
void x264_plane_copy_deinterleave_sse2( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_sse2( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, intptr_t i_dstu,
uint8_t *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_ssse3( uint8_t *dsta, intptr_t i_dsta,
uint8_t *dstb, intptr_t i_dstb,
uint8_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_avx( uint16_t *dsta, intptr_t i_dsta,
uint16_t *dstb, intptr_t i_dstb,
uint16_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_avx2( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
void x264_plane_copy_deinterleave_avx2( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_rgb_sse2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
@ -123,7 +124,7 @@ void x264_plane_copy_deinterleave_rgb_avx2 ( pixel *dsta, intptr_t i_dsta,
pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc,
pixel *src, intptr_t i_src, int pw, int w, int h );
void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dstu, intptr_t i_dstu,
void x264_plane_copy_deinterleave_v210_ssse3 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
@ -132,6 +133,9 @@ void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dstu, intptr_t i_dstu,
void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_v210_avx512( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
@ -143,11 +147,12 @@ void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
void x264_memzero_aligned_sse( void *dst, size_t n );
void x264_memzero_aligned_avx( void *dst, size_t n );
void *x264_memcpy_aligned_sse ( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_avx ( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_avx512( void *dst, const void *src, size_t n );
void x264_memzero_aligned_sse ( void *dst, size_t n );
void x264_memzero_aligned_avx ( void *dst, size_t n );
void x264_memzero_aligned_avx512( void *dst, size_t n );
void x264_integral_init4h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init4h_avx2( uint16_t *sum, uint8_t *pix, intptr_t stride );
void x264_integral_init8h_sse4( uint16_t *sum, uint8_t *pix, intptr_t stride );
@ -160,13 +165,15 @@ void x264_integral_init4v_avx2( uint16_t *sum8, uint16_t *sum4, intptr_t stride
void x264_integral_init8v_mmx ( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_sse2( uint16_t *sum8, intptr_t stride );
void x264_integral_init8v_avx2( uint16_t *sum8, intptr_t stride );
void x264_mbtree_propagate_cost_sse2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_sse2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_fma4( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_fma4 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2 ( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_propagate_cost_avx2( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
void x264_mbtree_propagate_cost_avx512( int16_t *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len );
void x264_mbtree_fix8_pack_ssse3( uint16_t *dst, float *src, int count );
void x264_mbtree_fix8_pack_avx2 ( uint16_t *dst, float *src, int count );
@ -179,7 +186,7 @@ void x264_mc_chroma_##cpu( pixel *dstu, pixel *dstv, intptr_t i_dst, pixel *src,
MC_CHROMA(mmx2)
MC_CHROMA(sse2)
MC_CHROMA(ssse3)
MC_CHROMA(ssse3_cache64)
MC_CHROMA(cache64_ssse3)
MC_CHROMA(avx)
MC_CHROMA(avx2)
@ -498,6 +505,15 @@ PLANE_COPY(32, avx)
PLANE_COPY_SWAP(16, ssse3)
PLANE_COPY_SWAP(32, avx2)
#if HIGH_BIT_DEPTH
PLANE_COPY_YUYV(64, sse2)
PLANE_COPY_YUYV(64, avx)
#else
PLANE_COPY_YUYV(32, sse2)
PLANE_COPY_YUYV(32, ssse3)
#endif
PLANE_COPY_YUYV(64, avx2)
PLANE_INTERLEAVE(mmx2)
PLANE_INTERLEAVE(sse2)
#if HIGH_BIT_DEPTH
@ -538,6 +554,21 @@ PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
PROPAGATE_LIST(avx2)
#if ARCH_X86_64
void x264_mbtree_propagate_list_internal_avx512( size_t len, uint16_t *ref_costs, int16_t (*mvs)[2], int16_t *propagate_amount,
uint16_t *lowres_costs, int bipred_weight, int mb_y,
int width, int height, int stride, int list_mask );
static void x264_mbtree_propagate_list_avx512( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],
int16_t *propagate_amount, uint16_t *lowres_costs,
int bipred_weight, int mb_y, int len, int list )
{
x264_mbtree_propagate_list_internal_avx512( len, ref_costs, mvs, propagate_amount, lowres_costs, bipred_weight << 9,
mb_y << 16, h->mb.i_mb_width, h->mb.i_mb_height, h->mb.i_mb_stride,
(1 << LOWRES_COST_SHIFT) << list );
}
#endif
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_MMX) )
@ -547,8 +578,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
pf->memcpy_aligned = x264_memcpy_aligned_mmx;
pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
@ -606,6 +635,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
{
@ -661,6 +691,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
@ -677,6 +708,11 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx512;
}
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
@ -702,6 +738,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_sse2_amd;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_sse2;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_sse2;
@ -763,6 +800,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_ssse3;
}
if( !(cpu&X264_CPU_SLOW_PALIGNR) )
@ -779,7 +817,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( cpu&X264_CPU_CACHELINE_64 )
{
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
pf->mc_chroma = x264_mc_chroma_cache64_ssse3;
pf->mc_luma = mc_luma_cache64_ssse3;
pf->get_ref = get_ref_cache64_ssse3;
if( cpu&X264_CPU_SLOW_ATOM )
@ -828,10 +866,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
if( cpu&X264_CPU_AVX512 )
{
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512;
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
}
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_avx;
pf->memzero_aligned = x264_memzero_aligned_avx;
pf->plane_copy = x264_plane_copy_avx;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx;
@ -844,10 +892,20 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
pf->plane_copy_deinterleave_yuyv = x264_plane_copy_deinterleave_yuyv_avx2;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_avx2;
pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_avx2;
if( !(cpu&X264_CPU_AVX512) )
return;
pf->memcpy_aligned = x264_memcpy_aligned_avx512;
pf->memzero_aligned = x264_memzero_aligned_avx512;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx512;
#if ARCH_X86_64
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx512;
#endif
}

@ -32,6 +32,8 @@
%include "x86util.asm"
SECTION_RODATA 32
var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
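; note: the -1 entries make pshufb write zeros, so shuffling packed bytes
; through this table zero-extends them to words in place.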
hmul_16p: times 16 db 1
times 8 db 1, -1
hmul_8p: times 8 db 1
@ -701,25 +703,32 @@ SSD_NV12
%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
%elif mmsize < 32
%elif mmsize == 16
pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
%endmacro
%macro VAR_END 2
%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
HADDUW m5, m2
%else
HADDW m5, m2
%macro VAR_END 0
pmaddwd m5, [pw_1]
SBUTTERFLY dq, 5, 6, 0
paddd m5, m6
%if mmsize == 32
vextracti128 xm6, m5, 1
paddd xm5, xm6
%endif
HADDD m6, m1
MOVHL xm6, xm5
paddd xm5, xm6
%if ARCH_X86_64
punpckldq m5, m6
movq rax, m5
movq rax, xm5
%else
movd eax, xm5
%if cpuflag(avx)
pextrd edx, xm5, 1
%else
movd eax, m5
movd edx, m6
pshuflw xm5, xm5, q1032
movd edx, xm5
%endif
%endif
RET
%endmacro
@ -739,61 +748,25 @@ SSD_NV12
paddd m6, m4
%endmacro
%macro VAR_2ROW 2
mov r2d, %2
.loop:
%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
mova m3, [r0+%1]
mova m4, [r0+%1+mmsize]
%else ; !HIGH_BIT_DEPTH
mova m0, [r0]
mova m3, [r0+%1]
punpckhbw m1, m0, m7
punpcklbw m0, m7
punpckhbw m4, m3, m7
punpcklbw m3, m7
%endif ; HIGH_BIT_DEPTH
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
VAR_CORE
dec r2d
jg .loop
%endmacro
;-----------------------------------------------------------------------------
; int pixel_var_wxh( uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
cglobal pixel_var_8x16, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 8, 16
cglobal pixel_var_8x8, 2,3
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 4
VAR_END 8, 8
%if HIGH_BIT_DEPTH
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR_START 0
VAR_2ROW r1, 8
VAR_END 16, 16
mov r2d, 8
.loop:
mova m0, [r0]
mova m1, [r0+mmsize]
mova m3, [r0+r1]
mova m4, [r0+r1+mmsize]
lea r0, [r0+r1*2]
VAR_CORE
dec r2d
jg .loop
VAR_END
cglobal pixel_var_8x8, 2,3,8
lea r2, [r1*3]
@ -809,18 +782,16 @@ cglobal pixel_var_8x8, 2,3,8
mova m3, [r0+r1*4]
mova m4, [r0+r2*2]
VAR_CORE
VAR_END 8, 8
VAR_END
%endmacro ; VAR
INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; HIGH_BIT_DEPTH
%if HIGH_BIT_DEPTH == 0
%else ; HIGH_BIT_DEPTH == 0
%macro VAR 0
cglobal pixel_var_16x16, 2,3,8
VAR_START 1
@ -833,7 +804,7 @@ cglobal pixel_var_16x16, 2,3,8
VAR_CORE
dec r2d
jg .loop
VAR_END 16, 16
VAR_END
cglobal pixel_var_8x8, 2,4,8
VAR_START 1
@ -849,7 +820,7 @@ cglobal pixel_var_8x8, 2,4,8
VAR_CORE
dec r2d
jg .loop
VAR_END 8, 8
VAR_END
cglobal pixel_var_8x16, 2,4,8
VAR_START 1
@ -865,15 +836,13 @@ cglobal pixel_var_8x16, 2,4,8
VAR_CORE
dec r2d
jg .loop
VAR_END 8, 16
VAR_END
%endmacro ; VAR
INIT_XMM sse2
VAR
INIT_XMM avx
VAR
INIT_XMM xop
VAR
%endif ; !HIGH_BIT_DEPTH
INIT_YMM avx2
@ -898,209 +867,357 @@ cglobal pixel_var_16x16, 2,4,7
VAR_CORE
dec r2d
jg .loop
vextracti128 xm0, m5, 1
vextracti128 xm1, m6, 1
paddw xm5, xm0
paddd xm6, xm1
HADDW xm5, xm2
HADDD xm6, xm1
%if ARCH_X86_64
punpckldq xm5, xm6
movq rax, xm5
VAR_END
%macro VAR_AVX512_CORE 1 ; accum
%if %1
paddw m0, m2
pmaddwd m2, m2
paddw m0, m3
pmaddwd m3, m3
paddd m1, m2
paddd m1, m3
%else
movd eax, xm5
movd edx, xm6
paddw m0, m2, m3
pmaddwd m2, m2
pmaddwd m3, m3
paddd m1, m2, m3
%endif
RET
%endmacro
%macro VAR2_END 3
HADDW %2, xm1
movd r1d, %2
imul r1d, r1d
HADDD %3, xm1
shr r1d, %1
movd eax, %3
movd [r4], %3
sub eax, r1d ; sqr - (sum * sum >> shift)
RET
%macro VAR_AVX512_CORE_16x16 1 ; accum
%if HIGH_BIT_DEPTH
mova ym2, [r0]
vinserti64x4 m2, [r0+r1], 1
mova ym3, [r0+2*r1]
vinserti64x4 m3, [r0+r3], 1
%else
vbroadcasti64x2 ym2, [r0]
vbroadcasti64x2 m2 {k1}, [r0+r1]
vbroadcasti64x2 ym3, [r0+2*r1]
vbroadcasti64x2 m3 {k1}, [r0+r3]
pshufb m2, m4
pshufb m3, m4
%endif
VAR_AVX512_CORE %1
%endmacro
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *, intptr_t, pixel *, intptr_t, int * )
;-----------------------------------------------------------------------------
%macro VAR2_8x8_MMX 2
cglobal pixel_var2_8x%1, 5,6
FIX_STRIDES r1, r3
VAR_START 0
mov r5d, %1
.loop:
%macro VAR_AVX512_CORE_8x8 1 ; accum
%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+mmsize]
psubw m0, [r2]
psubw m1, [r2+mmsize]
%else ; !HIGH_BIT_DEPTH
movq m0, [r0]
movq m1, m0
movq m2, [r2]
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
%endif ; HIGH_BIT_DEPTH
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
add r0, r1
add r2, r3
dec r5d
jg .loop
VAR2_END %2, m5, m6
mova xm2, [r0]
mova xm3, [r0+r1]
%else
movq xm2, [r0]
movq xm3, [r0+r1]
%endif
vinserti128 ym2, [r0+2*r1], 1
vinserti128 ym3, [r0+r2], 1
lea r0, [r0+4*r1]
vinserti32x4 m2, [r0], 2
vinserti32x4 m3, [r0+r1], 2
vinserti32x4 m2, [r0+2*r1], 3
vinserti32x4 m3, [r0+r2], 3
%if HIGH_BIT_DEPTH == 0
punpcklbw m2, m4
punpcklbw m3, m4
%endif
VAR_AVX512_CORE %1
%endmacro
INIT_ZMM avx512
cglobal pixel_var_16x16, 2,4
FIX_STRIDES r1
mov r2d, 0xf0
lea r3, [3*r1]
%if HIGH_BIT_DEPTH == 0
vbroadcasti64x4 m4, [var_shuf_avx512]
kmovb k1, r2d
%endif
VAR_AVX512_CORE_16x16 0
.loop:
lea r0, [r0+4*r1]
VAR_AVX512_CORE_16x16 1
sub r2d, 0x50
jg .loop
%if ARCH_X86_64 == 0
INIT_MMX mmx2
VAR2_8x8_MMX 8, 6
VAR2_8x8_MMX 16, 7
pop r3d
%assign regs_used 3
%endif
var_avx512_end:
vbroadcasti32x4 m2, [pw_1]
pmaddwd m0, m2
SBUTTERFLY dq, 0, 1, 2
paddd m0, m1
vextracti32x8 ym1, m0, 1
paddd ym0, ym1
vextracti128 xm1, ym0, 1
paddd xmm0, xm0, xm1
punpckhqdq xmm1, xmm0, xmm0
paddd xmm0, xmm1
%if ARCH_X86_64
movq rax, xmm0
%else
movd eax, xmm0
pextrd edx, xmm0, 1
%endif
RET
%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
cglobal pixel_var_8x8, 2,3
lea r2, [3*r1]
pxor xm4, xm4
VAR_AVX512_CORE_8x8 0
jmp var_avx512_end
%endif
cglobal pixel_var_8x16, 2,3
FIX_STRIDES r1
lea r2, [3*r1]
%if HIGH_BIT_DEPTH == 0
pxor xm4, xm4
%endif
VAR_AVX512_CORE_8x8 0
lea r0, [r0+4*r1]
VAR_AVX512_CORE_8x8 1
jmp var_avx512_end
;-----------------------------------------------------------------------------
; int pixel_var2_8x8( pixel *fenc, pixel *fdec, int ssd[2] )
;-----------------------------------------------------------------------------
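; note: compared to the old var2, the strides are gone from the prototype --
; fenc/fdec are addressed with the fixed FENC_STRIDE/FDEC_STRIDE -- and one
; call appears to cover both 8x8 chroma blocks (see the "; u" / "; v" calls in
; the ssse3 version below), storing the two SSDs in ssd[2] and returning
; (ssd_u - (sum_u^2 >> shift)) + (ssd_v - (sum_v^2 >> shift)).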
%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif
%macro VAR2_END 3 ; src, tmp, shift
movifnidn r2, r2mp
pshufd %2, %1, q3331
pmuludq %1, %1
movq [r2], %2 ; sqr_u sqr_v
psrld %1, %3
psubd %2, %1 ; sqr - (sum * sum >> shift)
MOVHL %1, %2
paddd %1, %2
movd eax, %1
RET
%endmacro
%macro VAR2_8x8_SSE2 2
cglobal pixel_var2_8x%1, 5,6,8
VAR_START 1
mov r5d, %1/2
%if HIGH_BIT_DEPTH
cglobal pixel_var2_8x%1, 2,3,6
pxor m4, m4
pxor m5, m5
%define %%sum2 m4
%define %%sqr2 m5
%else
cglobal pixel_var2_8x%1, 2,3,7
mova m6, [pw_00ff]
%define %%sum2 m0
%define %%sqr2 m1
%endif
pxor m0, m0 ; sum
pxor m1, m1 ; sqr
mov t0d, (%1-1)*FENC_STRIDEB
.loop:
%if HIGH_BIT_DEPTH
mova m0, [r0]
mova m1, [r0+r1*2]
mova m2, [r2]
mova m3, [r2+r3*2]
%else ; !HIGH_BIT_DEPTH
movq m1, [r0]
movhps m1, [r0+r1]
movq m3, [r2]
movhps m3, [r2+r3]
DEINTB 0, 1, 2, 3, 7
%endif ; HIGH_BIT_DEPTH
psubw m0, m2
psubw m1, m3
paddw m5, m0
paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m6, m0
paddd m6, m1
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
dec r5d
jg .loop
VAR2_END %2, m5, m6
mova m2, [r0+1*t0]
psubw m2, [r1+2*t0]
mova m3, [r0+1*t0+16]
psubw m3, [r1+2*t0+32]
%else
mova m3, [r0+1*t0]
movq m5, [r1+2*t0]
punpcklqdq m5, [r1+2*t0+16]
DEINTB 2, 3, 4, 5, 6
psubw m2, m4
psubw m3, m5
%endif
paddw m0, m2
pmaddwd m2, m2
paddw %%sum2, m3
pmaddwd m3, m3
paddd m1, m2
paddd %%sqr2, m3
sub t0d, FENC_STRIDEB
jge .loop
%if HIGH_BIT_DEPTH
SBUTTERFLY dq, 0, 4, 2
paddw m0, m4 ; sum_u sum_v
pmaddwd m0, [pw_1]
SBUTTERFLY dq, 1, 5, 2
paddd m1, m5 ; sqr_u sqr_v
SBUTTERFLY dq, 0, 1, 2
paddd m0, m1
%else
pmaddwd m0, [pw_1]
shufps m2, m0, m1, q2020
shufps m0, m1, q3131
paddd m0, m2
pshufd m0, m0, q3120 ; sum_u sqr_u sum_v sqr_v
%endif
VAR2_END m0, m1, %2
%endmacro
INIT_XMM sse2
VAR2_8x8_SSE2 8, 6
VAR2_8x8_SSE2 16, 7
%macro VAR2_CORE 3 ; src1, src2, accum
%if %3
paddw m0, %1
pmaddwd %1, %1
paddw m0, %2
pmaddwd %2, %2
paddd m1, %1
paddd m1, %2
%else
paddw m0, %1, %2
pmaddwd %1, %1
pmaddwd %2, %2
paddd m1, %1, %2
%endif
%endmacro
%if HIGH_BIT_DEPTH == 0
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 5,6,8
pxor m5, m5 ; sum
pxor m6, m6 ; sum squared
mova m7, [hsub_mul]
mov r5d, %1/4
INIT_XMM ssse3
cglobal pixel_var2_internal
pxor m0, m0 ; sum
pxor m1, m1 ; sqr
.loop:
movq m0, [r0]
movq m2, [r2]
movq m1, [r0+r1]
movq m3, [r2+r3]
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m0, m2
punpcklbw m1, m3
movq m2, [r0]
movq m3, [r2]
punpcklbw m2, m3
movq m3, [r0+r1]
movq m4, [r2+r3]
punpcklbw m3, m4
pmaddubsw m0, m7
pmaddubsw m1, m7
movq m2, [r0+1*t0]
punpcklbw m2, [r1+2*t0]
movq m3, [r0+1*t0-1*FENC_STRIDE]
punpcklbw m3, [r1+2*t0-1*FDEC_STRIDE]
movq m4, [r0+1*t0-2*FENC_STRIDE]
punpcklbw m4, [r1+2*t0-2*FDEC_STRIDE]
movq m5, [r0+1*t0-3*FENC_STRIDE]
punpcklbw m5, [r1+2*t0-3*FDEC_STRIDE]
pmaddubsw m2, m7
pmaddubsw m3, m7
paddw m5, m0
paddw m5, m1
paddw m5, m2
paddw m5, m3
pmaddwd m0, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
paddd m6, m0
paddd m6, m1
paddd m6, m2
paddd m6, m3
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
dec r5d
pmaddubsw m4, m7
pmaddubsw m5, m7
VAR2_CORE m2, m3, 1
VAR2_CORE m4, m5, 1
sub t0d, 4*FENC_STRIDE
jg .loop
VAR2_END %2, m5, m6
pmaddwd m0, [pw_1]
ret
%macro VAR2_8x8_SSSE3 2
cglobal pixel_var2_8x%1, 2,3,8
mova m7, [hsub_mul]
mov t0d, (%1-1)*FENC_STRIDE
call pixel_var2_internal_ssse3 ; u
add r0, 8
add r1, 16
SBUTTERFLY qdq, 0, 1, 6
paddd m1, m0
mov t0d, (%1-1)*FENC_STRIDE
call pixel_var2_internal_ssse3 ; v
SBUTTERFLY qdq, 0, 6, 2
paddd m0, m6
phaddd m1, m0 ; sum_u sqr_u sum_v sqr_v
VAR2_END m1, m0, %2
%endmacro
INIT_XMM ssse3
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
INIT_XMM xop
VAR2_8x8_SSSE3 8, 6
VAR2_8x8_SSSE3 16, 7
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset
%if HIGH_BIT_DEPTH
%if mmsize == 64
mova m2, [r1+2*%1+%2*FDEC_STRIDEB]
vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020
mova m3, [r1+2*%1+%3*FDEC_STRIDEB]
vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020
%else
mova xm2, [r1+2*%1+%2*FDEC_STRIDEB]
vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1
mova xm3, [r1+2*%1+%3*FDEC_STRIDEB]
vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1
%endif
psubw m2, [r0+1*%1+%2*FENC_STRIDEB]
psubw m3, [r0+1*%1+%3*FENC_STRIDEB]
%else
pmovzxbw m2, [r0+1*%1+%2*FENC_STRIDE]
mova m4, [r1+2*%1+%2*FDEC_STRIDE]
pmovzxbw m3, [r0+1*%1+%3*FENC_STRIDE]
mova m5, [r1+2*%1+%3*FDEC_STRIDE]
punpcklbw m4, m6
punpcklbw m5, m6
psubw m2, m4
psubw m3, m5
%endif
%endmacro
%macro VAR2_8x8_AVX2 2
cglobal pixel_var2_8x%1, 5,6,6
pxor m3, m3 ; sum
pxor m4, m4 ; sum squared
mova m5, [hsub_mul]
mov r5d, %1/4
%if HIGH_BIT_DEPTH
cglobal pixel_var2_8x%1, 2,3,4
%else
cglobal pixel_var2_8x%1, 2,3,7
pxor m6, m6
%endif
mov t0d, (%1-3)*FENC_STRIDEB
VAR2_AVX2_LOAD t0, 2, 1
VAR2_CORE m2, m3, 0
.loop:
movq xm0, [r0]
movq xm1, [r2]
vinserti128 m0, m0, [r0+r1], 1
vinserti128 m1, m1, [r2+r3], 1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m0, m1
movq xm1, [r0]
movq xm2, [r2]
vinserti128 m1, m1, [r0+r1], 1
vinserti128 m2, m2, [r2+r3], 1
lea r0, [r0+r1*2]
lea r2, [r2+r3*2]
punpcklbw m1, m2
pmaddubsw m0, m5
pmaddubsw m1, m5
paddw m3, m0
paddw m3, m1
pmaddwd m0, m0
pmaddwd m1, m1
paddd m4, m0
paddd m4, m1
dec r5d
VAR2_AVX2_LOAD t0, 0, -1
VAR2_CORE m2, m3, 1
sub t0d, 2*FENC_STRIDEB
jg .loop
vextracti128 xm0, m3, 1
vextracti128 xm1, m4, 1
paddw xm3, xm0
paddd xm4, xm1
VAR2_END %2, xm3, xm4
pmaddwd m0, [pw_1]
SBUTTERFLY qdq, 0, 1, 2
paddd m0, m1
vextracti128 xm1, m0, 1
phaddd xm0, xm1
VAR2_END xm0, xm1, %2
%endmacro
INIT_YMM avx2
VAR2_8x8_AVX2 8, 6
VAR2_8x8_AVX2 16, 7
%endif ; !HIGH_BIT_DEPTH
%macro VAR2_AVX512_END 1 ; shift
vbroadcasti32x4 m2, [pw_1]
pmaddwd m0, m2
SBUTTERFLY qdq, 0, 1, 2
paddd m0, m1
vextracti32x8 ym1, m0, 1
paddd ym0, ym1
psrlq ym1, ym0, 32
paddd ym0, ym1
vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v
VAR2_END xmm0, xmm1, %1
%endmacro
INIT_ZMM avx512
cglobal pixel_var2_8x8, 2,3
%if HIGH_BIT_DEPTH == 0
pxor xm6, xm6
%endif
VAR2_AVX2_LOAD 0, 0, 2
VAR2_CORE m2, m3, 0
VAR2_AVX2_LOAD 0, 4, 6
VAR2_CORE m2, m3, 1
VAR2_AVX512_END 6
cglobal pixel_var2_8x16, 2,3
%if HIGH_BIT_DEPTH == 0
pxor xm6, xm6
%endif
mov t0d, 10*FENC_STRIDEB
VAR2_AVX2_LOAD 0, 14, 12
VAR2_CORE m2, m3, 0
.loop:
VAR2_AVX2_LOAD t0, 0, -2
VAR2_CORE m2, m3, 1
sub t0d, 4*FENC_STRIDEB
jg .loop
VAR2_AVX512_END 7
;=============================================================================
; SATD
@ -4583,6 +4700,244 @@ cglobal intra_sad_x9_8x8, 5,7,8
mov rsp, r6
mov eax, r2d
RET
%macro SATD_AVX512_LOAD4 2 ; size, opmask
vpbroadcast%1 m0, [r0]
vpbroadcast%1 m0 {%2}, [r0+2*r1]
vpbroadcast%1 m2, [r2]
vpbroadcast%1 m2 {%2}, [r2+2*r3]
add r0, r1
add r2, r3
vpbroadcast%1 m1, [r0]
vpbroadcast%1 m1 {%2}, [r0+2*r1]
vpbroadcast%1 m3, [r2]
vpbroadcast%1 m3 {%2}, [r2+2*r3]
%endmacro
%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3
vpbroadcast%1 %{2}0, [r0]
vpbroadcast%1 %{2}0 {%3}, [r0+2*r1]
vpbroadcast%1 %{2}2, [r2]
vpbroadcast%1 %{2}2 {%3}, [r2+2*r3]
vpbroadcast%1 m0 {%4}, [r0+4*r1]
vpbroadcast%1 m2 {%4}, [r2+4*r3]
vpbroadcast%1 m0 {%5}, [r0+2*r4]
vpbroadcast%1 m2 {%5}, [r2+2*r5]
vpbroadcast%1 %{2}1, [r0+r1]
vpbroadcast%1 %{2}1 {%3}, [r0+r4]
vpbroadcast%1 %{2}3, [r2+r3]
vpbroadcast%1 %{2}3 {%3}, [r2+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
vpbroadcast%1 m1 {%4}, [r0+r1]
vpbroadcast%1 m3 {%4}, [r2+r3]
vpbroadcast%1 m1 {%5}, [r0+r4]
vpbroadcast%1 m3 {%5}, [r2+r5]
%endmacro
%macro SATD_AVX512_PACKED 0
DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4
SUMSUB_BA w, 0, 1, 2
SBUTTERFLY qdq, 0, 1, 2
SUMSUB_BA w, 0, 1, 2
HMAXABSW2 0, 1, 2, 3
%endmacro
%macro SATD_AVX512_END 0-1 0 ; sa8d
paddw m0 {k1}{z}, m1 ; zero-extend to dwords
%if ARCH_X86_64
%if mmsize == 64
vextracti32x8 ym1, m0, 1
paddd ym0, ym1
%endif
%if mmsize >= 32
vextracti128 xm1, ym0, 1
paddd xmm0, xm0, xm1
%endif
punpckhqdq xmm1, xmm0, xmm0
paddd xmm0, xmm1
movq rax, xmm0
rorx rdx, rax, 32
%if %1
lea eax, [rax+rdx+1]
shr eax, 1
%else
add eax, edx
%endif
%else
HADDD m0, m1
movd eax, xm0
%if %1
inc eax
shr eax, 1
%endif
%endif
RET
%endmacro
%macro HMAXABSW2 4 ; a, b, tmp1, tmp2
pabsw m%1, m%1
pabsw m%2, m%2
psrldq m%3, m%1, 2
psrld m%4, m%2, 16
pmaxsw m%1, m%3
pmaxsw m%2, m%4
%endmacro
INIT_ZMM avx512
cglobal pixel_satd_16x8_internal
vbroadcasti64x4 m6, [hmul_16p]
kxnorb k2, k2, k2
mov r4d, 0x55555555
knotw k2, k2
kmovd k1, r4d
lea r4, [3*r1]
lea r5, [3*r3]
satd_16x8_avx512:
vbroadcasti128 ym0, [r0]
vbroadcasti32x4 m0 {k2}, [r0+4*r1] ; 0 0 4 4
vbroadcasti128 ym4, [r2]
vbroadcasti32x4 m4 {k2}, [r2+4*r3]
vbroadcasti128 ym2, [r0+2*r1]
vbroadcasti32x4 m2 {k2}, [r0+2*r4] ; 2 2 6 6
vbroadcasti128 ym5, [r2+2*r3]
vbroadcasti32x4 m5 {k2}, [r2+2*r5]
DIFF_SUMSUB_SSSE3 0, 4, 2, 5, 6
vbroadcasti128 ym1, [r0+r1]
vbroadcasti128 ym4, [r2+r3]
vbroadcasti128 ym3, [r0+r4]
vbroadcasti128 ym5, [r2+r5]
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
vbroadcasti32x4 m1 {k2}, [r0+r1] ; 1 1 5 5
vbroadcasti32x4 m4 {k2}, [r2+r3]
vbroadcasti32x4 m3 {k2}, [r0+r4] ; 3 3 7 7
vbroadcasti32x4 m5 {k2}, [r2+r5]
DIFF_SUMSUB_SSSE3 1, 4, 3, 5, 6
HADAMARD4_V 0, 1, 2, 3, 4
HMAXABSW2 0, 2, 4, 5
HMAXABSW2 1, 3, 4, 5
paddw m4, m0, m2 ; m1
paddw m2, m1, m3 ; m0
ret
cglobal pixel_satd_8x8_internal
vbroadcasti64x4 m4, [hmul_16p]
mov r4d, 0x55555555
kmovd k1, r4d ; 01010101
kshiftlb k2, k1, 5 ; 10100000
kshiftlb k3, k1, 4 ; 01010000
lea r4, [3*r1]
lea r5, [3*r3]
satd_8x8_avx512:
SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
SATD_AVX512_PACKED ; 3 1 3 1 7 5 7 5
ret
cglobal pixel_satd_16x8, 4,6
call pixel_satd_16x8_internal_avx512
jmp satd_zmm_avx512_end
cglobal pixel_satd_16x16, 4,6
call pixel_satd_16x8_internal_avx512
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m7, m0, m1
call satd_16x8_avx512
paddw m1, m7
jmp satd_zmm_avx512_end
cglobal pixel_satd_8x8, 4,6
call pixel_satd_8x8_internal_avx512
satd_zmm_avx512_end:
SATD_AVX512_END
cglobal pixel_satd_8x16, 4,6
call pixel_satd_8x8_internal_avx512
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m5, m0, m1
call satd_8x8_avx512
paddw m1, m5
jmp satd_zmm_avx512_end
INIT_YMM avx512
cglobal pixel_satd_4x8_internal
vbroadcasti128 m4, [hmul_4p]
mov r4d, 0x55550c
kmovd k2, r4d ; 00001100
kshiftlb k3, k2, 2 ; 00110000
kshiftlb k4, k2, 4 ; 11000000
kshiftrd k1, k2, 8 ; 01010101
lea r4, [3*r1]
lea r5, [3*r3]
satd_4x8_avx512:
SATD_AVX512_LOAD8 d, xm, k2, k3, k4 ; 0 0 2 2 4 4 6 6
satd_ymm_avx512: ; 1 1 3 3 5 5 7 7
SATD_AVX512_PACKED
ret
cglobal pixel_satd_8x4, 4,5
mova m4, [hmul_16p]
mov r4d, 0x5555
kmovw k1, r4d
SATD_AVX512_LOAD4 q, k1 ; 2 0 2 0
call satd_ymm_avx512 ; 3 1 3 1
jmp satd_ymm_avx512_end2
cglobal pixel_satd_4x8, 4,6
call pixel_satd_4x8_internal_avx512
satd_ymm_avx512_end:
%if ARCH_X86_64 == 0
pop r5d
%assign regs_used 5
%endif
satd_ymm_avx512_end2:
SATD_AVX512_END
cglobal pixel_satd_4x16, 4,6
call pixel_satd_4x8_internal_avx512
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
paddw m5, m0, m1
call satd_4x8_avx512
paddw m1, m5
jmp satd_ymm_avx512_end
INIT_XMM avx512
cglobal pixel_satd_4x4, 4,5
mova m4, [hmul_4p]
mov r4d, 0x550c
kmovw k2, r4d
kshiftrw k1, k2, 8
SATD_AVX512_LOAD4 d, k2 ; 0 0 2 2
SATD_AVX512_PACKED ; 1 1 3 3
SWAP 0, 1
SATD_AVX512_END
INIT_ZMM avx512
cglobal pixel_sa8d_8x8, 4,6
vbroadcasti64x4 m4, [hmul_16p]
mov r4d, 0x55555555
kmovd k1, r4d ; 01010101
kshiftlb k2, k1, 5 ; 10100000
kshiftlb k3, k1, 4 ; 01010000
lea r4, [3*r1]
lea r5, [3*r3]
SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5
SUMSUB_BA w, 0, 1, 2
SBUTTERFLY qdq, 0, 1, 2
SUMSUB_BA w, 0, 1, 2
shufps m2, m0, m1, q2020
shufps m1, m0, m1, q3131
SUMSUB_BA w, 2, 1, 0
vshufi32x4 m0, m2, m1, q1010
vshufi32x4 m1, m2, m1, q3232
SUMSUB_BA w, 0, 1, 2
HMAXABSW2 0, 1, 2, 3
SATD_AVX512_END 1
%endif ; HIGH_BIT_DEPTH
;=============================================================================
@ -4867,7 +5222,7 @@ ASD8
add r6, 4*%1
sub r0d, 4*%1
jg .loop
WIN64_RESTORE_XMM rsp
WIN64_RESTORE_XMM
%if mmsize==32
vzeroupper
%endif

@ -52,6 +52,7 @@ DECL_X1( sad, sse2_aligned )
DECL_X1( sad, ssse3 )
DECL_X1( sad, ssse3_aligned )
DECL_X1( sad, avx2 )
DECL_X1( sad, avx512 )
DECL_X4( sad, mmx2 )
DECL_X4( sad, sse2 )
DECL_X4( sad, sse3 )
@ -59,6 +60,7 @@ DECL_X4( sad, ssse3 )
DECL_X4( sad, xop )
DECL_X4( sad, avx )
DECL_X4( sad, avx2 )
DECL_X4( sad, avx512 )
DECL_X1( ssd, mmx )
DECL_X1( ssd, mmx2 )
DECL_X1( ssd, sse2slow )
@ -75,6 +77,7 @@ DECL_X1( satd, sse4 )
DECL_X1( satd, avx )
DECL_X1( satd, xop )
DECL_X1( satd, avx2 )
DECL_X1( satd, avx512 )
DECL_X1( sa8d, mmx2 )
DECL_X1( sa8d, sse2 )
DECL_X1( sa8d, ssse3 )
@ -83,6 +86,7 @@ DECL_X1( sa8d, sse4 )
DECL_X1( sa8d, avx )
DECL_X1( sa8d, xop )
DECL_X1( sa8d, avx2 )
DECL_X1( sa8d, avx512 )
DECL_X1( sad, cache32_mmx2 );
DECL_X1( sad, cache64_mmx2 );
DECL_X1( sad, cache64_sse2 );
@ -92,11 +96,10 @@ DECL_X4( sad, cache64_mmx2 );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride ))
@ -165,16 +168,14 @@ void x264_pixel_ssim_4x4x2_core_avx ( const pixel *pix1, intptr_t stride1,
const pixel *pix2, intptr_t stride2, int sums[2][4] );
float x264_pixel_ssim_end4_sse2( int sum0[5][4], int sum1[5][4], int width );
float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_mmx2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_sse2 ( pixel *, intptr_t, pixel *, intptr_t, int * );
int x264_pixel_var2_8x16_ssse3( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_xop ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x16_avx2 ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );

@ -468,7 +468,7 @@ PREDICT_4x4 w, wd, dq, qdq
INIT_MMX mmx2
PREDICT_4x4 b, bw, wd, dq
INIT_MMX ssse3
%define predict_4x4_vr_ssse3 predict_4x4_vr_ssse3_cache64
%define predict_4x4_vr_ssse3 predict_4x4_vr_cache64_ssse3
PREDICT_4x4 b, bw, wd, dq
%endif
@ -940,7 +940,7 @@ INIT_XMM sse2
PREDICT_8x8_DDLR
INIT_XMM ssse3
PREDICT_8x8_DDLR
INIT_XMM ssse3, cache64
INIT_XMM cache64, ssse3
PREDICT_8x8_DDLR
%elif ARCH_X86_64 == 0
INIT_MMX mmx2

@ -511,8 +511,8 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
*predict_8x8_filter = x264_predict_8x8_filter_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_ssse3_cache64;
pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_ssse3_cache64;
pf[I_PRED_8x8_DDL]= x264_predict_8x8_ddl_cache64_ssse3;
pf[I_PRED_8x8_DDR]= x264_predict_8x8_ddr_cache64_ssse3;
}
if( !(cpu&X264_CPU_AVX) )
return;
@ -604,6 +604,6 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3;
pf[I_PRED_4x4_HD] = x264_predict_4x4_hd_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_ssse3_cache64;
pf[I_PRED_4x4_VR] = x264_predict_4x4_vr_cache64_ssse3;
#endif // HIGH_BIT_DEPTH
}

@ -93,12 +93,12 @@ void x264_predict_8x8_dc_left_sse2( uint16_t *src, uint16_t edge[36] );
void x264_predict_8x8_ddl_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_cache64_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddl_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_mmx2( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_ssse3_cache64( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_cache64_ssse3( pixel *src, pixel edge[36] );
void x264_predict_8x8_ddr_avx( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_sse2( pixel *src, pixel edge[36] );
void x264_predict_8x8_vl_ssse3( pixel *src, pixel edge[36] );
@ -129,7 +129,7 @@ void x264_predict_4x4_vl_avx( uint16_t *src );
void x264_predict_4x4_vr_mmx2( uint8_t *src );
void x264_predict_4x4_vr_sse2( uint16_t *src );
void x264_predict_4x4_vr_ssse3( pixel *src );
void x264_predict_4x4_vr_ssse3_cache64( uint8_t *src );
void x264_predict_4x4_vr_cache64_ssse3( uint8_t *src );
void x264_predict_4x4_vr_avx( uint16_t *src );
void x264_predict_4x4_hd_mmx2( pixel *src );
void x264_predict_4x4_hd_sse2( uint16_t *src );

@ -30,7 +30,14 @@
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA 32
SECTION_RODATA 64
%if HIGH_BIT_DEPTH
decimate_shuf_avx512: dd 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15
%else
dequant_shuf_avx512: dw 0, 2, 4, 6, 8,10,12,14,16,18,20,22,24,26,28,30
dw 32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62
%endif
%macro DQM4 3
dw %1, %2, %1, %2, %2, %3, %2, %3
@ -42,14 +49,6 @@ SECTION_RODATA 32
dw %4, %2, %6, %2, %4, %2, %6, %2
%endmacro
dequant4_scale:
DQM4 10, 13, 16
DQM4 11, 14, 18
DQM4 13, 16, 20
DQM4 14, 18, 23
DQM4 16, 20, 25
DQM4 18, 23, 29
dequant8_scale:
DQM8 20, 18, 32, 19, 25, 24
DQM8 22, 19, 35, 21, 28, 26
@ -58,6 +57,14 @@ dequant8_scale:
DQM8 32, 28, 51, 30, 40, 38
DQM8 36, 32, 58, 34, 46, 43
dequant4_scale:
DQM4 10, 13, 16
DQM4 11, 14, 18
DQM4 13, 16, 20
DQM4 14, 18, 23
DQM4 16, 20, 25
DQM4 18, 23, 29
decimate_mask_table4:
db 0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
db 3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
@ -743,6 +750,163 @@ DEQUANT 4, 4, 4
DEQUANT 8, 6, 4
%endif
%macro DEQUANT_START_AVX512 1-2 0 ; shift, flat
%if %2 == 0
movifnidn t2d, r2m
%endif
imul t0d, t2d, 0x2b
shr t0d, 8 ; i_qbits = i_qp / 6
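; note: 0x2b/256 = 43/256 is a fixed-point reciprocal of 6, so the imul+shr
; pair divides by 6 without an idiv for the QP range used here; the lea/sub
; sequence below then recovers i_qp mod 6 as i_qp - 6*(i_qp/6).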
lea t1d, [t0*5]
sub t2d, t0d
sub t2d, t1d ; i_mf = i_qp % 6
shl t2d, %1
%if %2
%ifdef PIC
%define dmf r1+t2
lea r1, [dequant8_scale]
%else
%define dmf t2+dequant8_scale
%endif
%elif ARCH_X86_64
%define dmf r1+t2
%else
%define dmf r1
add r1, r1mp ; dequant_mf[i_mf]
%endif
movifnidn r0, r0mp
%endmacro
INIT_ZMM avx512
cglobal dequant_4x4, 0,3
DEQUANT_START_AVX512 6
mova m0, [dmf]
%if HIGH_BIT_DEPTH
pmaddwd m0, [r0]
%endif
sub t0d, 4
jl .rshift
%if HIGH_BIT_DEPTH
vpbroadcastd m1, t0d
vpsllvd m0, m1
mova [r0], m0
%else
vpbroadcastw ym1, t0d
vpmovsdw ym0, m0
pmullw ym0, [r0]
vpsllvw ym0, ym1
mova [r0], ym0
%endif
RET
.rshift:
%if HIGH_BIT_DEPTH == 0
pmovzxwd m1, [r0]
pmaddwd m0, m1
%endif
mov r1d, 1<<31
shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
neg t0d
vpbroadcastd m1, r1d
vpbroadcastd m2, t0d
paddd m0, m1
vpsravd m0, m2
%if HIGH_BIT_DEPTH
mova [r0], m0
%else
vpmovsdw [r0], m0
%endif
RET
cglobal dequant_8x8, 0,3
DEQUANT_START_AVX512 8
mova m0, [dmf+0*64]
mova m1, [dmf+1*64]
mova m2, [dmf+2*64]
mova m3, [dmf+3*64]
%if HIGH_BIT_DEPTH
pmaddwd m0, [r0+0*64]
pmaddwd m1, [r0+1*64]
pmaddwd m2, [r0+2*64]
pmaddwd m3, [r0+3*64]
%else
mova m6, [dequant_shuf_avx512]
%endif
sub t0d, 6
jl .rshift
%if HIGH_BIT_DEPTH
vpbroadcastd m4, t0d
vpsllvd m0, m4
vpsllvd m1, m4
vpsllvd m2, m4
vpsllvd m3, m4
jmp .end
.rshift:
%else
vpbroadcastw m4, t0d
vpermt2w m0, m6, m1
vpermt2w m2, m6, m3
pmullw m0, [r0]
pmullw m2, [r0+64]
vpsllvw m0, m4
vpsllvw m2, m4
mova [r0], m0
mova [r0+64], m2
RET
.rshift:
pmovzxwd m4, [r0+0*32]
pmovzxwd m5, [r0+1*32]
pmaddwd m0, m4
pmaddwd m1, m5
pmovzxwd m4, [r0+2*32]
pmovzxwd m5, [r0+3*32]
pmaddwd m2, m4
pmaddwd m3, m5
%endif
mov r1d, 1<<31
shrx r1d, r1d, t0d ; 1 << (-i_qbits-1)
neg t0d
vpbroadcastd m4, r1d
vpbroadcastd m5, t0d
paddd m0, m4
paddd m1, m4
vpsravd m0, m5
vpsravd m1, m5
paddd m2, m4
paddd m3, m4
vpsravd m2, m5
vpsravd m3, m5
%if HIGH_BIT_DEPTH
.end:
mova [r0+0*64], m0
mova [r0+1*64], m1
mova [r0+2*64], m2
mova [r0+3*64], m3
%else
vpermt2w m0, m6, m1
vpermt2w m2, m6, m3
mova [r0], m0
mova [r0+64], m2
%endif
RET
%if HIGH_BIT_DEPTH == 0
cglobal dequant_8x8_flat16, 0,3
movifnidn t2d, r2m
cmp t2d, 12
jl dequant_8x8_avx512
sub t2d, 12
DEQUANT_START_AVX512 6, 1
vpbroadcastw m0, t0d
mova m1, [dmf]
vpsllvw m1, m0
pmullw m0, m1, [r0]
pmullw m1, [r0+64]
mova [r0], m0
mova [r0+64], m1
RET
%endif
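; note: the flat16 variant stays entirely in 16 bits -- it left-shifts the
; dequant table by i_qp/6 - 2 and multiplies -- which presumably only works
; once that shift is non-negative, hence the branch to the full-precision
; dequant_8x8_avx512 for qp < 12.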
%undef dmf
%macro DEQUANT_DC 2
cglobal dequant_4x4dc, 0,3,6
DEQUANT_START 6, 6
@ -1208,13 +1372,12 @@ cglobal denoise_dct, 4,4,4
; int decimate_score( dctcoef *dct )
;-----------------------------------------------------------------------------
%macro DECIMATE_MASK 5
%if mmsize==16
%macro DECIMATE_MASK 4
%if HIGH_BIT_DEPTH
movdqa m0, [%3+ 0]
movdqa m1, [%3+32]
packssdw m0, [%3+16]
packssdw m1, [%3+48]
mova m0, [%3+0*16]
packssdw m0, [%3+1*16]
mova m1, [%3+2*16]
packssdw m1, [%3+3*16]
ABSW2 m0, m1, m0, m1, m3, m4
%else
ABSW m0, [%3+ 0], m3
@ -1226,40 +1389,35 @@ cglobal denoise_dct, 4,4,4
pcmpgtb m0, %4
pmovmskb %1, m2
pmovmskb %2, m0
%else ; mmsize==8
%endmacro
%macro DECIMATE_MASK16_AVX512 0
mova m0, [r0]
%if HIGH_BIT_DEPTH
movq m0, [%3+ 0]
movq m1, [%3+16]
movq m2, [%3+32]
movq m3, [%3+48]
packssdw m0, [%3+ 8]
packssdw m1, [%3+24]
packssdw m2, [%3+40]
packssdw m3, [%3+56]
%else
movq m0, [%3+ 0]
movq m1, [%3+ 8]
movq m2, [%3+16]
movq m3, [%3+24]
%endif
ABSW2 m0, m1, m0, m1, m6, m7
ABSW2 m2, m3, m2, m3, m6, m7
packsswb m0, m1
packsswb m2, m3
pxor m4, m4
pxor m6, m6
pcmpeqb m4, m0
pcmpeqb m6, m2
pcmpgtb m0, %4
pcmpgtb m2, %4
pmovmskb %5, m4
pmovmskb %1, m6
shl %1, 8
or %1, %5
pmovmskb %5, m0
pmovmskb %2, m2
shl %2, 8
or %2, %5
vptestmd k0, m0, m0
pabsd m0, m0
vpcmpud k1, m0, [pd_1] {1to16}, 6
%else
vptestmw k0, m0, m0
pabsw m0, m0
vpcmpuw k1, m0, [pw_1], 6
%endif
%endmacro
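; note: k0 flags the nonzero coefficients and k1 the ones with |level| > 1;
; comparison predicate 6 is "not less-or-equal", i.e. unsigned greater-than.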
%macro SHRX 2
%if cpuflag(bmi2)
shrx %1, %1, %2
%else
shr %1, %2b ; %2 has to be rcx/ecx
%endif
%endmacro
%macro BLSR 2
%if cpuflag(bmi1)
blsr %1, %2
%else
lea %1, [%2-1]
and %1, %2
%endif
%endmacro
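; note: SHRX takes the shift count in any register (the plain shr fallback
; needs it in cl), and BLSR computes x & (x-1), clearing the lowest set bit.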
@ -1269,33 +1427,60 @@ cextern decimate_table8
%macro DECIMATE4x4 1
cglobal decimate_score%1, 1,3
%ifdef PIC
lea r4, [decimate_table4]
lea r5, [decimate_mask_table4]
%define table r4
%define mask_table r5
%if cpuflag(avx512)
DECIMATE_MASK16_AVX512
xor eax, eax
kmovw edx, k0
%if %1 == 15
shr edx, 1
%else
%define table decimate_table4
%define mask_table decimate_mask_table4
test edx, edx
%endif
DECIMATE_MASK edx, eax, r0, [pb_1], ecx
jz .ret
ktestw k1, k1
jnz .ret9
%else
DECIMATE_MASK edx, eax, r0, [pb_1]
xor edx, 0xffff
je .ret
jz .ret
test eax, eax
jne .ret9
%if %1==15
jnz .ret9
%if %1 == 15
shr edx, 1
%endif
%endif
%ifdef PIC
lea r4, [decimate_mask_table4]
%define mask_table r4
%else
%define mask_table decimate_mask_table4
%endif
movzx ecx, dl
movzx eax, byte [mask_table + rcx]
%if ARCH_X86_64
xor edx, ecx
jz .ret
%if cpuflag(lzcnt)
lzcnt ecx, ecx
lea r5, [decimate_table4-32]
add r5, rcx
%else
bsr ecx, ecx
lea r5, [decimate_table4-1]
sub r5, rcx
%endif
%define table r5
%else
cmp edx, ecx
je .ret
jz .ret
bsr ecx, ecx
shr edx, 1
shr edx, cl
SHRX edx, ecx
%define table decimate_table4
%endif
tzcnt ecx, edx
shr edx, 1
shr edx, cl
SHRX edx, ecx
add al, byte [table + rcx]
add al, byte [mask_table + rdx]
.ret:
@ -1303,88 +1488,115 @@ cglobal decimate_score%1, 1,3
.ret9:
mov eax, 9
RET
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE4x4 15
DECIMATE4x4 16
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16
; 2x gt1 output, 2x nz output, 1x mask
%macro DECIMATE_MASK64_AVX2 5
pabsw m0, [r0+ 0]
pabsw m2, [r0+32]
pabsw m1, [r0+64]
pabsw m3, [r0+96]
packsswb m0, m2
packsswb m1, m3
pcmpgtb m2, m0, %5 ; the > 1 checks don't care about order, so
pcmpgtb m3, m1, %5 ; we can save latency by doing them here
pmovmskb %1, m2
pmovmskb %2, m3
or %1, %2
jne .ret9
%macro DECIMATE_MASK64_AVX2 2 ; nz_low, nz_high
mova m0, [r0+0*32]
packsswb m0, [r0+1*32]
mova m1, [r0+2*32]
packsswb m1, [r0+3*32]
mova m4, [pb_1]
pabsb m2, m0
pabsb m3, m1
por m2, m3 ; the > 1 checks don't care about order, so
ptest m4, m2 ; we can save latency by doing them here
jnc .ret9
vpermq m0, m0, q3120
vpermq m1, m1, q3120
pxor m4, m4
pcmpeqb m0, m4
pcmpeqb m1, m4
pmovmskb %3, m0
pmovmskb %4, m1
pmovmskb %1, m0
pmovmskb %2, m1
%endmacro
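; note: for ptest, CF = ((src & ~dst) == 0), so with dst = pb_1 the jnc in the
; macro above fires as soon as any |level| byte exceeds 1; the vpermq q3120
; just undoes the per-lane interleaving left by packsswb before the pmovmskb.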
%macro DECIMATE8x8 0
%macro DECIMATE_MASK64_AVX512 0
mova m0, [r0]
%if HIGH_BIT_DEPTH
packssdw m0, [r0+1*64]
mova m1, [r0+2*64]
packssdw m1, [r0+3*64]
packsswb m0, m1
vbroadcasti32x4 m1, [pb_1]
pabsb m2, m0
vpcmpub k0, m2, m1, 6
ktestq k0, k0
jnz .ret9
mova m1, [decimate_shuf_avx512]
vpermd m0, m1, m0
vptestmb k1, m0, m0
%else
mova m1, [r0+64]
vbroadcasti32x4 m3, [pb_1]
packsswb m2, m0, m1
pabsb m2, m2
vpcmpub k0, m2, m3, 6
ktestq k0, k0
jnz .ret9
vptestmw k1, m0, m0
vptestmw k2, m1, m1
%endif
%endmacro
%macro DECIMATE8x8 0
%if ARCH_X86_64
cglobal decimate_score64, 1,5
%ifdef PIC
lea r4, [decimate_table8]
%define table r4
%if mmsize == 64
DECIMATE_MASK64_AVX512
xor eax, eax
%if HIGH_BIT_DEPTH
kmovq r1, k1
test r1, r1
jz .ret
%else
%define table decimate_table8
kortestd k1, k2
jz .ret
kunpckdq k1, k2, k1
kmovq r1, k1
%endif
mova m5, [pb_1]
%if mmsize==32
DECIMATE_MASK64_AVX2 eax, r2d, r1d, r3d, m5
shl r3, 32
or r1, r3
xor r1, -1
je .ret
%elif mmsize == 32
DECIMATE_MASK64_AVX2 r1d, eax
not r1
shl rax, 32
xor r1, rax
jz .ret
%else
DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5, null
mova m5, [pb_1]
DECIMATE_MASK r1d, eax, r0+SIZEOF_DCTCOEF* 0, m5
test eax, eax
jne .ret9
DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5, null
jnz .ret9
DECIMATE_MASK r2d, eax, r0+SIZEOF_DCTCOEF*16, m5
shl r2d, 16
or r1d, r2d
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5, null
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*32, m5
shl r2, 32
or eax, r3d
or r1, r2
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5, null
DECIMATE_MASK r2d, r3d, r0+SIZEOF_DCTCOEF*48, m5
not r1
shl r2, 48
or r1, r2
xor r1, -1
je .ret
xor r1, r2
jz .ret
add eax, r3d
jne .ret9
jnz .ret9
%endif
%ifdef PIC
lea r4, [decimate_table8]
%define table r4
%else
%define table decimate_table8
%endif
mov al, -6
.loop:
tzcnt rcx, r1
shr r1, cl
add al, byte [table + rcx]
jge .ret9
shr r1, 1
jne .loop
SHRX r1, rcx
%if cpuflag(bmi2)
test r1, r1
%endif
jnz .loop
add al, 6
.ret:
REP_RET
@ -1393,85 +1605,107 @@ cglobal decimate_score64, 1,5
RET
%else ; ARCH
%if mmsize == 8
cglobal decimate_score64, 1,6
%else
cglobal decimate_score64, 1,5
cglobal decimate_score64, 1,4
%if mmsize == 64
DECIMATE_MASK64_AVX512
xor eax, eax
%if HIGH_BIT_DEPTH
kshiftrq k2, k1, 32
%endif
mova m5, [pb_1]
%if mmsize==32
DECIMATE_MASK64_AVX2 r0, r2, r3, r4, m5
xor r3, -1
je .tryret
xor r4, -1
.cont:
%else
DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF* 0, m5, r5
kmovd r2, k1
kmovd r3, k2
test r2, r2
jne .ret9
DECIMATE_MASK r4, r2, r0+SIZEOF_DCTCOEF*16, m5, r5
shl r4, 16
or r3, r4
DECIMATE_MASK r4, r1, r0+SIZEOF_DCTCOEF*32, m5, r5
jz .tryret
%elif mmsize == 32
DECIMATE_MASK64_AVX2 r2, r3
xor eax, eax
not r3
xor r2, -1
jz .tryret
%else
mova m5, [pb_1]
DECIMATE_MASK r2, r1, r0+SIZEOF_DCTCOEF* 0, m5
test r1, r1
jnz .ret9
DECIMATE_MASK r3, r1, r0+SIZEOF_DCTCOEF*16, m5
not r2
shl r3, 16
xor r2, r3
mov r0m, r2
DECIMATE_MASK r3, r2, r0+SIZEOF_DCTCOEF*32, m5
or r2, r1
DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5, r5
shl r1, 16
or r4, r1
xor r3, -1
je .tryret
xor r4, -1
.cont:
DECIMATE_MASK r1, r0, r0+SIZEOF_DCTCOEF*48, m5
add r0, r2
jne .ret9
jnz .ret9
mov r2, r0m
not r3
shl r1, 16
xor r3, r1
test r2, r2
jz .tryret
%endif
mov al, -6
.loop:
tzcnt ecx, r2
add al, byte [decimate_table8 + ecx]
jge .ret9
sub ecx, 31 ; increase the shift count by one to shift away the lowest set bit as well
jz .run31 ; only bits 0-4 are used so we have to explicitly handle the case of 1<<31
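; note: sub ecx, 31 relies on 32-bit shift counts being masked mod 32, so cl
; behaves as tzcnt+1 here; tzcnt == 31 would wrap to a shift of 0, hence the
; explicit .run31 case.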
shrd r2, r3, cl
SHRX r3, ecx
%if notcpuflag(bmi2)
test r2, r2
%endif
jnz .loop
BLSR r2, r3
jz .end
.largerun:
tzcnt ecx, r3
shr r3, 1
SHRX r3, ecx
.loop2:
tzcnt ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
shr r4, cl
add al, byte [decimate_table8 + ecx]
jge .ret9
shrd r3, r4, 1
shr r4, 1
shr r3, 1
SHRX r3, ecx
.run31:
test r3, r3
jne .loop
test r4, r4
jne .loop
jnz .loop2
.end:
add al, 6
.ret:
REP_RET
.tryret:
xor r4, -1
jne .cont
RET
.tryret:
BLSR r2, r3
jz .ret
mov al, -6
jmp .largerun
.ret9:
mov eax, 9
RET
.largerun:
mov r3, r4
xor r4, r4
tzcnt ecx, r3
shr r3, cl
shr r3, 1
jne .loop
add al, 6
RET
.ret:
REP_RET
%endif ; ARCH
%endmacro
%if ARCH_X86_64 == 0
INIT_MMX mmx2
DECIMATE8x8
%endif
INIT_XMM sse2
DECIMATE4x4 15
DECIMATE4x4 16
DECIMATE8x8
INIT_XMM ssse3
DECIMATE4x4 15
DECIMATE4x4 16
DECIMATE8x8
%if HIGH_BIT_DEPTH
INIT_ZMM avx512
%else
INIT_YMM avx2
DECIMATE8x8
INIT_YMM avx512
%endif
DECIMATE4x4 15
DECIMATE4x4 16
INIT_ZMM avx512
DECIMATE8x8
;-----------------------------------------------------------------------------
; int coeff_last( dctcoef *dct )
@ -1556,7 +1790,7 @@ cglobal coeff_last4, 1,3
INIT_MMX mmx2
COEFF_LAST4
INIT_MMX mmx2, lzcnt
INIT_MMX lzcnt
COEFF_LAST4
%macro COEFF_LAST8 0
@ -1579,7 +1813,7 @@ COEFF_LAST8
%endif
INIT_XMM sse2
COEFF_LAST8
INIT_XMM sse2, lzcnt
INIT_XMM lzcnt
COEFF_LAST8
%else ; !HIGH_BIT_DEPTH
@ -1642,7 +1876,7 @@ cglobal coeff_last8, 1,3
INIT_MMX mmx2
COEFF_LAST48
INIT_MMX mmx2, lzcnt
INIT_MMX lzcnt
COEFF_LAST48
%endif ; HIGH_BIT_DEPTH
@ -1707,7 +1941,7 @@ COEFF_LAST
%endif
INIT_XMM sse2
COEFF_LAST
INIT_XMM sse2, lzcnt
INIT_XMM lzcnt
COEFF_LAST
%macro LAST_MASK_AVX2 2
@ -1729,7 +1963,7 @@ COEFF_LAST
%endmacro
%if ARCH_X86_64 == 0
INIT_YMM avx2,lzcnt
INIT_YMM avx2
cglobal coeff_last64, 1,2
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF*32
@ -1744,7 +1978,7 @@ cglobal coeff_last64, 1,2
add eax, 32
RET
%else
INIT_YMM avx2,lzcnt
INIT_YMM avx2
cglobal coeff_last64, 1,3
pxor m2, m2
LAST_MASK_AVX2 r1d, r0+SIZEOF_DCTCOEF* 0
@ -1756,6 +1990,70 @@ cglobal coeff_last64, 1,3
RET
%endif
%macro COEFF_LAST_AVX512 2 ; num, w/d
cglobal coeff_last%1, 1,2
mova m0, [r0-(%1&1)*SIZEOF_DCTCOEF]
vptestm%2 k0, m0, m0
%if %1 == 15
mov eax, 30
kmovw r1d, k0
lzcnt r1d, r1d
sub eax, r1d
%else
kmovw eax, k0
lzcnt eax, eax
xor eax, 31
%endif
RET
%endmacro
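; note: the 15-coefficient variants read one element before the block so that
; a full power-of-two vector is loaded; 30 - lzcnt(mask) then yields the last
; index, folding in both the 32-bit lzcnt bias and that one-element offset.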
%macro COEFF_LAST64_AVX512 1 ; w/d
cglobal coeff_last64, 1,2
pxor xm0, xm0
vpcmp%1 k0, m0, [r0+0*64], 4
vpcmp%1 k1, m0, [r0+1*64], 4
%if HIGH_BIT_DEPTH
vpcmp%1 k2, m0, [r0+2*64], 4
vpcmp%1 k3, m0, [r0+3*64], 4
kunpckwd k0, k1, k0
kunpckwd k1, k3, k2
%endif
%if ARCH_X86_64
kunpckdq k0, k1, k0
kmovq rax, k0
lzcnt rax, rax
xor eax, 63
%else
kmovd r1d, k1
kmovd eax, k0
lzcnt r1d, r1d
lzcnt eax, eax
xor r1d, 32
cmovnz eax, r1d
xor eax, 31
%endif
RET
%endmacro
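; note: vpcmp predicate 4 is "not equal", so k0/k1 accumulate a 64-bit
; nonzero-coefficient mask; lzcnt of that mask gives 63 minus the last set
; bit, which the trailing xor (and, on x86-32, the cmovnz merge of the two
; 32-bit halves) converts back into the index of the last nonzero coefficient.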
%if HIGH_BIT_DEPTH
INIT_XMM avx512
COEFF_LAST_AVX512 4, d
INIT_YMM avx512
COEFF_LAST_AVX512 8, d
INIT_ZMM avx512
COEFF_LAST_AVX512 15, d
COEFF_LAST_AVX512 16, d
COEFF_LAST64_AVX512 d
%else ; !HIGH_BIT_DEPTH
INIT_XMM avx512
COEFF_LAST_AVX512 8, w
INIT_YMM avx512
COEFF_LAST_AVX512 15, w
COEFF_LAST_AVX512 16, w
INIT_ZMM avx512
COEFF_LAST64_AVX512 w
%endif ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; int coeff_level_run( dctcoef *dct, run_level_t *runlevel )
;-----------------------------------------------------------------------------
@ -1833,15 +2131,17 @@ COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_XMM sse2, lzcnt
INIT_MMX lzcnt
COEFF_LEVELRUN 4
%if HIGH_BIT_DEPTH == 0
COEFF_LEVELRUN 8
%endif
INIT_XMM lzcnt
%if HIGH_BIT_DEPTH
COEFF_LEVELRUN 8
%endif
COEFF_LEVELRUN 15
COEFF_LEVELRUN 16
INIT_MMX mmx2, lzcnt
COEFF_LEVELRUN 4
COEFF_LEVELRUN 8
; Similar to the one above, but saves the DCT
; coefficients in m0/m1 so we don't have to load
@ -1968,7 +2268,7 @@ INIT_XMM ssse3, lzcnt
COEFF_LEVELRUN_LUT 8
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
INIT_XMM avx2, lzcnt
INIT_XMM avx2
COEFF_LEVELRUN_LUT 15
COEFF_LEVELRUN_LUT 16
%endif

@ -66,12 +66,15 @@ void x264_dequant_8x8_xop( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_4x4dc_avx2( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx2( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_avx512( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_avx512( dctcoef dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_mmx( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_avx2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_avx2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_8x8_flat16_avx512( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_idct_dequant_2x4_dc_sse2( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dc_avx ( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
void x264_idct_dequant_2x4_dconly_sse2( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
@ -85,16 +88,16 @@ void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_avx2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int x264_decimate_score15_mmx2( dctcoef *dct );
int x264_decimate_score15_sse2( dctcoef *dct );
int x264_decimate_score15_ssse3( dctcoef *dct );
int x264_decimate_score16_mmx2( dctcoef *dct );
int x264_decimate_score15_avx512( dctcoef *dct );
int x264_decimate_score16_sse2( dctcoef *dct );
int x264_decimate_score16_ssse3( dctcoef *dct );
int x264_decimate_score64_mmx2( dctcoef *dct );
int x264_decimate_score16_avx512( dctcoef *dct );
int x264_decimate_score64_sse2( dctcoef *dct );
int x264_decimate_score64_ssse3( dctcoef *dct );
int x264_decimate_score64_avx2( int16_t *dct );
int x264_decimate_score64_avx512( dctcoef *dct );
int x264_coeff_last4_mmx2( dctcoef *dct );
int x264_coeff_last8_mmx2( dctcoef *dct );
int x264_coeff_last15_mmx2( dctcoef *dct );
@ -104,33 +107,37 @@ int x264_coeff_last8_sse2( dctcoef *dct );
int x264_coeff_last15_sse2( dctcoef *dct );
int x264_coeff_last16_sse2( dctcoef *dct );
int x264_coeff_last64_sse2( dctcoef *dct );
int x264_coeff_last4_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last8_mmx2_lzcnt( dctcoef *dct );
int x264_coeff_last8_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last15_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last16_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_sse2_lzcnt( dctcoef *dct );
int x264_coeff_last64_avx2_lzcnt( dctcoef *dct );
int x264_coeff_last4_lzcnt( dctcoef *dct );
int x264_coeff_last8_lzcnt( dctcoef *dct );
int x264_coeff_last15_lzcnt( dctcoef *dct );
int x264_coeff_last16_lzcnt( dctcoef *dct );
int x264_coeff_last64_lzcnt( dctcoef *dct );
int x264_coeff_last64_avx2 ( dctcoef *dct );
int x264_coeff_last4_avx512( int32_t *dct );
int x264_coeff_last8_avx512( dctcoef *dct );
int x264_coeff_last15_avx512( dctcoef *dct );
int x264_coeff_last16_avx512( dctcoef *dct );
int x264_coeff_last64_avx512( dctcoef *dct );
int x264_coeff_level_run16_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_avx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_avx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_avx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_mmx2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_sse2_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_ssse3( dctcoef *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run8_ssse3_lzcnt( dctcoef *dct, x264_run_level_t *runlevel );
int x264_trellis_cabac_4x4_sse2 ( TRELLIS_PARAMS, int b_ac );

@ -106,8 +106,6 @@ SAD 4, 16
SAD 4, 8
SAD 4, 4
;=============================================================================
; SAD XMM
;=============================================================================
@ -119,118 +117,64 @@ SAD 4, 4
RET
%endmacro
%macro SAD_W16 0
;-----------------------------------------------------------------------------
; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x16, 4,4,8
movu m0, [r2]
movu m1, [r2+r3]
lea r2, [r2+2*r3]
movu m2, [r2]
%macro SAD_W16 1 ; h
cglobal pixel_sad_16x%1, 4,4
%ifidn cpuname, sse2
.skip_prologue:
%endif
%assign %%i 0
%if ARCH_X86_64
lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
lea r5, [3*r3]
%rep %1/4
movu m1, [r2]
psadbw m1, [r0]
movu m3, [r2+r3]
lea r2, [r2+2*r3]
psadbw m0, [r0]
psadbw m1, [r0+r1]
lea r0, [r0+2*r1]
movu m4, [r2]
paddw m0, m1
psadbw m2, [r0]
psadbw m3, [r0+r1]
lea r0, [r0+2*r1]
movu m5, [r2+r3]
lea r2, [r2+2*r3]
paddw m2, m3
movu m6, [r2]
movu m7, [r2+r3]
lea r2, [r2+2*r3]
movu m2, [r2+2*r3]
psadbw m2, [r0+2*r1]
movu m4, [r2+r5]
psadbw m4, [r0+r6]
%if %%i != %1/4-1
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
%endif
paddw m1, m3
paddw m2, m4
ACCUM paddw, 0, 1, %%i
paddw m0, m2
psadbw m4, [r0]
psadbw m5, [r0+r1]
lea r0, [r0+2*r1]
%assign %%i %%i+1
%endrep
%else ; The cost of having to save and restore registers on x86-32
%rep %1/2 ; nullifies the benefit of having 3*stride in registers.
movu m1, [r2]
paddw m4, m5
psadbw m6, [r0]
psadbw m7, [r0+r1]
lea r0, [r0+2*r1]
movu m2, [r2+r3]
lea r2, [r2+2*r3]
paddw m6, m7
movu m3, [r2]
paddw m0, m4
movu m4, [r2+r3]
lea r2, [r2+2*r3]
paddw m0, m6
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
movu m5, [r2]
paddw m1, m2
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
movu m6, [r2+r3]
lea r2, [r2+2*r3]
paddw m3, m4
movu m7, [r2]
paddw m0, m1
movu m1, [r2+r3]
paddw m0, m3
psadbw m5, [r0]
psadbw m6, [r0+r1]
lea r0, [r0+2*r1]
paddw m5, m6
psadbw m7, [r0]
psadbw m1, [r0+r1]
paddw m7, m1
paddw m0, m5
paddw m0, m7
SAD_END_SSE2
;-----------------------------------------------------------------------------
; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
cglobal pixel_sad_16x8, 4,4
movu m0, [r2]
movu m2, [r2+r3]
lea r2, [r2+2*r3]
movu m3, [r2]
movu m4, [r2+r3]
psadbw m0, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m0, m2
paddw m3, m4
paddw m0, m3
movu m1, [r2]
movu m2, [r2+r3]
%if %%i != %1/2-1
lea r2, [r2+2*r3]
movu m3, [r2]
movu m4, [r2+r3]
psadbw m1, [r0]
psadbw m2, [r0+r1]
lea r0, [r0+2*r1]
psadbw m3, [r0]
psadbw m4, [r0+r1]
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
paddw m1, m2
paddw m3, m4
paddw m0, m1
paddw m0, m3
%endif
ACCUM paddw, 0, 1, %%i
paddw m0, m2
%assign %%i %%i+1
%endrep
%endif
SAD_END_SSE2
%endmacro
INIT_XMM sse2
SAD_W16
SAD_W16 16
SAD_W16 8
INIT_XMM sse3
SAD_W16
SAD_W16 16
SAD_W16 8
INIT_XMM sse2, aligned
SAD_W16
SAD_W16 16
SAD_W16 8
%macro SAD_INC_4x8P_SSE 1
movq m1, [r0]
@ -259,7 +203,132 @@ cglobal pixel_sad_8x16_sse2, 4,4
SAD_INC_4x8P_SSE 1
SAD_INC_4x8P_SSE 1
SAD_END_SSE2
%macro SAD_W48_AVX512 3 ; w, h, d/q
cglobal pixel_sad_%1x%2, 4,4
kxnorb k1, k1, k1
kaddb k1, k1, k1
%assign %%i 0
%if ARCH_X86_64 && %2 != 4
lea r6, [3*r1]
lea r5, [3*r3]
%rep %2/4
mov%3 m1, [r0]
vpbroadcast%3 m1 {k1}, [r0+r1]
mov%3 m3, [r2]
vpbroadcast%3 m3 {k1}, [r2+r3]
mov%3 m2, [r0+2*r1]
vpbroadcast%3 m2 {k1}, [r0+r6]
mov%3 m4, [r2+2*r3]
vpbroadcast%3 m4 {k1}, [r2+r5]
%if %%i != %2/4-1
lea r0, [r0+4*r1]
lea r2, [r2+4*r3]
%endif
psadbw m1, m3
psadbw m2, m4
ACCUM paddd, 0, 1, %%i
paddd m0, m2
%assign %%i %%i+1
%endrep
%else
%rep %2/2
mov%3 m1, [r0]
vpbroadcast%3 m1 {k1}, [r0+r1]
mov%3 m2, [r2]
vpbroadcast%3 m2 {k1}, [r2+r3]
%if %%i != %2/2-1
lea r0, [r0+2*r1]
lea r2, [r2+2*r3]
%endif
psadbw m1, m2
ACCUM paddd, 0, 1, %%i
%assign %%i %%i+1
%endrep
%endif
%if %1 == 8
punpckhqdq m1, m0, m0
paddd m0, m1
%endif
movd eax, m0
RET
%endmacro
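; note: kxnorb sets k1 to 0xff and kaddb doubles it to 0xfe (the carry out of
; bit 7 is dropped), i.e. "all elements but the lowest"; the unmasked load puts
; one row in element 0 and the masked broadcast merges the next row into the
; rest, so a single psadbw covers two rows.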
INIT_XMM avx512
SAD_W48_AVX512 4, 4, d
SAD_W48_AVX512 4, 8, d
SAD_W48_AVX512 4, 16, d
SAD_W48_AVX512 8, 4, q
SAD_W48_AVX512 8, 8, q
SAD_W48_AVX512 8, 16, q
%macro SAD_W16_AVX512_START 1 ; h
cmp r1d, FENC_STRIDE ; optimized for the most common fenc case, which
jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
lea r1, [3*r3]
%endmacro
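; note: FENC_STRIDE is 16, so when the first operand is a fenc block four
; consecutive rows form one contiguous 64-byte chunk, which is why the bodies
; below can feed psadbw straight from [r0+n*32]/[r0+n*64]; any other stride
; takes the generic sse2 path via .skip_prologue.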
%macro SAD_W16_AVX512_END 0
paddd m0, m1
paddd m0, m2
paddd m0, m3
%if mmsize == 64
vextracti32x8 ym1, m0, 1
paddd ym0, ym1
%endif
vextracti128 xm1, ym0, 1
paddd xmm0, xm0, xm1
punpckhqdq xmm1, xmm0, xmm0
paddd xmm0, xmm1
movd eax, xmm0
RET
%endmacro
INIT_YMM avx512
cglobal pixel_sad_16x8, 4,4
SAD_W16_AVX512_START 8
movu xm0, [r2]
vinserti128 m0, [r2+r3], 1
psadbw m0, [r0+0*32]
movu xm1, [r2+2*r3]
vinserti128 m1, [r2+r1], 1
lea r2, [r2+4*r3]
psadbw m1, [r0+1*32]
movu xm2, [r2]
vinserti128 m2, [r2+r3], 1
psadbw m2, [r0+2*32]
movu xm3, [r2+2*r3]
vinserti128 m3, [r2+r1], 1
psadbw m3, [r0+3*32]
SAD_W16_AVX512_END
INIT_ZMM avx512
cglobal pixel_sad_16x16, 4,4
SAD_W16_AVX512_START 16
movu xm0, [r2]
vinserti128 ym0, [r2+r3], 1
movu xm1, [r2+4*r3]
vinserti32x4 m0, [r2+2*r3], 2
vinserti32x4 m1, [r2+2*r1], 2
vinserti32x4 m0, [r2+r1], 3
lea r2, [r2+4*r3]
vinserti32x4 m1, [r2+r3], 1
psadbw m0, [r0+0*64]
vinserti32x4 m1, [r2+r1], 3
lea r2, [r2+4*r3]
psadbw m1, [r0+1*64]
movu xm2, [r2]
vinserti128 ym2, [r2+r3], 1
movu xm3, [r2+4*r3]
vinserti32x4 m2, [r2+2*r3], 2
vinserti32x4 m3, [r2+2*r1], 2
vinserti32x4 m2, [r2+r1], 3
lea r2, [r2+4*r3]
vinserti32x4 m3, [r2+r3], 1
psadbw m2, [r0+2*64]
vinserti32x4 m3, [r2+r1], 3
psadbw m3, [r0+3*64]
SAD_W16_AVX512_END
;-----------------------------------------------------------------------------
; void pixel_vsad( pixel *src, intptr_t stride );
@ -1548,6 +1617,225 @@ SAD_X_AVX2 3, 16, 8, 7
SAD_X_AVX2 4, 16, 16, 8
SAD_X_AVX2 4, 16, 8, 8
%macro SAD_X_W4_AVX512 2 ; x, h
cglobal pixel_sad_x%1_4x%2, %1+2,%1+3
mov t1d, 0xa
kmovb k1, t1d
lea t1, [3*t0]
kaddb k2, k1, k1
kshiftlb k3, k1, 2
%assign %%i 0
%rep %2/4
movu m6, [r0+%%i*64]
vmovddup m6 {k1}, [r0+%%i*64+32]
movd xmm2, [r1]
movd xmm4, [r1+t0]
vpbroadcastd xmm2 {k1}, [r1+2*t0]
vpbroadcastd xmm4 {k1}, [r1+t1]
vpbroadcastd xmm2 {k2}, [r2+t0]
vpbroadcastd xmm4 {k2}, [r2]
vpbroadcastd xmm2 {k3}, [r2+t1] ; a0 a2 b1 b3
vpbroadcastd xmm4 {k3}, [r2+2*t0] ; a1 a3 b0 b2
vpmovqd s1, m6 ; s0 s2 s1 s3
movd xmm3, [r3]
movd xmm5, [r3+t0]
vpbroadcastd xmm3 {k1}, [r3+2*t0]
vpbroadcastd xmm5 {k1}, [r3+t1]
%if %1 == 4
vpbroadcastd xmm3 {k2}, [r4+t0]
vpbroadcastd xmm5 {k2}, [r4]
vpbroadcastd xmm3 {k3}, [r4+t1] ; c0 c2 d1 d3
vpbroadcastd xmm5 {k3}, [r4+2*t0] ; c1 c3 d0 d2
%endif
%if %%i != %2/4-1
%assign %%j 1
%rep %1
lea r%+%%j, [r%+%%j+4*t0]
%assign %%j %%j+1
%endrep
%endif
pshufd s2, s1, q1032
psadbw xmm2, s1
psadbw xmm4, s2
psadbw xmm3, s1
psadbw xmm5, s2
%if %%i
paddd xmm0, xmm2
paddd xmm1, xmm3
paddd xmm0, xmm4
paddd xmm1, xmm5
%else
paddd xmm0, xmm2, xmm4
paddd xmm1, xmm3, xmm5
%endif
%assign %%i %%i+1
%endrep
%if %1 == 4
movifnidn t2, r6mp
%else
movifnidn t2, r5mp
%endif
packusdw xmm0, xmm1
mova [t2], xmm0
RET
%endmacro
%macro SAD_X_W8_AVX512 2 ; x, h
cglobal pixel_sad_x%1_8x%2, %1+2,%1+3
kxnorb k3, k3, k3
lea t1, [3*t0]
kaddb k1, k3, k3
kshiftlb k2, k3, 2
kshiftlb k3, k3, 3
%assign %%i 0
%rep %2/4
movddup m6, [r0+%%i*64] ; s0 s0 s1 s1
movq xm2, [r1]
movq xm4, [r1+2*t0]
vpbroadcastq xm2 {k1}, [r2]
vpbroadcastq xm4 {k1}, [r2+2*t0]
vpbroadcastq m2 {k2}, [r1+t0]
vpbroadcastq m4 {k2}, [r1+t1]
vpbroadcastq m2 {k3}, [r2+t0] ; a0 b0 a1 b1
vpbroadcastq m4 {k3}, [r2+t1] ; a2 b2 a3 b3
movddup m7, [r0+%%i*64+32] ; s2 s2 s3 s3
movq xm3, [r3]
movq xm5, [r3+2*t0]
%if %1 == 4
vpbroadcastq xm3 {k1}, [r4]
vpbroadcastq xm5 {k1}, [r4+2*t0]
%endif
vpbroadcastq m3 {k2}, [r3+t0]
vpbroadcastq m5 {k2}, [r3+t1]
%if %1 == 4
vpbroadcastq m3 {k3}, [r4+t0] ; c0 d0 c1 d1
vpbroadcastq m5 {k3}, [r4+t1] ; c2 d2 c3 d3
%endif
%if %%i != %2/4-1
%assign %%j 1
%rep %1
lea r%+%%j, [r%+%%j+4*t0]
%assign %%j %%j+1
%endrep
%endif
psadbw m2, m6
psadbw m4, m7
psadbw m3, m6
psadbw m5, m7
ACCUM paddd, 0, 2, %%i
ACCUM paddd, 1, 3, %%i
paddd m0, m4
paddd m1, m5
%assign %%i %%i+1
%endrep
%if %1 == 4
movifnidn t2, r6mp
%else
movifnidn t2, r5mp
%endif
packusdw m0, m1
vextracti128 xm1, m0, 1
paddd xm0, xm1
mova [t2], xm0
RET
%endmacro
%macro SAD_X_W16_AVX512 2 ; x, h
cglobal pixel_sad_x%1_16x%2, %1+2,%1+3
lea t1, [3*t0]
%assign %%i 0
%rep %2/4
mova m6, [r0+%%i*64] ; s0 s1 s2 s3
movu xm2, [r3]
movu xm4, [r3+t0]
%if %1 == 4
vinserti128 ym2, [r4+t0], 1
vinserti128 ym4, [r4], 1
%endif
vinserti32x4 m2, [r1+2*t0], 2
vinserti32x4 m4, [r1+t1], 2
vinserti32x4 m2, [r2+t1], 3 ; c0 d1 a2 b3
vinserti32x4 m4, [r2+2*t0], 3 ; c1 d0 a3 b2
vpermq m7, m6, q1032 ; s1 s0 s3 s2
movu xm3, [r1]
movu xm5, [r1+t0]
vinserti128 ym3, [r2+t0], 1
vinserti128 ym5, [r2], 1
vinserti32x4 m3, [r3+2*t0], 2
vinserti32x4 m5, [r3+t1], 2
%if %1 == 4
vinserti32x4 m3, [r4+t1], 3 ; a0 b1 c2 d3
vinserti32x4 m5, [r4+2*t0], 3 ; a1 b0 c3 d2
%endif
%if %%i != %2/4-1
%assign %%j 1
%rep %1
lea r%+%%j, [r%+%%j+4*t0]
%assign %%j %%j+1
%endrep
%endif
psadbw m2, m6
psadbw m4, m7
psadbw m3, m6
psadbw m5, m7
ACCUM paddd, 0, 2, %%i
ACCUM paddd, 1, 3, %%i
paddd m0, m4
paddd m1, m5
%assign %%i %%i+1
%endrep
%if %1 == 4
movifnidn t2, r6mp
%else
movifnidn t2, r5mp
%endif
mov t1d, 0x1111
kmovw k1, t1d
vshufi32x4 m0, m0, q1032
paddd m0, m1
punpckhqdq m1, m0, m0
paddd m0, m1
vpcompressd m0 {k1}{z}, m0
mova [t2], xm0
RET
%endmacro
; t0 = stride, t1 = tmp/stride3, t2 = scores
%if WIN64
%define s1 xmm16 ; using xmm6 and xmm7 would reduce code size, but

%define s2 xmm17 ; they're callee-saved on win64
DECLARE_REG_TMP 4, 6, 0
%else
%define s1 xmm6
%define s2 xmm7
%if ARCH_X86_64
DECLARE_REG_TMP 4, 6, 5 ; scores is passed in a register on unix64
%else
DECLARE_REG_TMP 4, 5, 0
%endif
%endif
INIT_YMM avx512
SAD_X_W4_AVX512 3, 4 ; x3_4x4
SAD_X_W4_AVX512 3, 8 ; x3_4x8
SAD_X_W8_AVX512 3, 4 ; x3_8x4
SAD_X_W8_AVX512 3, 8 ; x3_8x8
SAD_X_W8_AVX512 3, 16 ; x3_8x16
INIT_ZMM avx512
SAD_X_W16_AVX512 3, 8 ; x3_16x8
SAD_X_W16_AVX512 3, 16 ; x3_16x16
DECLARE_REG_TMP 5, 6, 0
INIT_YMM avx512
SAD_X_W4_AVX512 4, 4 ; x4_4x4
SAD_X_W4_AVX512 4, 8 ; x4_4x8
SAD_X_W8_AVX512 4, 4 ; x4_8x4
SAD_X_W8_AVX512 4, 8 ; x4_8x8
SAD_X_W8_AVX512 4, 16 ; x4_8x16
INIT_ZMM avx512
SAD_X_W16_AVX512 4, 8 ; x4_16x8
SAD_X_W16_AVX512 4, 16 ; x4_16x16
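The SAD_X macros above batch one source block against three or four motion-candidate references that share a stride and write all scores at once (t2 = scores, per the register comment above). A rough scalar model of the x4 8x8 case, assuming the packed fenc layout with FENC_STRIDE = 16; this is a sketch, not the real C fallback:

#include <stdint.h>
#include <stdlib.h>

/* Sketch of sad_x4_8x8 semantics: four SADs of the same fenc block against
 * four candidate reference blocks, results written to scores[0..3]. */
static void sad_x4_8x8_ref( const uint8_t *fenc,
                            const uint8_t *ref0, const uint8_t *ref1,
                            const uint8_t *ref2, const uint8_t *ref3,
                            intptr_t ref_stride, int scores[4] )
{
    const uint8_t *ref[4] = { ref0, ref1, ref2, ref3 };
    for( int i = 0; i < 4; i++ )
    {
        const uint8_t *p = ref[i];
        const uint8_t *q = fenc;
        int sum = 0;
        for( int y = 0; y < 8; y++, p += ref_stride, q += 16 /* FENC_STRIDE */ )
            for( int x = 0; x < 8; x++ )
                sum += abs( q[x] - p[x] );
        scores[i] = sum;
    }
}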
;=============================================================================
; SAD cacheline split
;=============================================================================

@ -323,6 +323,8 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
%endmacro
%define required_stack_alignment ((mmsize + 15) & ~15)
%define vzeroupper_required (mmsize > 16 && (ARCH_X86_64 == 0 || xmm_regs_used > 16 || notcpuflag(avx512)))
%define high_mm_regs (16*cpuflag(avx512))
%macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only)
%ifnum %1
@ -414,10 +416,10 @@ DECLARE_REG 7, rdi, 64
DECLARE_REG 8, rsi, 72
DECLARE_REG 9, rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120
DECLARE_REG 11, R14, 96
DECLARE_REG 12, R15, 104
DECLARE_REG 13, R12, 112
DECLARE_REG 14, R13, 120
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
@ -436,15 +438,16 @@ DECLARE_REG 14, R15, 120
%macro WIN64_PUSH_XMM 0
; Use the shadow space to store XMM6 and XMM7, the rest needs stack space allocated.
%if xmm_regs_used > 6
%if xmm_regs_used > 6 + high_mm_regs
movaps [rstk + stack_offset + 8], xmm6
%endif
%if xmm_regs_used > 7
%if xmm_regs_used > 7 + high_mm_regs
movaps [rstk + stack_offset + 24], xmm7
%endif
%if xmm_regs_used > 8
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i 8
%rep xmm_regs_used-8
%rep %%xmm_regs_on_stack
movaps [rsp + (%%i-8)*16 + stack_size + 32], xmm %+ %%i
%assign %%i %%i+1
%endrep
@ -453,53 +456,56 @@ DECLARE_REG 14, R15, 120
%macro WIN64_SPILL_XMM 1
%assign xmm_regs_used %1
ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 8
ASSERT xmm_regs_used <= 16 + high_mm_regs
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
; Allocate stack space for callee-saved xmm registers plus shadow space and align the stack.
%assign %%pad (xmm_regs_used-8)*16 + 32
%assign %%pad %%xmm_regs_on_stack*16 + 32
%assign stack_size_padded %%pad + ((-%%pad-stack_offset-gprsize) & (STACK_ALIGNMENT-1))
SUB rsp, stack_size_padded
%endif
WIN64_PUSH_XMM
%endmacro
%macro WIN64_RESTORE_XMM_INTERNAL 1
%macro WIN64_RESTORE_XMM_INTERNAL 0
%assign %%pad_size 0
%if xmm_regs_used > 8
%assign %%i xmm_regs_used
%rep xmm_regs_used-8
%assign %%xmm_regs_on_stack xmm_regs_used - high_mm_regs - 8
%if %%xmm_regs_on_stack > 0
%assign %%i xmm_regs_used - high_mm_regs
%rep %%xmm_regs_on_stack
%assign %%i %%i-1
movaps xmm %+ %%i, [%1 + (%%i-8)*16 + stack_size + 32]
movaps xmm %+ %%i, [rsp + (%%i-8)*16 + stack_size + 32]
%endrep
%endif
%if stack_size_padded > 0
%if stack_size > 0 && required_stack_alignment > STACK_ALIGNMENT
mov rsp, rstkm
%else
add %1, stack_size_padded
add rsp, stack_size_padded
%assign %%pad_size stack_size_padded
%endif
%endif
%if xmm_regs_used > 7
movaps xmm7, [%1 + stack_offset - %%pad_size + 24]
%if xmm_regs_used > 7 + high_mm_regs
movaps xmm7, [rsp + stack_offset - %%pad_size + 24]
%endif
%if xmm_regs_used > 6
movaps xmm6, [%1 + stack_offset - %%pad_size + 8]
%if xmm_regs_used > 6 + high_mm_regs
movaps xmm6, [rsp + stack_offset - %%pad_size + 8]
%endif
%endmacro
%macro WIN64_RESTORE_XMM 1
WIN64_RESTORE_XMM_INTERNAL %1
%macro WIN64_RESTORE_XMM 0
WIN64_RESTORE_XMM_INTERNAL
%assign stack_offset (stack_offset-stack_size_padded)
%assign stack_size_padded 0
%assign xmm_regs_used 0
%endmacro
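A worked example of the new spill accounting (assuming WIN64; the register counts are illustrative): with AVX-512 enabled, high_mm_regs = 16, so a PROLOGUE declaring xmm_regs_used = 20 yields %%xmm_regs_on_stack = 20 - 16 - 8 < 0, and 20 > 6 + 16 is false, so nothing is saved at all; the registers beyond xmm5 are expected to live in the volatile xmm16-31 range via AVX512_MM_PERMUTATION further down. Without AVX-512 the same declaration stores xmm6/xmm7 in the shadow space and spills xmm8 through xmm19 (12 registers) to the stack.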
%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 7 || stack_size > 0 || vzeroupper_required || xmm_regs_used > 6+high_mm_regs
%macro RET 0
WIN64_RESTORE_XMM_INTERNAL rsp
WIN64_RESTORE_XMM_INTERNAL
POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@ -518,14 +524,15 @@ DECLARE_REG 7, R10, 16
DECLARE_REG 8, R11, 24
DECLARE_REG 9, rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72
DECLARE_REG 11, R14, 48
DECLARE_REG 12, R15, 56
DECLARE_REG 13, R12, 64
DECLARE_REG 14, R13, 72
%macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names...
%assign num_args %1
%assign regs_used %2
%assign xmm_regs_used %3
ASSERT regs_used >= num_args
SETUP_STACK_POINTER %4
ASSERT regs_used <= 15
@ -535,7 +542,7 @@ DECLARE_REG 14, R15, 72
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 9 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@ -546,7 +553,7 @@ DECLARE_REG 14, R15, 72
%endif
%endif
POP_IF_USED 14, 13, 12, 11, 10, 9
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@ -591,7 +598,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
DEFINE_ARGS_INTERNAL %0, %4, %5
%endmacro
%define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0
%define has_epilogue regs_used > 3 || stack_size > 0 || vzeroupper_required
%macro RET 0
%if stack_size_padded > 0
@ -602,7 +609,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%endif
%endif
POP_IF_USED 6, 5, 4, 3
%if mmsize == 32
%if vzeroupper_required
vzeroupper
%endif
AUTO_REP_RET
@ -613,7 +620,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
%macro WIN64_RESTORE_XMM 1
%macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
%endmacro
@ -624,7 +631,7 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
; We can automatically detect "follows a branch", but not a branch target.
; (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)
%macro REP_RET 0
%if has_epilogue
%if has_epilogue || cpuflag(ssse3)
RET
%else
rep ret
@ -712,7 +719,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign stack_offset 0 ; stack pointer offset relative to the return address
%assign stack_size 0 ; amount of stack space that can be freely used inside a function
%assign stack_size_padded 0 ; total amount of allocated stack space, including space for callee-saved xmm registers on WIN64 and alignment padding
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64
%assign xmm_regs_used 0 ; number of XMM registers requested, used for dealing with callee-saved registers on WIN64 and vzeroupper
%ifnidn %3, ""
PROLOGUE %3
%endif
@ -775,24 +782,25 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign cpuflags_sse (1<<4) | cpuflags_mmx2
%assign cpuflags_sse2 (1<<5) | cpuflags_sse
%assign cpuflags_sse2slow (1<<6) | cpuflags_sse2
%assign cpuflags_sse3 (1<<7) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<8) | cpuflags_sse3
%assign cpuflags_sse4 (1<<9) | cpuflags_ssse3
%assign cpuflags_sse42 (1<<10)| cpuflags_sse4
%assign cpuflags_avx (1<<11)| cpuflags_sse42
%assign cpuflags_xop (1<<12)| cpuflags_avx
%assign cpuflags_fma4 (1<<13)| cpuflags_avx
%assign cpuflags_fma3 (1<<14)| cpuflags_avx
%assign cpuflags_avx2 (1<<15)| cpuflags_fma3
%assign cpuflags_cache32 (1<<16)
%assign cpuflags_cache64 (1<<17)
%assign cpuflags_slowctz (1<<18)
%assign cpuflags_lzcnt (1<<19)
%assign cpuflags_aligned (1<<20) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<21)
%assign cpuflags_bmi1 (1<<22)|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<23)|cpuflags_bmi1
%assign cpuflags_lzcnt (1<<7) | cpuflags_sse2
%assign cpuflags_sse3 (1<<8) | cpuflags_sse2
%assign cpuflags_ssse3 (1<<9) | cpuflags_sse3
%assign cpuflags_sse4 (1<<10)| cpuflags_ssse3
%assign cpuflags_sse42 (1<<11)| cpuflags_sse4
%assign cpuflags_aesni (1<<12)| cpuflags_sse42
%assign cpuflags_avx (1<<13)| cpuflags_sse42
%assign cpuflags_xop (1<<14)| cpuflags_avx
%assign cpuflags_fma4 (1<<15)| cpuflags_avx
%assign cpuflags_fma3 (1<<16)| cpuflags_avx
%assign cpuflags_bmi1 (1<<17)| cpuflags_avx|cpuflags_lzcnt
%assign cpuflags_bmi2 (1<<18)| cpuflags_bmi1
%assign cpuflags_avx2 (1<<19)| cpuflags_fma3|cpuflags_bmi2
%assign cpuflags_avx512 (1<<20)| cpuflags_avx2 ; F, CD, BW, DQ, VL
%assign cpuflags_cache32 (1<<21)
%assign cpuflags_cache64 (1<<22)
%assign cpuflags_aligned (1<<23) ; not a cpu feature, but a function variant
%assign cpuflags_atom (1<<24)
; Returns a boolean value expressing whether or not the specified cpuflag is enabled.
%define cpuflag(x) (((((cpuflags & (cpuflags_ %+ x)) ^ (cpuflags_ %+ x)) - 1) >> 31) & 1)
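The cpuflag() define above is a branchless form of an all-bits-set test; a readable C equivalent (a sketch for clarity, not code from the tree):

/* cpuflag(x) as plain C: true iff every bit of the requested feature mask
 * (including its implied parent flags) is present in cpuflags. */
static int cpuflag_c( unsigned int cpuflags, unsigned int mask )
{
    return (cpuflags & mask) == mask;
}

The preprocessor version gets the same answer without a comparison: (cpuflags & mask) ^ mask is zero exactly when all bits are present, subtracting 1 then turns that zero into all-ones so the shift-and-mask yields 1, while any nonzero remainder stays well below the shifted-out bits and collapses to 0.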
@ -835,7 +843,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%if ARCH_X86_64 || cpuflag(sse2)
%ifdef __NASM_VER__
ALIGNMODE k8
ALIGNMODE p6
%else
CPU amdnop
%endif
@ -848,11 +856,12 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%endif
%endmacro
; Merge mmx and sse*
; Merge mmx, sse*, and avx*
; m# is a simd register of the currently selected size
; xm# is the corresponding xmm register if mmsize >= 16, otherwise the same as m#
; ym# is the corresponding ymm register if mmsize >= 32, otherwise the same as m#
; (All 3 remain in sync through SWAP.)
; zm# is the corresponding zmm register if mmsize >= 64, otherwise the same as m#
; (All 4 remain in sync through SWAP.)
%macro CAT_XDEFINE 3
%xdefine %1%2 %3
@ -862,6 +871,18 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%undef %1%2
%endmacro
; Prefer registers 16-31 over 0-15 to avoid having to use vzeroupper
%macro AVX512_MM_PERMUTATION 0-1 0 ; start_reg
%if ARCH_X86_64 && cpuflag(avx512)
%assign %%i %1
%rep 16-%1
%assign %%i_high %%i+16
SWAP %%i, %%i_high
%assign %%i %%i+1
%endrep
%endif
%endmacro
%macro INIT_MMX 0-1+
%assign avx_enabled 0
%define RESET_MM_PERMUTATION INIT_MMX %1
@ -877,7 +898,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
CAT_XDEFINE nnmm, %%i, %%i
%assign %%i %%i+1
%endrep
%rep 8
%rep 24
CAT_UNDEF m, %%i
CAT_UNDEF nnmm, %%i
%assign %%i %%i+1
@ -891,7 +912,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 16
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 16
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@ -904,6 +925,10 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
%if WIN64
; Swap callee-saved registers with volatile registers
AVX512_MM_PERMUTATION 6
%endif
%endmacro
%macro INIT_YMM 0-1+
@ -912,7 +937,7 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%define mmsize 32
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 16
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
@ -925,6 +950,29 @@ BRANCH_INSTR jz, je, jnz, jne, jl, jle, jnl, jnle, jg, jge, jng, jnge, ja, jae,
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
AVX512_MM_PERMUTATION
%endmacro
%macro INIT_ZMM 0-1+
%assign avx_enabled 1
%define RESET_MM_PERMUTATION INIT_ZMM %1
%define mmsize 64
%define num_mmregs 8
%if ARCH_X86_64
%define num_mmregs 32
%endif
%define mova movdqa
%define movu movdqu
%undef movh
%define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, zmm %+ %%i
CAT_XDEFINE nnzmm, %%i, %%i
%assign %%i %%i+1
%endrep
INIT_CPUFLAGS %1
AVX512_MM_PERMUTATION
%endmacro
INIT_XMM
@ -933,18 +981,26 @@ INIT_XMM
%define mmmm%1 mm%1
%define mmxmm%1 mm%1
%define mmymm%1 mm%1
%define mmzmm%1 mm%1
%define xmmmm%1 mm%1
%define xmmxmm%1 xmm%1
%define xmmymm%1 xmm%1
%define xmmzmm%1 xmm%1
%define ymmmm%1 mm%1
%define ymmxmm%1 xmm%1
%define ymmymm%1 ymm%1
%define ymmzmm%1 ymm%1
%define zmmmm%1 mm%1
%define zmmxmm%1 xmm%1
%define zmmymm%1 ymm%1
%define zmmzmm%1 zmm%1
%define xm%1 xmm %+ m%1
%define ym%1 ymm %+ m%1
%define zm%1 zmm %+ m%1
%endmacro
%assign i 0
%rep 16
%rep 32
DECLARE_MMCAST i
%assign i i+1
%endrep
@ -1032,7 +1088,11 @@ INIT_XMM
; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
%macro call 1
%ifid %1
call_internal %1 %+ SUFFIX, %1
%else
call %1
%endif
%endmacro
%macro call_internal 2
%xdefine %%i %2
@ -1075,12 +1135,17 @@ INIT_XMM
;=============================================================================
%assign i 0
%rep 16
%rep 32
%if i < 8
CAT_XDEFINE sizeofmm, i, 8
CAT_XDEFINE regnumofmm, i, i
%endif
CAT_XDEFINE sizeofxmm, i, 16
CAT_XDEFINE sizeofymm, i, 32
CAT_XDEFINE sizeofzmm, i, 64
CAT_XDEFINE regnumofxmm, i, i
CAT_XDEFINE regnumofymm, i, i
CAT_XDEFINE regnumofzmm, i, i
%assign i i+1
%endrep
%undef i
@ -1197,7 +1262,7 @@ INIT_XMM
%endmacro
%endmacro
; Instructions with both VEX and non-VEX encodings
; Instructions with both VEX/EVEX and legacy encodings
; Non-destructive instructions are written without parameters
AVX_INSTR addpd, sse2, 1, 0, 1
AVX_INSTR addps, sse, 1, 0, 1
@ -1529,15 +1594,48 @@ FMA4_INSTR fmsubadd, pd, ps
FMA4_INSTR fnmadd, pd, ps, sd, ss
FMA4_INSTR fnmsub, pd, ps, sd, ss
; workaround: vpbroadcastq is broken in x86_32 due to a yasm bug (fixed in 1.3.0)
%ifdef __YASM_VER__
%if __YASM_VERSION_ID__ < 0x01030000 && ARCH_X86_64 == 0
%macro vpbroadcastq 2
%if sizeof%1 == 16
movddup %1, %2
; Macros for converting VEX instructions to equivalent EVEX ones.
%macro EVEX_INSTR 2-3 0 ; vex, evex, prefer_evex
%macro %1 2-7 fnord, fnord, %1, %2, %3
%ifidn %3, fnord
%define %%args %1, %2
%elifidn %4, fnord
%define %%args %1, %2, %3
%else
vbroadcastsd %1, %2
%define %%args %1, %2, %3, %4
%endif
%endmacro
%assign %%evex_required cpuflag(avx512) & %7
%ifnum regnumof%1
%if regnumof%1 >= 16 || sizeof%1 > 32
%assign %%evex_required 1
%endif
%endif
%endif
%ifnum regnumof%2
%if regnumof%2 >= 16 || sizeof%2 > 32
%assign %%evex_required 1
%endif
%endif
%if %%evex_required
%6 %%args
%else
%5 %%args ; Prefer VEX over EVEX due to shorter instruction length
%endif
%endmacro
%endmacro
EVEX_INSTR vbroadcastf128, vbroadcastf32x4
EVEX_INSTR vbroadcasti128, vbroadcasti32x4
EVEX_INSTR vextractf128, vextractf32x4
EVEX_INSTR vextracti128, vextracti32x4
EVEX_INSTR vinsertf128, vinsertf32x4
EVEX_INSTR vinserti128, vinserti32x4
EVEX_INSTR vmovdqa, vmovdqa32
EVEX_INSTR vmovdqu, vmovdqu32
EVEX_INSTR vpand, vpandd
EVEX_INSTR vpandn, vpandnd
EVEX_INSTR vpor, vpord
EVEX_INSTR vpxor, vpxord
EVEX_INSTR vrcpps, vrcp14ps, 1 ; EVEX versions have higher precision
EVEX_INSTR vrcpss, vrcp14ss, 1
EVEX_INSTR vrsqrtps, vrsqrt14ps, 1
EVEX_INSTR vrsqrtss, vrsqrt14ss, 1

@ -303,24 +303,24 @@
%endmacro
%macro HADDD 2 ; sum junk
%if sizeof%1 == 32
%define %2 xmm%2
vextracti128 %2, %1, 1
%define %1 xmm%1
paddd %1, %2
%if sizeof%1 >= 64
vextracti32x8 ymm%2, zmm%1, 1
paddd ymm%1, ymm%2
%endif
%if mmsize >= 16
MOVHL %2, %1
paddd %1, %2
%if sizeof%1 >= 32
vextracti128 xmm%2, ymm%1, 1
paddd xmm%1, xmm%2
%endif
%if sizeof%1 >= 16
MOVHL xmm%2, xmm%1
paddd xmm%1, xmm%2
%endif
%if cpuflag(xop) && sizeof%1 == 16
vphadddq %1, %1
vphadddq xmm%1, xmm%1
%else
PSHUFLW %2, %1, q0032
paddd %1, %2
PSHUFLW xmm%2, xmm%1, q1032
paddd xmm%1, xmm%2
%endif
%undef %1
%undef %2
%endmacro
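The reworked HADDD above reduces a register of 32-bit lanes to a single sum by repeatedly folding the upper half onto the lower half (zmm to ymm, ymm to xmm, then within the xmm). A scalar model of that reduction, for illustration only:

#include <stdint.h>
#include <string.h>

/* Scalar model of the HADDD reduction: fold the upper half of the lane array
 * onto the lower half until one 32-bit sum remains.
 * n is the lane count (4 for xmm, 8 for ymm, 16 for zmm); assumes n is a
 * power of two and at most 16. */
static uint32_t haddd_ref( const uint32_t *lanes, int n )
{
    uint32_t tmp[16];
    memcpy( tmp, lanes, n * sizeof(uint32_t) );
    for( ; n > 1; n >>= 1 )
        for( int i = 0; i < n / 2; i++ )
            tmp[i] += tmp[i + n / 2];
    return tmp[0];
}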
%macro HADDW 2 ; reg, tmp

@ -34,37 +34,23 @@
typedef struct
{
/* 16x16 */
int i_rd16x16;
x264_me_t me16x16;
x264_me_t bi16x16; /* for b16x16 BI mode, since MVs can differ from l0/l1 */
/* 8x8 */
int i_cost8x8;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
ALIGNED_4( int16_t mvc[32][5][2] );
x264_me_t me8x8[4];
/* Sub 4x4 */
int i_cost4x4[4]; /* cost per 8x8 partition */
x264_me_t me4x4[4][4];
/* Sub 8x4 */
int i_cost8x4[4]; /* cost per 8x8 partition */
x264_me_t me8x4[4][2];
/* Sub 4x8 */
int i_cost4x8[4]; /* cost per 8x8 partition */
x264_me_t me4x8[4][2];
/* 16x8 */
int i_cost16x8;
x264_me_t me16x8[2];
/* 8x16 */
int i_cost8x16;
x264_me_t me8x16[2];
int i_rd16x16;
int i_cost8x8;
int i_cost4x4[4]; /* cost per 8x8 partition */
int i_cost8x4[4]; /* cost per 8x8 partition */
int i_cost4x8[4]; /* cost per 8x8 partition */
int i_cost16x8;
int i_cost8x16;
/* [ref][0] is 16x16 mv, [ref][1..4] are 8x8 mv from partition [0..3] */
ALIGNED_4( int16_t mvc[32][5][2] );
} x264_mb_analysis_list_t;
typedef struct
@ -278,29 +264,31 @@ static uint16_t x264_cost_i4x4_mode[(QP_MAX+2)*32];
static int init_costs( x264_t *h, float *logs, int qp )
{
int lambda = x264_lambda_tab[qp];
if( h->cost_mv[qp] )
return 0;
int mv_range = h->param.analyse.i_mv_range;
int lambda = x264_lambda_tab[qp];
/* factor of 4 from qpel, 2 from sign, and 2 because mv can be opposite from mvp */
CHECKED_MALLOC( h->cost_mv[qp], (4*4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv[qp] += 2*4*2048;
for( int i = 0; i <= 2*4*2048; i++ )
CHECKED_MALLOC( h->cost_mv[qp], (4*4*mv_range + 1) * sizeof(uint16_t) );
h->cost_mv[qp] += 2*4*mv_range;
for( int i = 0; i <= 2*4*mv_range; i++ )
{
h->cost_mv[qp][-i] =
h->cost_mv[qp][i] = X264_MIN( lambda * logs[i] + .5f, (1<<16)-1 );
h->cost_mv[qp][i] = X264_MIN( (int)(lambda * logs[i] + .5f), UINT16_MAX );
}
x264_pthread_mutex_lock( &cost_ref_mutex );
for( int i = 0; i < 3; i++ )
for( int j = 0; j < 33; j++ )
x264_cost_ref[qp][i][j] = X264_MIN( i ? lambda * bs_size_te( i, j ) : 0, (1<<16)-1 );
x264_cost_ref[qp][i][j] = i ? X264_MIN( lambda * bs_size_te( i, j ), UINT16_MAX ) : 0;
x264_pthread_mutex_unlock( &cost_ref_mutex );
if( h->param.analyse.i_me_method >= X264_ME_ESA && !h->cost_mv_fpel[qp][0] )
{
for( int j = 0; j < 4; j++ )
{
CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*2048 + 1) * sizeof(uint16_t) );
h->cost_mv_fpel[qp][j] += 2*2048;
for( int i = -2*2048; i < 2*2048; i++ )
CHECKED_MALLOC( h->cost_mv_fpel[qp][j], (4*mv_range + 1) * sizeof(uint16_t) );
h->cost_mv_fpel[qp][j] += 2*mv_range;
for( int i = -2*mv_range; i < 2*mv_range; i++ )
h->cost_mv_fpel[qp][j][i] = h->cost_mv[qp][i*4+j];
}
}
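To put the new sizing in numbers: the table spans -2*4*mv_range .. +2*4*mv_range quarter-pel costs (the factor of 16 comes from the 4/2/2 breakdown in the comment above), so with e.g. i_mv_range = 512 each qp's cost_mv table holds 4*4*512 + 1 = 8193 uint16_t entries, roughly 16 KiB, and at the new maximum of 8192 (see the i_mv_range clamp later in this patch) it grows to 131073 entries, roughly 256 KiB per qp.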
@ -314,12 +302,13 @@ fail:
int x264_analyse_init_costs( x264_t *h )
{
float *logs = x264_malloc( (2*4*2048+1) * sizeof(float) );
int mv_range = h->param.analyse.i_mv_range;
float *logs = x264_malloc( (2*4*mv_range+1) * sizeof(float) );
if( !logs )
return -1;
logs[0] = 0.718f;
for( int i = 1; i <= 2*4*2048; i++ )
for( int i = 1; i <= 2*4*mv_range; i++ )
logs[i] = log2f( i+1 ) * 2.0f + 1.718f;
for( int qp = X264_MIN( h->param.rc.i_qp_min, QP_MAX_SPEC ); qp <= h->param.rc.i_qp_max; qp++ )
@ -338,13 +327,14 @@ fail:
void x264_analyse_free_costs( x264_t *h )
{
int mv_range = h->param.analyse.i_mv_range;
for( int i = 0; i < QP_MAX+1; i++ )
{
if( h->cost_mv[i] )
x264_free( h->cost_mv[i] - 2*4*2048 );
x264_free( h->cost_mv[i] - 2*4*mv_range );
if( h->cost_mv_fpel[i][0] )
for( int j = 0; j < 4; j++ )
x264_free( h->cost_mv_fpel[i][j] - 2*2048 );
x264_free( h->cost_mv_fpel[i][j] - 2*mv_range );
}
}
@ -465,11 +455,10 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
int i_fpel_border = 6; // umh: 1 for diamond, 2 for octagon, 2 for hpel
/* Calculate max allowed MV range */
#define CLIP_FMV(mv) x264_clip3( mv, -i_fmv_range, i_fmv_range-1 )
h->mb.mv_min[0] = 4*( -16*h->mb.i_mb_x - 24 );
h->mb.mv_max[0] = 4*( 16*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 24 );
h->mb.mv_min_spel[0] = CLIP_FMV( h->mb.mv_min[0] );
h->mb.mv_max_spel[0] = CLIP_FMV( h->mb.mv_max[0] );
h->mb.mv_min_spel[0] = X264_MAX( h->mb.mv_min[0], -i_fmv_range );
h->mb.mv_max_spel[0] = X264_MIN( h->mb.mv_max[0], i_fmv_range-1 );
if( h->param.b_intra_refresh && h->sh.i_type == SLICE_TYPE_P )
{
int max_x = (h->fref[0][0]->i_pir_end_col * 16 - 3)*4; /* 3 pixels of hpel border */
@ -513,9 +502,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
mb_y = (h->mb.i_mb_y >> j) + (i == 1);
h->mb.mv_miny_row[i] = 4*( -16*mb_y - 24 );
h->mb.mv_maxy_row[i] = 4*( 16*( (h->mb.i_mb_height>>j) - mb_y - 1 ) + 24 );
h->mb.mv_miny_spel_row[i] = x264_clip3( h->mb.mv_miny_row[i], -i_fmv_range, i_fmv_range );
h->mb.mv_maxy_spel_row[i] = CLIP_FMV( h->mb.mv_maxy_row[i] );
h->mb.mv_maxy_spel_row[i] = X264_MIN( h->mb.mv_maxy_spel_row[i], thread_mvy_range*4 );
h->mb.mv_miny_spel_row[i] = X264_MAX( h->mb.mv_miny_row[i], -i_fmv_range );
h->mb.mv_maxy_spel_row[i] = X264_MIN3( h->mb.mv_maxy_row[i], i_fmv_range-1, 4*thread_mvy_range );
h->mb.mv_miny_fpel_row[i] = (h->mb.mv_miny_spel_row[i]>>2) + i_fpel_border;
h->mb.mv_maxy_fpel_row[i] = (h->mb.mv_maxy_spel_row[i]>>2) - i_fpel_border;
}
@ -524,9 +512,8 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
{
h->mb.mv_min[1] = 4*( -16*mb_y - 24 );
h->mb.mv_max[1] = 4*( 16*( h->mb.i_mb_height - mb_y - 1 ) + 24 );
h->mb.mv_min_spel[1] = x264_clip3( h->mb.mv_min[1], -i_fmv_range, i_fmv_range );
h->mb.mv_max_spel[1] = CLIP_FMV( h->mb.mv_max[1] );
h->mb.mv_max_spel[1] = X264_MIN( h->mb.mv_max_spel[1], thread_mvy_range*4 );
h->mb.mv_min_spel[1] = X264_MAX( h->mb.mv_min[1], -i_fmv_range );
h->mb.mv_max_spel[1] = X264_MIN3( h->mb.mv_max[1], i_fmv_range-1, 4*thread_mvy_range );
h->mb.mv_limit_fpel[0][1] = (h->mb.mv_min_spel[1]>>2) + i_fpel_border;
h->mb.mv_limit_fpel[1][1] = (h->mb.mv_max_spel[1]>>2) - i_fpel_border;
}
@ -541,7 +528,6 @@ static void x264_mb_analyse_init( x264_t *h, x264_mb_analysis_t *a, int qp )
h->mb.mv_limit_fpel[0][1] = h->mb.mv_miny_fpel_row[i];
h->mb.mv_limit_fpel[1][1] = h->mb.mv_maxy_fpel_row[i];
}
#undef CLIP_FMV
a->l0.me16x16.cost =
a->l0.i_rd16x16 =
@ -713,8 +699,12 @@ static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
if( !h->mb.i_psy_rd )
return;
/* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
M128( &h->mb.pic.fenc_hadamard_cache[0] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[2] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[4] ) = M128_ZERO;
M128( &h->mb.pic.fenc_hadamard_cache[6] ) = M128_ZERO;
h->mb.pic.fenc_hadamard_cache[8] = 0;
if( b_satd )
h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
}
@ -743,8 +733,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
}
a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
+ h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
+ h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
return;
}
@ -759,8 +749,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE );
satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
for( ; *predict_mode >= 0; predict_mode++ )
{
@ -788,8 +778,8 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
}
/* we calculate the cost */
i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE ) +
h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE ) +
a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
a->i_satd_chroma_dir[i_mode] = i_satd;
@ -845,7 +835,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
if( a->i_satd_i16x16 <= i16x16_thresh )
{
h->predict_16x16[I_PRED_16x16_P]( p_dst );
a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
a->i_satd_i16x16_dir[I_PRED_16x16_P] = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE );
a->i_satd_i16x16_dir[I_PRED_16x16_P] += lambda * bs_size_ue(3);
COPY2_IF_LT( a->i_satd_i16x16, a->i_satd_i16x16_dir[I_PRED_16x16_P], a->i_predict16x16, 3 );
}
@ -862,7 +852,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
else
h->predict_16x16[i_mode]( p_dst );
i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE ) +
i_satd = h->pixf.mbcmp[PIXEL_16x16]( p_src, FENC_STRIDE, p_dst, FDEC_STRIDE ) +
lambda * bs_size_ue( x264_mb_pred_mode16x16_fix[i_mode] );
COPY2_IF_LT( a->i_satd_i16x16, i_satd, a->i_predict16x16, i_mode );
a->i_satd_i16x16_dir[i_mode] = i_satd;
@ -1065,7 +1055,7 @@ static void x264_mb_analyse_intra( x264_t *h, x264_mb_analysis_t *a, int i_satd_
else
h->predict_4x4[i_mode]( p_dst_by );
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_dst_by, FDEC_STRIDE, p_src_by, FENC_STRIDE );
i_satd = h->pixf.mbcmp[PIXEL_4x4]( p_src_by, FENC_STRIDE, p_dst_by, FDEC_STRIDE );
if( i_pred_mode == x264_mb_pred_mode4x4_fix(i_mode) )
{
i_satd -= lambda * 3;
@ -1735,7 +1725,7 @@ static void x264_mb_analyse_inter_p8x16( x264_t *h, x264_mb_analysis_t *a, int i
static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
pixel **p_fref, int i8x8, int size, int chroma )
{
ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
pixel *pix2 = pix1+8;
int i_stride = h->mb.pic.i_stride[1];
int chroma_h_shift = chroma <= CHROMA_422;
@ -1919,8 +1909,8 @@ static void x264_mb_analyse_inter_p4x8( x264_t *h, x264_mb_analysis_t *a, int i8
static ALWAYS_INLINE int x264_analyse_bi_chroma( x264_t *h, x264_mb_analysis_t *a, int idx, int i_pixel )
{
ALIGNED_ARRAY_N( pixel, pix, [4],[16*16] );
ALIGNED_ARRAY_N( pixel, bi, [2],[16*16] );
ALIGNED_ARRAY_32( pixel, pix, [4],[16*16] );
ALIGNED_ARRAY_32( pixel, bi, [2],[16*16] );
int i_chroma_cost = 0;
int chromapix = h->luma2chroma_pixel[i_pixel];
@ -2013,8 +2003,8 @@ static void x264_mb_analyse_inter_direct( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
{
ALIGNED_ARRAY_N( pixel, pix0,[16*16] );
ALIGNED_ARRAY_N( pixel, pix1,[16*16] );
ALIGNED_ARRAY_32( pixel, pix0,[16*16] );
ALIGNED_ARRAY_32( pixel, pix1,[16*16] );
pixel *src0, *src1;
intptr_t stride0 = 16, stride1 = 16;
int i_ref, i_mvc;
@ -2147,7 +2137,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
}
else
{
ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
ALIGNED_ARRAY_32( pixel, pixuv, [2],[16*FENC_STRIDE] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int v_shift = CHROMA_V_SHIFT;
@ -2483,7 +2473,7 @@ static void x264_mb_analyse_inter_b8x8( x264_t *h, x264_mb_analysis_t *a )
static void x264_mb_analyse_inter_b16x8( x264_t *h, x264_mb_analysis_t *a, int i_best_satd )
{
ALIGNED_ARRAY_N( pixel, pix,[2],[16*8] );
ALIGNED_ARRAY_32( pixel, pix,[2],[16*8] );
ALIGNED_4( int16_t mvc[3][2] );
h->mb.i_partition = D_16x8;

@ -801,7 +801,7 @@ void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat
static void ALWAYS_INLINE x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
@ -915,7 +915,7 @@ void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_
static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l );
@ -923,7 +923,7 @@ static ALWAYS_INLINE void x264_cabac_block_residual_8x8( x264_t *h, x264_cabac_t
}
static ALWAYS_INLINE void x264_cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
#if ARCH_X86_64 && HAVE_MMX
#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l );
@ -1057,29 +1057,29 @@ static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_
src = dst;
#define MUNGE_8x8_NNZ( MUNGE )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[0]] && !(h->mb.cbp[h->mb.i_mb_left_xy[0]] & 0x1000) )\
{\
MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x80 )\
MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x80 )\
MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x80 )\
MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x80 )\
MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x80 )\
MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x80 )\
MUNGE( nnzbak[0][0], h->mb.cache.non_zero_count[x264_scan8[16*0+ 0] - 1], 0x00 )\
MUNGE( nnzbak[0][1], h->mb.cache.non_zero_count[x264_scan8[16*0+ 2] - 1], 0x00 )\
MUNGE( nnzbak[1][0], h->mb.cache.non_zero_count[x264_scan8[16*1+ 0] - 1], 0x00 )\
MUNGE( nnzbak[1][1], h->mb.cache.non_zero_count[x264_scan8[16*1+ 2] - 1], 0x00 )\
MUNGE( nnzbak[2][0], h->mb.cache.non_zero_count[x264_scan8[16*2+ 0] - 1], 0x00 )\
MUNGE( nnzbak[2][1], h->mb.cache.non_zero_count[x264_scan8[16*2+ 2] - 1], 0x00 )\
}\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] )\
if( (h->mb.i_neighbour & MB_LEFT) && !h->mb.mb_transform_size[h->mb.i_mb_left_xy[1]] && !(h->mb.cbp[h->mb.i_mb_left_xy[1]] & 0x1000) )\
{\
MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x80 )\
MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x80 )\
MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x80 )\
MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x80 )\
MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x80 )\
MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x80 )\
MUNGE( nnzbak[0][2], h->mb.cache.non_zero_count[x264_scan8[16*0+ 8] - 1], 0x00 )\
MUNGE( nnzbak[0][3], h->mb.cache.non_zero_count[x264_scan8[16*0+10] - 1], 0x00 )\
MUNGE( nnzbak[1][2], h->mb.cache.non_zero_count[x264_scan8[16*1+ 8] - 1], 0x00 )\
MUNGE( nnzbak[1][3], h->mb.cache.non_zero_count[x264_scan8[16*1+10] - 1], 0x00 )\
MUNGE( nnzbak[2][2], h->mb.cache.non_zero_count[x264_scan8[16*2+ 8] - 1], 0x00 )\
MUNGE( nnzbak[2][3], h->mb.cache.non_zero_count[x264_scan8[16*2+10] - 1], 0x00 )\
}\
if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] )\
if( (h->mb.i_neighbour & MB_TOP) && !h->mb.mb_transform_size[h->mb.i_mb_top_xy] && !(h->mb.cbp[h->mb.i_mb_top_xy] & 0x1000) )\
{\
MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x80808080U )\
MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x80808080U )\
MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x80808080U )\
MUNGE( M32( &nnzbak[0][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*0] - 8] ), 0x00000000U )\
MUNGE( M32( &nnzbak[1][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*1] - 8] ), 0x00000000U )\
MUNGE( M32( &nnzbak[2][4] ), M32( &h->mb.cache.non_zero_count[x264_scan8[16*2] - 8] ), 0x00000000U )\
}
MUNGE_8x8_NNZ( BACKUP )

@ -444,11 +444,6 @@ static int x264_validate_parameters( x264_t *h, int b_open )
fail = 1;
}
#endif
if( !fail && !(cpuflags & X264_CPU_CMOV) )
{
x264_log( h, X264_LOG_ERROR, "your cpu does not support CMOV, but x264 was compiled with asm\n");
fail = 1;
}
if( fail )
{
x264_log( h, X264_LOG_ERROR, "to run x264, recompile without asm (configure --disable-asm)\n");
@ -494,7 +489,8 @@ static int x264_validate_parameters( x264_t *h, int b_open )
#endif
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
{
x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/NV21/I422/YV16/NV16/YUYV/UYVY/"
"I444/YV24/BGR/BGRA/RGB supported)\n" );
return -1;
}
@ -859,6 +855,11 @@ static int x264_validate_parameters( x264_t *h, int b_open )
h->param.analyse.inter &= ~X264_ANALYSE_I8x8;
h->param.analyse.intra &= ~X264_ANALYSE_I8x8;
}
if( i_csp >= X264_CSP_I444 && h->param.b_cabac )
{
/* Disable 8x8dct during 4:4:4+CABAC encoding for compatibility with libavcodec */
h->param.analyse.b_transform_8x8 = 0;
}
if( h->param.rc.i_rc_method == X264_RC_CQP )
{
float qp_p = h->param.rc.i_qp_constant;
@ -1170,7 +1171,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
if( h->param.analyse.i_mv_range <= 0 )
h->param.analyse.i_mv_range = l->mv_range >> PARAM_INTERLACED;
else
h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 512 >> PARAM_INTERLACED);
h->param.analyse.i_mv_range = x264_clip3(h->param.analyse.i_mv_range, 32, 8192 >> PARAM_INTERLACED);
}
h->param.analyse.i_weighted_pred = x264_clip3( h->param.analyse.i_weighted_pred, X264_WEIGHTP_NONE, X264_WEIGHTP_SMART );
@ -1530,6 +1531,12 @@ x264_t *x264_encoder_open( x264_param_t *param )
x264_rdo_init();
/* init CPU functions */
#if (ARCH_X86 || ARCH_X86_64) && HIGH_BIT_DEPTH
/* FIXME: Only 8-bit has been optimized for AVX-512 so far. The few AVX-512 functions
* enabled in high bit-depth are insignificant and just cause potential issues with

* unnecessary thermal throttling and whatnot, so keep it disabled for now. */
h->param.cpu &= ~X264_CPU_AVX512;
#endif
x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
@ -1566,9 +1573,15 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( !strcmp(x264_cpu_names[i].name, "SSE4.1")
&& (h->param.cpu & X264_CPU_SSE42) )
continue;
if( !strcmp(x264_cpu_names[i].name, "LZCNT")
&& (h->param.cpu & X264_CPU_BMI1) )
continue;
if( !strcmp(x264_cpu_names[i].name, "BMI1")
&& (h->param.cpu & X264_CPU_BMI2) )
continue;
if( !strcmp(x264_cpu_names[i].name, "FMA4")
&& (h->param.cpu & X264_CPU_FMA3) )
continue;
if( (h->param.cpu & x264_cpu_names[i].flags) == x264_cpu_names[i].flags
&& (!i || x264_cpu_names[i].flags != x264_cpu_names[i-1].flags) )
p += sprintf( p, " %s", x264_cpu_names[i].name );
@ -1580,14 +1593,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
if( x264_analyse_init_costs( h ) )
goto fail;
static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
/* Checks for known miscompilation issues. */
if( h->cost_mv[X264_LOOKAHEAD_QP][2013] != cost_mv_correct[BIT_DEPTH-8] )
{
x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
goto fail;
}
/* Must be volatile or else GCC will optimize it out. */
volatile int temp = 392;
if( x264_clz( temp ) != 23 )
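As a quick check of the remaining sanity test: 392 is 0b110001000, which occupies 9 bits, so a 32-bit count-leading-zeros gives 32 - 9 = 23, the value the comparison expects.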

@ -128,8 +128,8 @@ static void x264_mb_encode_i16x16( x264_t *h, int p, int i_qp )
pixel *p_src = h->mb.pic.p_fenc[p];
pixel *p_dst = h->mb.pic.p_fdec[p];
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct_dc4x4,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct_dc4x4,[16] );
int nz, block_cbp = 0;
int decimate_score = h->mb.b_dct_decimate ? 0 : 9;
@ -283,13 +283,10 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
{
int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
int ssd[2];
ALIGNED_ARRAY_8( int, ssd,[2] );
int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
if( score < thresh*4 )
score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
if( score < thresh*4 )
if( h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], ssd ) < thresh*4 )
{
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
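The single var2 call above now covers both chroma planes at once over the fixed FENC/FDEC strides. A rough scalar model of what the new kernels appear to compute; the side-by-side U/V layout and the strides are assumptions based on this call site, not a documented contract:

#include <stdint.h>

/* Sketch of the assumed var2_8x8 contract: the U and V 8x8 blocks sit side by
 * side in the fenc (stride 16) and fdec (stride 32) buffers, the per-plane
 * SSDs are written to ssd[], and a combined variance-style score is returned. */
static int var2_8x8_ref( const uint8_t *fenc, const uint8_t *fdec, int ssd[2] )
{
    int score = 0;
    for( int plane = 0; plane < 2; plane++ )
    {
        int sum = 0, sqr = 0;
        for( int y = 0; y < 8; y++ )
            for( int x = 0; x < 8; x++ )
            {
                int d = fenc[8*plane + x + y*16] - fdec[16*plane + x + y*32];
                sum += d;
                sqr += d*d;
            }
        ssd[plane] = sqr;
        score += sqr - ((sum * sum) >> 6); /* 6 = log2(8*8) */
    }
    return score;
}

The decimation decision above then compares this combined score against thresh*4, replacing the two separate per-plane calls of the old code.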
@ -350,7 +347,7 @@ static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter
int i_decimate_score = b_decimate ? 0 : 7;
int nz_ac = 0;
ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
if( h->mb.b_lossless )
{
@ -561,9 +558,16 @@ void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_m
pixel *p_src = h->mb.pic.p_fenc_plane[p] + block_idx_x[idx]*4 + block_idx_y[idx]*4 * stride;
if( i_mode == I_PRED_4x4_V )
{
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-stride, stride, 4 );
memcpy( p_dst, p_dst-FDEC_STRIDE, 4*sizeof(pixel) );
}
else if( i_mode == I_PRED_4x4_H )
{
h->mc.copy[PIXEL_4x4]( p_dst, FDEC_STRIDE, p_src-1, stride, 4 );
for( int i = 0; i < 4; i++ )
p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
}
else
h->predict_4x4[i_mode]( p_dst );
}
@ -574,9 +578,16 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m
pixel *p_src = h->mb.pic.p_fenc_plane[p] + (idx&1)*8 + (idx>>1)*8*stride;
if( i_mode == I_PRED_8x8_V )
{
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-stride, stride, 8 );
memcpy( p_dst, &edge[16], 8*sizeof(pixel) );
}
else if( i_mode == I_PRED_8x8_H )
{
h->mc.copy[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src-1, stride, 8 );
for( int i = 0; i < 8; i++ )
p_dst[i*FDEC_STRIDE] = edge[14-i];
}
else
h->predict_8x8[i_mode]( p_dst, edge );
}
@ -584,12 +595,21 @@ void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_m
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode )
{
int stride = h->fenc->i_stride[p] << MB_INTERLACED;
pixel *p_dst = h->mb.pic.p_fdec[p];
if( i_mode == I_PRED_16x16_V )
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
{
h->mc.copy[PIXEL_16x16]( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-stride, stride, 16 );
memcpy( p_dst, p_dst-FDEC_STRIDE, 16*sizeof(pixel) );
}
else if( i_mode == I_PRED_16x16_H )
h->mc.copy_16x16_unaligned( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
{
h->mc.copy_16x16_unaligned( p_dst, FDEC_STRIDE, h->mb.pic.p_fenc_plane[p]-1, stride, 16 );
for( int i = 0; i < 16; i++ )
p_dst[i*FDEC_STRIDE] = p_dst[i*FDEC_STRIDE-1];
}
else
h->predict_16x16[i_mode]( h->mb.pic.p_fdec[p] );
h->predict_16x16[i_mode]( p_dst );
}
/*****************************************************************************
@ -780,7 +800,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else if( h->mb.b_transform_8x8 )
{
ALIGNED_ARRAY_N( dctcoef, dct8x8,[4],[64] );
ALIGNED_ARRAY_64( dctcoef, dct8x8,[4],[64] );
b_decimate &= !h->mb.b_trellis || !h->param.b_cabac; // 8x8 trellis is inherently optimal decimation for CABAC
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
@ -824,7 +844,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_internal( x264_t *h, int plane_
}
else
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16],[16] );
for( int p = 0; p < plane_count; p++, i_qp = h->mb.i_chroma_qp )
{
int quant_cat = p ? CQM_4PC : CQM_4PY;
@ -965,8 +985,8 @@ void x264_macroblock_encode( x264_t *h )
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_64( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
int i_qp = h->mb.i_qp;
@ -1219,7 +1239,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
int quant_cat = p ? CQM_8PC : CQM_8PY;
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
int nnz8x8 = x264_quant_8x8( h, dct8x8, i_qp, ctx_cat_plane[DCT_LUMA_8x8][p], 0, p, i8 );
@ -1252,7 +1272,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
pixel *p_fenc = h->mb.pic.p_fenc[p] + 8*x + 8*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[p] + 8*x + 8*y*FDEC_STRIDE;
int i_decimate_8x8 = b_decimate ? 0 : 4;
ALIGNED_ARRAY_N( dctcoef, dct4x4,[4],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[4],[16] );
int nnz8x8 = 0;
h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
@ -1311,7 +1331,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p8x8_internal( x264_t *h, int i
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[2],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[2],[16] );
pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
@ -1376,7 +1396,7 @@ static ALWAYS_INLINE void x264_macroblock_encode_p4x4_internal( x264_t *h, int i
}
else
{
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
nz = x264_quant_4x4( h, dct4x4, i_qp, ctx_cat_plane[DCT_LUMA_4x4][p], 0, p, i4 );
h->mb.cache.non_zero_count[x264_scan8[p*16+i4]] = nz;

@ -55,6 +55,9 @@ void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
int ctx_block_cat, int b_intra, int idx );
@ -113,7 +116,7 @@ static ALWAYS_INLINE void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][block_idx_xy_fenc[idx]];
pixel *p_dst = &h->mb.pic.p_fdec[p][block_idx_xy_fdec[idx]];
ALIGNED_ARRAY_N( dctcoef, dct4x4,[16] );
ALIGNED_ARRAY_64( dctcoef, dct4x4,[16] );
if( b_predict )
{
@ -151,7 +154,7 @@ static ALWAYS_INLINE void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_
int nz;
pixel *p_src = &h->mb.pic.p_fenc[p][8*x + 8*y*FENC_STRIDE];
pixel *p_dst = &h->mb.pic.p_fdec[p][8*x + 8*y*FDEC_STRIDE];
ALIGNED_ARRAY_N( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_64( dctcoef, dct8x8,[64] );
ALIGNED_ARRAY_32( pixel, edge_buf,[36] );
if( b_predict )

@ -191,7 +191,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
int omx, omy, pmx, pmy;
pixel *p_fenc = m->p_fenc[0];
pixel *p_fref_w = m->p_fref_w;
ALIGNED_ARRAY_N( pixel, pix,[16*16] );
ALIGNED_ARRAY_32( pixel, pix,[16*16] );
ALIGNED_ARRAY_8( int16_t, mvc_temp,[16],[2] );
ALIGNED_ARRAY_16( int, costs,[16] );
@ -875,7 +875,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
int chroma_v_shift = CHROMA_V_SHIFT;
int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
ALIGNED_ARRAY_N( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
ALIGNED_ARRAY_32( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
ALIGNED_ARRAY_16( int, costs,[4] );
int bmx = m->mv[0];
@ -1034,9 +1034,9 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
ALIGNED_ARRAY_N( pixel, pixy_buf,[2],[9][16*16] );
ALIGNED_ARRAY_N( pixel, pixu_buf,[2],[9][16*16] );
ALIGNED_ARRAY_N( pixel, pixv_buf,[2],[9][16*16] );
ALIGNED_ARRAY_32( pixel, pixy_buf,[2],[9][16*16] );
ALIGNED_ARRAY_32( pixel, pixu_buf,[2],[9][16*16] );
ALIGNED_ARRAY_32( pixel, pixv_buf,[2],[9][16*16] );
pixel *src[3][2][9];
int chromapix = h->luma2chroma_pixel[i_pixel];
int chroma_v_shift = CHROMA_V_SHIFT;
@ -1059,7 +1059,7 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m
uint64_t bcostrd = COST_MAX64;
uint16_t amvd;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_N( uint8_t, visited,[8],[8][8] );
ALIGNED_ARRAY_64( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
ALIGNED_4( static const int8_t dia4d[33][4] ) =
{

@ -32,10 +32,10 @@
typedef struct
{
/* aligning the first member is a gcc hack to force the struct to be
* 16 byte aligned, as well as force sizeof(struct) to be a multiple of 16 */
/* aligning the first member is a gcc hack to force the struct to be aligned,
* as well as force sizeof(struct) to be a multiple of the alignment. */
/* input */
ALIGNED_16( int i_pixel ); /* PIXEL_WxH */
ALIGNED_64( int i_pixel ); /* PIXEL_WxH */
uint16_t *p_cost_mv; /* lambda * nbits for each possible mv */
int i_ref_cost;
int i_ref;
@ -53,7 +53,7 @@ typedef struct
int cost_mv; /* lambda * nbits for the chosen mv */
int cost; /* satd + lambda * nbits */
ALIGNED_4( int16_t mv[2] );
} ALIGNED_16( x264_me_t );
} ALIGNED_64( x264_me_t );
void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int *p_fullpel_thresh );
#define x264_me_search( h, m, mvc, i_mvc )\
@ -66,8 +66,6 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
extern uint16_t *x264_cost_mv_fpel[QP_MAX+1][4];
#define COPY1_IF_LT(x,y)\
if( (y) < (x) )\
(x) = (y);

@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
stride <<= b_field;
if( b_chroma )
{
ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
ALIGNED_ARRAY_32( pixel, pix,[FENC_STRIDE*16] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int shift = 7 - CHROMA_V_SHIFT;
@ -420,7 +420,7 @@ static int x264_macroblock_tree_rescale_init( x264_t *h, x264_ratecontrol_t *rc
float dstdim[2] = { h->param.i_width / 16.f, h->param.i_height / 16.f};
int srcdimi[2] = {ceil(srcdim[0]), ceil(srcdim[1])};
int dstdimi[2] = {ceil(dstdim[0]), ceil(dstdim[1])};
if( PARAM_INTERLACED )
if( h->param.b_interlaced || h->param.b_fake_interlaced )
{
srcdimi[1] = (srcdimi[1]+1)&~1;
dstdimi[1] = (dstdimi[1]+1)&~1;
@ -1469,7 +1469,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
if( h->i_frame == 0 )
{
//384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
double fr = 1. / 172;
double fr = 1. / (h->param.i_level_idc >= 60 ? 300 : 172);
int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
}

@ -58,8 +58,6 @@ int x264_ratecontrol_qp( x264_t * );
int x264_ratecontrol_mb_qp( x264_t *h );
int x264_ratecontrol_end( x264_t *, int bits, int *filler );
void x264_ratecontrol_summary( x264_t * );
void x264_ratecontrol_set_estimated_size( x264_t *, int bits );
int x264_ratecontrol_get_estimated_size( x264_t const *);
int x264_rc_analyse_slice( x264_t *h );
void x264_threads_distribute_ratecontrol( x264_t *h );
void x264_threads_merge_ratecontrol( x264_t *h );

@ -64,9 +64,8 @@ static uint16_t cabac_size_5ones[128];
#include "cabac.c"
#define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) - (CHROMA444 ? 0 : (1024+12)-460) )
#define COPY_CABAC_PART( pos, size )\
memcpy( &cb->state[pos], &h->cabac.state[pos], size )
sizeof(int) + (CHROMA444 ? 1024+12 : 460) )
#define COPY_CABAC_PART( pos, size ) memcpy( &cb->state[pos], &h->cabac.state[pos], size )
static ALWAYS_INLINE uint64_t cached_hadamard( x264_t *h, int size, int x, int y )
{
@ -634,8 +633,8 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
const uint8_t *zigzag, int ctx_block_cat, int lambda2, int b_ac,
int b_chroma, int dc, int num_coefs, int idx )
{
ALIGNED_ARRAY_N( dctcoef, orig_coefs, [64] );
ALIGNED_ARRAY_N( dctcoef, quant_coefs, [64] );
ALIGNED_ARRAY_64( dctcoef, orig_coefs, [64] );
ALIGNED_ARRAY_64( dctcoef, quant_coefs, [64] );
const uint32_t *coef_weight1 = num_coefs == 64 ? x264_dct8_weight_tab : x264_dct4_weight_tab;
const uint32_t *coef_weight2 = num_coefs == 64 ? x264_dct8_weight2_tab : x264_dct4_weight2_tab;
const int b_interlaced = MB_INTERLACED;
@ -695,7 +694,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
return !!dct[0];
}
#if HAVE_MMX && ARCH_X86_64
#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ )
#define TRELLIS_ARGS unquant_mf, zigzag, lambda2, last_nnz, orig_coefs, quant_coefs, dct,\
cabac_state_sig, cabac_state_last, M64(cabac_state), M16(cabac_state+8)
if( num_coefs == 16 && !dc )

@ -800,6 +800,9 @@ const x264_level_t x264_levels[] =
{ 50, 589824, 22080, 110400, 135000, 135000, 512, 16, 24, 2, 1, 1, 1 },
{ 51, 983040, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
{ 52, 2073600, 36864, 184320, 240000, 240000, 512, 16, 24, 2, 1, 1, 1 },
{ 60, 4177920, 139264, 696320, 240000, 240000, 8192, 16, 24, 2, 1, 1, 1 },
{ 61, 8355840, 139264, 696320, 480000, 480000, 8192, 16, 24, 2, 1, 1, 1 },
{ 62, 16711680, 139264, 696320, 800000, 800000, 8192, 16, 24, 2, 1, 1, 1 },
{ 0 }
};
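As a rough cross-check of the new rows: level 6.0's MaxFS of 139264 macroblocks corresponds to about 35.6 million luma samples (139264 * 256), enough for an 8192x4320 frame, and the vertical MV range column grows from 512 to 8192, matching the relaxed i_mv_range clamp elsewhere in this patch.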

@ -267,7 +267,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
int i_lines = fenc->i_lines[p];
int i_width = fenc->i_width[p];
pixel *src = fenc->plane[p];
ALIGNED_ARRAY_16( pixel, buf, [16*16] );
ALIGNED_ARRAY_64( pixel, buf, [16*16] );
int pixoff = 0;
if( w )
{
@ -544,17 +544,18 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
if( p0 == p1 )
goto lowres_intra_mb;
int mv_range = 2 * h->param.analyse.i_mv_range;
// no need for h->mb.mv_min[]
h->mb.mv_limit_fpel[0][0] = -8*h->mb.i_mb_x - 4;
h->mb.mv_limit_fpel[1][0] = 8*( h->mb.i_mb_width - h->mb.i_mb_x - 1 ) + 4;
h->mb.mv_min_spel[0] = 4*( h->mb.mv_limit_fpel[0][0] - 8 );
h->mb.mv_max_spel[0] = 4*( h->mb.mv_limit_fpel[1][0] + 8 );
h->mb.mv_min_spel[0] = X264_MAX( 4*(-8*h->mb.i_mb_x - 12), -mv_range );
h->mb.mv_max_spel[0] = X264_MIN( 4*(8*(h->mb.i_mb_width - h->mb.i_mb_x - 1) + 12), mv_range-1 );
h->mb.mv_limit_fpel[0][0] = h->mb.mv_min_spel[0] >> 2;
h->mb.mv_limit_fpel[1][0] = h->mb.mv_max_spel[0] >> 2;
if( h->mb.i_mb_x >= h->mb.i_mb_width - 2 )
{
h->mb.mv_limit_fpel[0][1] = -8*h->mb.i_mb_y - 4;
h->mb.mv_limit_fpel[1][1] = 8*( h->mb.i_mb_height - h->mb.i_mb_y - 1 ) + 4;
h->mb.mv_min_spel[1] = 4*( h->mb.mv_limit_fpel[0][1] - 8 );
h->mb.mv_max_spel[1] = 4*( h->mb.mv_limit_fpel[1][1] + 8 );
h->mb.mv_min_spel[1] = X264_MAX( 4*(-8*h->mb.i_mb_y - 12), -mv_range );
h->mb.mv_max_spel[1] = X264_MIN( 4*(8*( h->mb.i_mb_height - h->mb.i_mb_y - 1) + 12), mv_range-1 );
h->mb.mv_limit_fpel[0][1] = h->mb.mv_min_spel[1] >> 2;
h->mb.mv_limit_fpel[1][1] = h->mb.mv_max_spel[1] >> 2;
}
#define LOAD_HPELS_LUMA(dst, src) \
@ -728,13 +729,13 @@ lowres_intra_mb:
if( h->param.analyse.i_subpel_refine > 1 )
{
h->predict_8x8c[I_PRED_CHROMA_P]( pix );
int satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
int satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
h->predict_8x8_filter( pix, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
for( int i = 3; i < 9; i++ )
{
h->predict_8x8[i]( pix, edge );
satd = h->pixf.mbcmp[PIXEL_8x8]( pix, FDEC_STRIDE, h->mb.pic.p_fenc[0], FENC_STRIDE );
satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE );
i_icost = X264_MIN( i_icost, satd );
}
}

@ -154,10 +154,12 @@ static int convert_csp_to_pix_fmt( int csp )
case X264_CSP_RGB: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_RGB48 : AV_PIX_FMT_RGB24;
case X264_CSP_BGR: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGR48 : AV_PIX_FMT_BGR24;
case X264_CSP_BGRA: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_BGRA64 : AV_PIX_FMT_BGRA;
/* the next csp has no equivalent 16bit depth in swscale */
/* the following has no equivalent 16-bit depth in swscale */
case X264_CSP_NV12: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV12;
case X264_CSP_NV21: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_NV21;
/* the next csp is no supported by swscale at all */
case X264_CSP_YUYV: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_YUYV422;
case X264_CSP_UYVY: return csp&X264_CSP_HIGH_DEPTH ? AV_PIX_FMT_NONE : AV_PIX_FMT_UYVY422;
/* the following is not supported by swscale at all */
case X264_CSP_NV16:
default: return AV_PIX_FMT_NONE;
}

@ -43,6 +43,8 @@ const x264_cli_csp_t x264_cli_csps[] = {
[X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 },
[X264_CSP_NV21] = { "nv21", 2, { 1, 1 }, { 1, .5 }, 2, 2 },
[X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 },
[X264_CSP_YUYV] = { "yuyv", 1, { 2 }, { 1 }, 2, 1 },
[X264_CSP_UYVY] = { "uyvy", 1, { 2 }, { 1 }, 2, 1 },
[X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 },
[X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 },
[X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 },

@ -98,6 +98,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
uint64_t size = ftell( h->fh );
fseek( h->fh, 0, SEEK_SET );
info->num_frames = size / h->frame_size;
FAIL_IF_ERROR( !info->num_frames, "empty input file\n" );
/* Attempt to use memory-mapped input frames if possible */
if( !(h->bit_depth & 7) )

@ -223,6 +223,7 @@ static int open_file( char *psz_filename, hnd_t *p_handle, video_info_t *info, c
uint64_t i_size = ftell( h->fh );
fseek( h->fh, init_pos, SEEK_SET );
info->num_frames = (i_size - h->seq_header_len) / h->frame_size;
FAIL_IF_ERROR( !info->num_frames, "empty input file\n" );
/* Attempt to use memory-mapped input frames if possible */
if( !(h->bit_depth & 7) )

@ -153,7 +153,11 @@ cglobal checkasm_call, 2,15,16,max_args*8+8
mov r9, rax
mov r10, rdx
lea r0, [error_message]
%if FORMAT_ELF
call puts wrt ..plt
%else
call puts
%endif
mov r1, [rsp+max_args*8]
mov dword [r1], 0
mov rdx, r10
@ -221,3 +225,14 @@ cglobal stack_pagealign, 2,2
leave
RET
; Trigger a warmup of vector units
%macro WARMUP 0
cglobal checkasm_warmup, 0,0
xorps m0, m0
RET
%endmacro
INIT_YMM avx
WARMUP
INIT_ZMM avx512
WARMUP

@ -25,9 +25,7 @@
#include "../common/aarch64/asm.S"
.section .rodata
.align 4
register_init:
const register_init, align=4
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
@ -46,10 +44,12 @@ register_init:
.quad 0xd229e1f5b281303f
.quad 0x71aeaff20b095fd9
.quad 0xab63e2e11fa38ed9
endconst
error_message:
const error_message
.asciz "failed to preserve register"
endconst
.text
@ -149,7 +149,7 @@ function x264_checkasm_call, export=1
mov w9, #0
str w9, [x2]
movrel x0, error_message
bl puts
bl X(puts)
0:
ldp x0, x1, [sp], #16
ldp d14, d15, [sp], #16

@ -25,9 +25,7 @@
#include "../common/arm/asm.S"
.section .rodata
.align 4
register_init:
const register_init, align=4
.quad 0x21f86d66c8ca00ce
.quad 0x75b6ba21077c48ad
.quad 0xed56bb2dcb3c7736
@ -36,9 +34,11 @@ register_init:
.quad 0xdf9a54b303f1d3a3
.quad 0x4a75479abd64e097
.quad 0x249214109d5d1c88
endconst
error_message:
const error_message
.asciz "failed to preserve register"
endconst
.text

@ -28,6 +28,7 @@
#include <ctype.h>
#include "common/common.h"
#include "common/cpu.h"
#include "encoder/macroblock.h"
#ifdef _WIN32
#include <windows.h>
@ -56,8 +57,7 @@ int quiet = 0;
if( !ok ) ret = -1; \
}
#define BENCH_RUNS 100 // tradeoff between accuracy and speed
#define BENCH_ALIGNS 16 // number of stack+heap data alignments (another accuracy vs speed tradeoff)
#define BENCH_RUNS 2000 // tradeoff between accuracy and speed
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
@ -99,7 +99,7 @@ static inline uint32_t read_time(void)
: "=a"(a) :: "edx", "memory" );
#elif ARCH_PPC
asm volatile( "mftb %0" : "=r"(a) :: "memory" );
#elif ARCH_ARM // ARMv7 only
#elif HAVE_ARM_INLINE_ASM // ARMv7 only
asm volatile( "mrc p15, 0, %0, c9, c13, 0" : "=r"(a) :: "memory" );
#elif ARCH_AARCH64
uint64_t b = 0;
@ -177,7 +177,10 @@ static void print_bench(void)
continue;
printf( "%s_%s%s: %"PRId64"\n", benchs[i].name,
#if HAVE_MMX
b->cpu&X264_CPU_AVX512 ? "avx512" :
b->cpu&X264_CPU_AVX2 ? "avx2" :
b->cpu&X264_CPU_BMI2 ? "bmi2" :
b->cpu&X264_CPU_BMI1 ? "bmi1" :
b->cpu&X264_CPU_FMA3 ? "fma3" :
b->cpu&X264_CPU_FMA4 ? "fma4" :
b->cpu&X264_CPU_XOP ? "xop" :
@ -186,6 +189,7 @@ static void print_bench(void)
b->cpu&X264_CPU_SSE4 ? "sse4" :
b->cpu&X264_CPU_SSSE3 ? "ssse3" :
b->cpu&X264_CPU_SSE3 ? "sse3" :
b->cpu&X264_CPU_LZCNT ? "lzcnt" :
/* print sse2slow only if there's also a sse2fast version of the same func */
b->cpu&X264_CPU_SSE2_IS_SLOW && j<MAX_CPUS-1 && b[1].cpu&X264_CPU_SSE2_IS_FAST && !(b[1].cpu&X264_CPU_SSE3) ? "sse2slow" :
b->cpu&X264_CPU_SSE2 ? "sse2" :
@ -208,10 +212,7 @@ static void print_bench(void)
b->cpu&X264_CPU_SLOW_ATOM && b->cpu&X264_CPU_CACHELINE_64 ? "_c64_atom" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
b->cpu&X264_CPU_SLOW_SHUFFLE ? "_slowshuffle" :
b->cpu&X264_CPU_LZCNT ? "_lzcnt" :
b->cpu&X264_CPU_BMI2 ? "_bmi2" :
b->cpu&X264_CPU_BMI1 ? "_bmi1" :
b->cpu&X264_CPU_SLOW_CTZ ? "_slow_ctz" :
b->cpu&X264_CPU_LZCNT && b->cpu&X264_CPU_SSE3 && !(b->cpu&X264_CPU_BMI1) ? "_lzcnt" :
b->cpu&X264_CPU_SLOW_ATOM ? "_atom" :
#elif ARCH_ARM
b->cpu&X264_CPU_FAST_NEON_MRC ? "_fast_mrc" :
@ -221,8 +222,18 @@ static void print_bench(void)
}
}
/* YMM and ZMM registers on x86 are turned off to save power when they haven't been
* used for some period of time. When they are used there will be a "warmup" period
* during which performance will be reduced and inconsistent which is problematic when
* trying to benchmark individual functions. We can work around this by periodically
* issuing "dummy" instructions that uses those registers to keep them powered on. */
static void (*simd_warmup_func)( void ) = NULL;
#define simd_warmup() do { if( simd_warmup_func ) simd_warmup_func(); } while( 0 )
#if ARCH_X86 || ARCH_X86_64
int x264_stack_pagealign( int (*func)(), int align );
void x264_checkasm_warmup_avx( void );
void x264_checkasm_warmup_avx512( void );
/* detect when callee-saved regs aren't saved
* needs an explicit asm check because it only sometimes crashes in normal use. */
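For context, a minimal C sketch of the warmup idea described in the comment above, expressed with AVX compiler intrinsics instead of the hand-written WARMUP asm; the function and loop names here are hypothetical and only the pattern (touch a YMM register before every timed call, mirroring simd_warmup()) comes from the commit.

#include <immintrin.h>

static volatile float warmup_sink;

/* Dummy 256-bit operation: keeps the YMM units powered so the first timed
 * iterations are not slowed down by the vector-unit wakeup period. */
static void warmup_avx_sketch( void )
{
    __m256 v = _mm256_set1_ps( 1.0f );
    warmup_sink = _mm256_cvtss_f32( v ); /* prevent the op from being optimized away */
}

/* Hypothetical benchmark loop following the same pattern as the macros above. */
static void bench_one( void (*func)( void ), int runs )
{
    for( int i = 0; i < runs; i++ )
    {
        warmup_avx_sketch(); /* analogous to simd_warmup() before each call */
        func();              /* the timed call */
    }
}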
@ -257,6 +268,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
#define call_a1(func,...) ({ \
uint64_t r = (rand() & 0xffff) * 0x0001000100010001ULL; \
x264_checkasm_stack_clobber( r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r,r ); /* max_args+6 */ \
simd_warmup(); \
x264_checkasm_call(( intptr_t(*)())func, &ok, 0, 0, 0, 0, __VA_ARGS__ ); })
#elif ARCH_AARCH64 && !defined(__APPLE__)
void x264_checkasm_stack_clobber( uint64_t clobber, ... );
@ -284,6 +296,7 @@ void x264_checkasm_stack_clobber( uint64_t clobber, ... );
call_a1(func, __VA_ARGS__);\
for( int ti = 0; ti < (cpu?BENCH_RUNS:BENCH_RUNS/4); ti++ )\
{\
simd_warmup();\
uint32_t t = read_time();\
func(__VA_ARGS__);\
func(__VA_ARGS__);\
@ -357,8 +370,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
used_asm = 1; \
for( int j = 0; j < 64; j++ ) \
{ \
res_c = call_c( pixel_c.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; \
res_c = call_c( pixel_c.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
@ -493,15 +507,17 @@ static int check_pixel( int cpu_ref, int cpu_new )
#define TEST_PIXEL_VAR2( i ) \
if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
{ \
int res_c, res_asm, ssd_c, ssd_asm; \
int res_c, res_asm; \
ALIGNED_ARRAY_8( int, ssd_c, [2] ); \
ALIGNED_ARRAY_8( int, ssd_asm,[2] ); \
set_func_name( "%s_%s", "var2", pixel_names[i] ); \
used_asm = 1; \
res_c = call_c( pixel_c.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_c ); \
res_asm = call_a( pixel_asm.var2[i], pbuf1, (intptr_t)16, pbuf2, (intptr_t)16, &ssd_asm ); \
if( res_c != res_asm || ssd_c != ssd_asm ) \
res_c = call_c( pixel_c.var2[i], pbuf1, pbuf2, ssd_c ); \
res_asm = call_a( pixel_asm.var2[i], pbuf1, pbuf2, ssd_asm ); \
if( res_c != res_asm || memcmp( ssd_c, ssd_asm, 2*sizeof(int) ) ) \
{ \
ok = 0; \
fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
fprintf( stderr, "var2[%d]: {%d, %d, %d} != {%d, %d, %d} [FAILED]\n", i, res_c, ssd_c[0], ssd_c[1], res_asm, ssd_asm[0], ssd_asm[1] ); \
} \
}
@ -826,10 +842,10 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_dct_function_t dct_asm;
x264_quant_function_t qf;
int ret = 0, ok, used_asm, interlace = 0;
ALIGNED_ARRAY_N( dctcoef, dct1, [16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct2, [16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct4, [16],[16] );
ALIGNED_ARRAY_N( dctcoef, dct8, [4],[64] );
ALIGNED_ARRAY_64( dctcoef, dct1, [16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct2, [16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct4, [16],[16] );
ALIGNED_ARRAY_64( dctcoef, dct8, [4],[64] );
ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_t *h = &h_buf;
@ -1031,8 +1047,8 @@ static int check_dct( int cpu_ref, int cpu_new )
x264_zigzag_function_t zigzag_ref[2];
x264_zigzag_function_t zigzag_asm[2];
ALIGNED_ARRAY_16( dctcoef, level1,[64] );
ALIGNED_ARRAY_16( dctcoef, level2,[64] );
ALIGNED_ARRAY_64( dctcoef, level1,[64] );
ALIGNED_ARRAY_64( dctcoef, level2,[64] );
#define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
if( zigzag_asm[interlace].name != zigzag_ref[interlace].name ) \
@ -1370,6 +1386,8 @@ static int check_mc( int cpu_ref, int cpu_new )
}
report( "mc offsetsub :" );
memset( pbuf3, 0, 64*16 );
memset( pbuf4, 0, 64*16 );
ok = 1; used_asm = 0;
for( int height = 8; height <= 16; height += 8 )
{
@ -1377,8 +1395,6 @@ static int check_mc( int cpu_ref, int cpu_new )
{
set_func_name( "store_interleave_chroma" );
used_asm = 1;
memset( pbuf3, 0, 64*height );
memset( pbuf4, 0, 64*height );
call_c( mc_c.store_interleave_chroma, pbuf3, (intptr_t)64, pbuf1, pbuf1+16, height );
call_a( mc_a.store_interleave_chroma, pbuf4, (intptr_t)64, pbuf1, pbuf1+16, height );
if( memcmp( pbuf3, pbuf4, 64*height ) )
@ -1525,6 +1541,33 @@ static int check_mc( int cpu_ref, int cpu_new )
}
}
if( mc_a.plane_copy_deinterleave_yuyv != mc_ref.plane_copy_deinterleave_yuyv )
{
set_func_name( "plane_copy_deinterleave_yuyv" );
used_asm = 1;
for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
intptr_t dst_stride = ALIGN( w, 32/sizeof(pixel) );
intptr_t src_stride = (plane_specs[i].src_stride + 1) >> 1;
intptr_t offv = dst_stride*h;
pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
memset( pbuf3, 0, 0x1000 );
memset( pbuf4, 0, 0x1000 );
/* Skip benchmarking since it's the same as plane_copy_deinterleave(), just verify correctness. */
call_c1( mc_c.plane_copy_deinterleave_yuyv, pbuf3, dst_stride, pbuf3+offv, dst_stride, src1, src_stride, w, h );
call_a1( mc_a.plane_copy_deinterleave_yuyv, pbuf4, dst_stride, pbuf4+offv, dst_stride, src1, src_stride, w, h );
for( int y = 0; y < h; y++ )
if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) ||
memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(pixel) ) )
{
fprintf( stderr, "plane_copy_deinterleave_yuyv FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
break;
}
}
}
if( mc_a.plane_copy_deinterleave_rgb != mc_ref.plane_copy_deinterleave_rgb )
{
set_func_name( "plane_copy_deinterleave_rgb" );
@ -1565,7 +1608,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
int w = (plane_specs[i].w + 1) >> 1;
int h = plane_specs[i].h;
intptr_t dst_stride = ALIGN( w, 16 );
intptr_t dst_stride = ALIGN( w, 32 );
intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
intptr_t offv = dst_stride*h + 32;
memset( pbuf3, 0, 0x1000 );
@ -1703,7 +1746,7 @@ static int check_mc( int cpu_ref, int cpu_new )
{
ok &= abs( dstc[j]-dsta[j] ) <= 1 || fabs( (double)dstc[j]/dsta[j]-1 ) < 1e-4;
if( !ok )
fprintf( stderr, "mbtree_propagate_cost FAILED: %f !~= %f\n", (double)dstc[j], (double)dsta[j] );
fprintf( stderr, "mbtree_propagate_cost FAILED: %d !~= %d\n", dstc[j], dsta[j] );
}
}
}
@ -1722,15 +1765,16 @@ static int check_mc( int cpu_ref, int cpu_new )
h.mb.i_mb_width = width;
h.mb.i_mb_height = height;
uint16_t *ref_costsc = (uint16_t*)buf3;
uint16_t *ref_costsa = (uint16_t*)buf4;
int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + size);
uint16_t *ref_costsc = (uint16_t*)buf3 + width;
uint16_t *ref_costsa = (uint16_t*)buf4 + width;
int16_t (*mvs)[2] = (int16_t(*)[2])(ref_costsc + width + size);
int16_t *propagate_amount = (int16_t*)(mvs + width);
uint16_t *lowres_costs = (uint16_t*)(propagate_amount + width);
h.scratch_buffer2 = (uint8_t*)(ref_costsa + size);
h.scratch_buffer2 = (uint8_t*)(ref_costsa + width + size);
int bipred_weight = (rand()%63)+1;
int mb_y = rand()&3;
int list = i&1;
for( int j = 0; j < size; j++ )
for( int j = -width; j < size+width; j++ )
ref_costsc[j] = ref_costsa[j] = rand()&32767;
for( int j = 0; j < width; j++ )
{
@ -1741,18 +1785,18 @@ static int check_mc( int cpu_ref, int cpu_new )
lowres_costs[j] = list_dist[list][rand()&7] << LOWRES_COST_SHIFT;
}
call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
call_c1( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
call_a1( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
for( int j = 0; j < size && ok; j++ )
for( int j = -width; j < size+width && ok; j++ )
{
ok &= abs(ref_costsa[j] - ref_costsc[j]) <= 1;
if( !ok )
fprintf( stderr, "mbtree_propagate_list FAILED at %d: %d !~= %d\n", j, ref_costsc[j], ref_costsa[j] );
}
call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, 0, width, list );
call_c2( mc_c.mbtree_propagate_list, &h, ref_costsc, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
call_a2( mc_a.mbtree_propagate_list, &h, ref_costsa, mvs, propagate_amount, lowres_costs, bipred_weight, mb_y, width, list );
}
}
@ -1815,12 +1859,14 @@ static int check_mc( int cpu_ref, int cpu_new )
{
set_func_name( "memcpy_aligned" );
ok = 1; used_asm = 1;
for( size_t size = 16; size < 256; size += 16 )
for( size_t size = 16; size < 512; size += 16 )
{
memset( buf4, 0xAA, size + 1 );
for( int i = 0; i < size; i++ )
buf1[i] = rand();
memset( buf4-1, 0xAA, size + 2 );
call_c( mc_c.memcpy_aligned, buf3, buf1, size );
call_a( mc_a.memcpy_aligned, buf4, buf1, size );
if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
{
ok = 0;
fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", (int)size );
@ -1836,10 +1882,10 @@ static int check_mc( int cpu_ref, int cpu_new )
ok = 1; used_asm = 1;
for( size_t size = 128; size < 1024; size += 128 )
{
memset( buf4, 0xAA, size + 1 );
memset( buf4-1, 0xAA, size + 2 );
call_c( mc_c.memzero_aligned, buf3, size );
call_a( mc_a.memzero_aligned, buf4, size );
if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
if( memcmp( buf3, buf4, size ) || buf4[-1] != 0xAA || buf4[size] != 0xAA )
{
ok = 0;
fprintf( stderr, "memzero_aligned FAILED: size=%d\n", (int)size );
@ -1919,12 +1965,15 @@ static int check_deblock( int cpu_ref, int cpu_new )
if( db_a.deblock_strength != db_ref.deblock_strength )
{
set_func_name( "deblock_strength" );
used_asm = 1;
for( int i = 0; i < 100; i++ )
{
ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
ALIGNED_ARRAY_16( uint8_t, nnz_buf, [X264_SCAN8_SIZE+8] );
uint8_t *nnz = &nnz_buf[8];
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
ALIGNED_ARRAY_32( uint8_t, bs, [2],[2][8][4] );
memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
for( int j = 0; j < X264_SCAN8_SIZE; j++ )
nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
@ -1933,9 +1982,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
{
ref[j][k] = ((rand()&3) != 3) ? 0 : (rand() & 31) - 2;
for( int l = 0; l < 2; l++ )
mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&1023) - 512;
mv[j][k][l] = ((rand()&7) != 7) ? (rand()&7) - 3 : (rand()&16383) - 8192;
}
set_func_name( "deblock_strength" );
call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
@ -1968,11 +2016,11 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_quant_function_t qf_c;
x264_quant_function_t qf_ref;
x264_quant_function_t qf_a;
ALIGNED_ARRAY_N( dctcoef, dct1,[64] );
ALIGNED_ARRAY_N( dctcoef, dct2,[64] );
ALIGNED_ARRAY_N( dctcoef, dct3,[8],[16] );
ALIGNED_ARRAY_N( dctcoef, dct4,[8],[16] );
ALIGNED_ARRAY_N( uint8_t, cqm_buf,[64] );
ALIGNED_ARRAY_64( dctcoef, dct1,[64] );
ALIGNED_ARRAY_64( dctcoef, dct2,[64] );
ALIGNED_ARRAY_32( dctcoef, dct3,[8],[16] );
ALIGNED_ARRAY_32( dctcoef, dct4,[8],[16] );
ALIGNED_ARRAY_32( uint8_t, cqm_buf,[64] );
int ret = 0, ok, used_asm;
int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
x264_t h_buf;
@ -2213,7 +2261,7 @@ static int check_quant( int cpu_ref, int cpu_new )
int max = X264_MIN( i, PIXEL_MAX*16 ); \
for( int j = 0; j < size; j++ ) \
dct1[j] = rand()%(max*2+1) - max; \
for( int j = 0; i <= size; j += 4 ) \
for( int j = 0; j <= size; j += 4 ) \
qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
res_c = call_c1( qf_c.optname, dct1, dmf ); \
@ -2560,9 +2608,6 @@ DECL_CABAC(asm)
#endif
extern const uint8_t x264_count_cat_m1[14];
void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_8x8_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l );
static int check_cabac( int cpu_ref, int cpu_new )
{
@ -2577,6 +2622,11 @@ static int check_cabac( int cpu_ref, int cpu_new )
x264_quant_init( &h, cpu_new, &h.quantf );
h.quantf.coeff_last[DCT_CHROMA_DC] = h.quantf.coeff_last4;
/* Reset cabac state to avoid buffer overruns in do_bench() with large BENCH_RUNS values. */
#define GET_CB( i ) (\
x264_cabac_encode_init( &cb[i], bitstream[i], bitstream[i]+0xfff0 ),\
cb[i].f8_bits_encoded = 0, &cb[i] )
#define CABAC_RESIDUAL(name, start, end, rd)\
{\
if( bs_a.name##_internal && (bs_a.name##_internal != bs_ref.name##_internal || (cpu_new&X264_CPU_SSE2_IS_SLOW)) )\
@ -2589,7 +2639,7 @@ static int check_cabac( int cpu_ref, int cpu_new )
{\
for( int j = 0; j < 256; j++ )\
{\
ALIGNED_ARRAY_N( dctcoef, dct, [2],[64] );\
ALIGNED_ARRAY_64( dctcoef, dct, [2],[64] );\
uint8_t bitstream[2][1<<16];\
static const uint8_t ctx_ac[14] = {0,1,0,0,1,0,0,1,0,0,0,1,0,0};\
int ac = ctx_ac[ctx_block_cat];\
@ -2612,13 +2662,9 @@ static int check_cabac( int cpu_ref, int cpu_new )
x264_cabac_t cb[2];\
x264_cabac_context_init( &h, &cb[0], SLICE_TYPE_P, 26, 0 );\
x264_cabac_context_init( &h, &cb[1], SLICE_TYPE_P, 26, 0 );\
x264_cabac_encode_init( &cb[0], bitstream[0], bitstream[0]+0xfff0 );\
x264_cabac_encode_init( &cb[1], bitstream[1], bitstream[1]+0xfff0 );\
cb[0].f8_bits_encoded = 0;\
cb[1].f8_bits_encoded = 0;\
if( !rd ) memcpy( bitstream[1], bitstream[0], 0x400 );\
call_c1( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
call_c1( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
call_a1( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
ok = cb[0].f8_bits_encoded == cb[1].f8_bits_encoded && !memcmp(cb[0].state, cb[1].state, 1024);\
if( !rd ) ok |= !memcmp( bitstream[1], bitstream[0], 0x400 ) && !memcmp( &cb[1], &cb[0], offsetof(x264_cabac_t, p_start) );\
if( !ok )\
@ -2631,8 +2677,8 @@ static int check_cabac( int cpu_ref, int cpu_new )
}\
if( (j&15) == 0 )\
{\
call_c2( x264_##name##_c, &h, &cb[0], ctx_block_cat, dct[0]+ac );\
call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, &cb[1] );\
call_c2( x264_##name##_c, &h, GET_CB( 0 ), ctx_block_cat, dct[0]+ac );\
call_a2( bs_a.name##_internal, dct[1]+ac, i, ctx_block_cat, GET_CB( 1 ) );\
}\
}\
}\
@ -2759,6 +2805,14 @@ static int check_all_flags( void )
int ret = 0;
int cpu0 = 0, cpu1 = 0;
uint32_t cpu_detect = x264_cpu_detect();
#if ARCH_X86 || ARCH_X86_64
if( cpu_detect & X264_CPU_AVX512 )
simd_warmup_func = x264_checkasm_warmup_avx512;
else if( cpu_detect & X264_CPU_AVX )
simd_warmup_func = x264_checkasm_warmup_avx;
#endif
simd_warmup();
#if HAVE_MMX
if( cpu_detect & X264_CPU_MMX2 )
{
@ -2769,13 +2823,6 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "MMX SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_SSE )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE, "SSE" );
@ -2787,14 +2834,12 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSE2 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSE2 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
}
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE2 LZCNT" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
}
if( cpu_detect & X264_CPU_SSE3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
@ -2807,8 +2852,6 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_CACHELINE_64;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_SHUFFLE, "SSSE3 SlowShuffle" );
cpu1 &= ~X264_CPU_SLOW_SHUFFLE;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_CTZ, "SSSE3 SlowCTZ" );
cpu1 &= ~X264_CPU_SLOW_CTZ;
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SLOW_ATOM, "SSSE3 SlowAtom" );
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_64, "SSSE3 Cache64 SlowAtom" );
cpu1 &= ~X264_CPU_CACHELINE_64;
@ -2833,29 +2876,15 @@ static int check_all_flags( void )
cpu1 &= ~X264_CPU_FMA4;
}
if( cpu_detect & X264_CPU_FMA3 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3, "FMA3" );
cpu1 &= ~X264_CPU_FMA3;
}
if( cpu_detect & X264_CPU_AVX2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_FMA3 | X264_CPU_AVX2, "AVX2" );
if( cpu_detect & X264_CPU_LZCNT )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "AVX2 LZCNT" );
cpu1 &= ~X264_CPU_LZCNT;
}
}
if( cpu_detect & X264_CPU_BMI1 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1, "BMI1" );
cpu1 &= ~X264_CPU_BMI1;
}
if( cpu_detect & X264_CPU_BMI2 )
{
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI1|X264_CPU_BMI2, "BMI2" );
cpu1 &= ~(X264_CPU_BMI1|X264_CPU_BMI2);
}
ret |= add_flags( &cpu0, &cpu1, X264_CPU_BMI2, "BMI2" );
if( cpu_detect & X264_CPU_AVX2 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX2, "AVX2" );
if( cpu_detect & X264_CPU_AVX512 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_AVX512, "AVX512" );
#elif ARCH_PPC
if( cpu_detect & X264_CPU_ALTIVEC )
{
@ -2885,8 +2914,6 @@ static int check_all_flags( void )
int main(int argc, char *argv[])
{
int ret = 0;
#ifdef _WIN32
/* Disable the Windows Error Reporting dialog */
SetErrorMode( SEM_NOGPFAULTERRORBOX );
@ -2912,8 +2939,8 @@ int main(int argc, char *argv[])
fprintf( stderr, "x264: using random seed %u\n", seed );
srand( seed );
buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 32*BENCH_ALIGNS );
pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 32*BENCH_ALIGNS );
buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) );
pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) );
if( !buf1 || !pbuf1 )
{
fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
@ -2934,21 +2961,7 @@ int main(int argc, char *argv[])
}
memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
/* 32-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
if( do_bench )
for( int i = 0; i < BENCH_ALIGNS && !ret; i++ )
{
INIT_POINTER_OFFSETS;
ret |= x264_stack_pagealign( check_all_flags, i*32 );
buf1 += 32;
pbuf1 += 32;
quiet = 1;
fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
}
else
ret = x264_stack_pagealign( check_all_flags, 0 );
if( ret )
if( x264_stack_pagealign( check_all_flags, 0 ) )
{
fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
return -1;

@ -63,7 +63,7 @@ while (@ARGV) {
$force_thumb = 1;
} elsif ($opt eq "-arch") {
$arch = shift;
die "unknown arch: '$arch'\n" if not exists $comments{$arch};
die "unknown arch: '$arch'\n" if not exists $canonical_arch{$arch};
} elsif ($opt eq "-as-type") {
$as_type = shift;
die "unknown as type: '$as_type'\n" if $as_type !~ /^((apple-)?(gas|clang)|armasm)$/;
@ -429,7 +429,7 @@ sub parse_line {
sub handle_set {
my $line = $_[0];
if ($line =~ /\.set\s+(.*),\s*(.*)/) {
if ($line =~ /\.(?:set|equ)\s+(\S*)\s*,\s*(.*)/) {
$symbols{$1} = eval_expr($2);
return 1;
}
@ -874,7 +874,7 @@ sub handle_serialized_line {
# Don't interpret e.g. bic as b<cc> with ic as conditional code
if ($cond !~ /|$arm_cond_codes/) {
# Not actually a branch
} elsif ($target =~ /(\d+)([bf])/) {
} elsif ($target =~ /^(\d+)([bf])$/) {
# The target is a local label
$line = handle_local_label($line, $1, $2);
$line =~ s/\b$instr\b/$&.w/ if $width eq "";
@ -888,12 +888,12 @@ sub handle_serialized_line {
}
# ALIGN in armasm syntax is the actual number of bytes
if ($line =~ /\.align\s+(\d+)/) {
if ($line =~ /\.(?:p2)?align\s+(\d+)/) {
my $align = 1 << $1;
$line =~ s/\.align\s(\d+)/ALIGN $align/;
$line =~ s/\.(?:p2)?align\s(\d+)/ALIGN $align/;
}
# Convert gas style [r0, :128] into armasm [r0@128] alignment specification
$line =~ s/\[([^\[]+),\s*:(\d+)\]/[$1\@$2]/g;
$line =~ s/\[([^\[,]+),?\s*:(\d+)\]/[$1\@$2]/g;
# armasm treats logical values {TRUE} and {FALSE} separately from
# numeric values - logical operators and values can't be intermixed
@ -930,7 +930,7 @@ sub handle_serialized_line {
# Misc bugs/deficiencies:
# armasm seems unable to parse e.g. "vmov s0, s1" without a type
# qualifier, thus add .f32.
$line =~ s/^(\s+(?:vmov|vadd))(\s+s)/$1.f32$2/;
$line =~ s/^(\s+(?:vmov|vadd))(\s+s\d+\s*,\s*s\d+)/$1.f32$2/;
# armasm is unable to parse &0x - add spacing
$line =~ s/&0x/& 0x/g;
}
@ -939,16 +939,31 @@ sub handle_serialized_line {
# Convert register post indexing to a separate add instruction.
# This converts e.g. "ldr r0, [r1], r2" into "ldr r0, [r1]",
# "add r1, r1, r2".
$line =~ s/(ldr|str)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g;
$line =~ s/((?:ldr|str)[bh]?)\s+(\w+),\s*\[(\w+)\],\s*(\w+)/$1 $2, [$3]\n\tadd $3, $3, $4/g;
# Convert "mov pc, lr" into "bx lr", since the former only works
# for switching from arm to thumb (and only in armv7), but not
# from thumb to arm.
s/mov\s*pc\s*,\s*lr/bx lr/g;
# Convert stmdb/ldmia with only one register into a plain str/ldr with post-increment/decrement
$line =~ s/stmdb\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g;
$line =~ s/ldmia\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g;
# Convert stmdb/ldmia/stmfd/ldmfd/ldm with only one register into a plain str/ldr with post-increment/decrement.
# Wide thumb2 encoding requires at least two registers in register list while all other encodings support one register too.
$line =~ s/stm(?:db|fd)\s+sp!\s*,\s*\{([^,-]+)\}/str $1, [sp, #-4]!/g;
$line =~ s/ldm(?:ia|fd)?\s+sp!\s*,\s*\{([^,-]+)\}/ldr $1, [sp], #4/g;
# Convert muls into mul+cmp
$line =~ s/muls\s+(\w+),\s*(\w+)\,\s*(\w+)/mul $1, $2, $3\n\tcmp $1, #0/g;
# Convert "and r0, sp, #xx" into "mov r0, sp", "and r0, r0, #xx"
$line =~ s/and\s+(\w+),\s*(sp|r13)\,\s*#(\w+)/mov $1, $2\n\tand $1, $1, #$3/g;
# Convert "ldr r0, [r0, r1, lsl #6]" where the shift is >3 (which
# can't be handled in thumb) into "add r0, r0, r1, lsl #6",
# "ldr r0, [r0]", for the special case where the same address is
# used as base and target for the ldr.
if ($line =~ /(ldr[bh]?)\s+(\w+),\s*\[\2,\s*(\w+),\s*lsl\s*#(\w+)\]/ and $4 > 3) {
$line =~ s/(ldr[bh]?)\s+(\w+),\s*\[\2,\s*(\w+),\s*lsl\s*#(\w+)\]/add $2, $2, $3, lsl #$4\n\t$1 $2, [$2]/;
}
$line =~ s/\.arm/.thumb/x;
}
@ -978,6 +993,9 @@ sub handle_serialized_line {
$line =~ s/\.int/.long/x;
$line =~ s/\.float/.single/x;
}
if ($as_type eq "apple-gas") {
$line =~ s/vmrs\s+APSR_nzcv/fmrx r15/x;
}
if ($as_type eq "armasm") {
$line =~ s/\.global/EXPORT/x;
$line =~ s/\.int/dcd/x;
@ -986,11 +1004,15 @@ sub handle_serialized_line {
$line =~ s/\.word/dcd/x;
$line =~ s/\.short/dcw/x;
$line =~ s/\.byte/dcb/x;
$line =~ s/\.quad/dcq/x;
$line =~ s/\.ascii/dcb/x;
$line =~ s/\.asciz(.*)$/dcb\1,0/x;
$line =~ s/\.thumb/THUMB/x;
$line =~ s/\.arm/ARM/x;
# The alignment in AREA is the power of two, just as .align in gas
$line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=2, CODEALIGN/;
$line =~ s/\.text/AREA |.text|, CODE, READONLY, ALIGN=4, CODEALIGN/;
$line =~ s/(\s*)(.*)\.rodata/$1AREA |.rodata|, DATA, READONLY, ALIGN=5/;
$line =~ s/\.data/AREA |.data|, DATA, ALIGN=5/;
$line =~ s/fmxr/vmsr/;
$line =~ s/fmrx/vmrs/;

@ -23,6 +23,12 @@ if command -v cygpath >/dev/null 2>&1 ; then
IFS='
'
deps="$(cygpath -u -- $deps)"
elif grep -q 'Microsoft' /proc/sys/kernel/osrelease 2>/dev/null ; then
# Running under WSL. We don't have access to cygpath but since the Windows
# file system resides under "/mnt/<drive_letter>/" we can simply replace
# "C:" with "/mnt/c". This command uses a GNU extension to sed but that's
# available on WSL so we don't need to limit ourselves by what POSIX says.
deps="$(printf '%s' "$deps" | sed 's/^\([a-zA-Z]\):/\/mnt\/\L\1/')"
fi
# Escape characters as required to create valid Makefile file names
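As an aside, a small C sketch of the same drive-letter mapping the sed command above performs ("C:/..." becomes "/mnt/c/..."); the helper name is hypothetical and this is not part of the script.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* Rewrite a Windows-style path prefix to its WSL mount point. */
static void win_to_wsl_path( const char *in, char *out, size_t out_size )
{
    if( isalpha( (unsigned char)in[0] ) && in[1] == ':' )
        snprintf( out, out_size, "/mnt/%c%s", tolower( (unsigned char)in[0] ), in + 2 );
    else
        snprintf( out, out_size, "%s", in );
}

int main( void )
{
    char buf[256];
    win_to_wsl_path( "C:/src/x264/common/pixel.c", buf, sizeof(buf) );
    printf( "%s\n", buf ); /* prints /mnt/c/src/x264/common/pixel.c */
    return 0;
}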

@ -420,47 +420,47 @@ static char *stringify_names( char *buf, const char * const names[] )
return buf;
}
#define INDENT "                                "
#define INDENT_LEN 32 // strlen( INDENT )
#define SEPARATOR ", "
#define SEPARATOR_LEN 2 // strlen( SEPARATOR )
static void print_csp_name_internal( const char *name, size_t *line_len, int last )
{
if( name )
{
size_t name_len = strlen( name );
if( *line_len + name_len > (80 - SEPARATOR_LEN) )
{
printf( "\n" INDENT );
*line_len = INDENT_LEN;
}
printf( "%s", name );
*line_len += name_len;
if( !last )
{
printf( SEPARATOR );
*line_len += SEPARATOR_LEN;
}
}
}
static void print_csp_names( int longhelp )
{
if( longhelp < 2 )
return;
# define INDENT " "
printf( " - valid csps for `raw' demuxer:\n" );
printf( INDENT );
size_t line_len = INDENT_LEN;
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
if( x264_cli_csps[i].name )
{
printf( "%s", x264_cli_csps[i].name );
if( i+1 < X264_CSP_CLI_MAX )
printf( ", " );
}
}
print_csp_name_internal( x264_cli_csps[i].name, &line_len, i == X264_CSP_CLI_MAX-1 );
#if HAVE_LAVF
printf( "\n" );
printf( " - valid csps for `lavf' demuxer:\n" );
printf( INDENT );
size_t line_len = strlen( INDENT );
line_len = INDENT_LEN;
for( enum AVPixelFormat i = AV_PIX_FMT_NONE+1; i < AV_PIX_FMT_NB; i++ )
{
const char *pfname = av_get_pix_fmt_name( i );
if( pfname )
{
size_t name_len = strlen( pfname );
if( line_len + name_len > (80 - strlen( ", " )) )
{
printf( "\n" INDENT );
line_len = strlen( INDENT );
}
printf( "%s", pfname );
line_len += name_len;
if( i+1 < AV_PIX_FMT_NB )
{
printf( ", " );
line_len += 2;
}
}
}
print_csp_name_internal( av_get_pix_fmt_name( i ), &line_len, i == AV_PIX_FMT_NB-1 );
#endif
printf( "\n" );
}
@ -636,7 +636,7 @@ static void help( x264_param_t *defaults, int longhelp )
" - grain (psy tuning):\n"
" --aq-strength 0.5 --no-dct-decimate\n"
" --deadzone-inter 6 --deadzone-intra 6\n"
" --deblock -2:-2 --ipratio 1.1 \n"
" --deblock -2:-2 --ipratio 1.1\n"
" --pbratio 1.1 --psy-rd <unset>:0.25\n"
" --qcomp 0.8\n"
" - stillimage (psy tuning):\n"

@ -45,7 +45,7 @@ extern "C" {
#include "x264_config.h"
#define X264_BUILD 148
#define X264_BUILD 152
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
@ -119,39 +119,38 @@ typedef struct x264_nal_t
/* CPU flags */
/* x86 */
#define X264_CPU_CMOV 0x0000001
#define X264_CPU_MMX 0x0000002
#define X264_CPU_MMX2 0x0000004 /* MMX2 aka MMXEXT aka ISSE */
#define X264_CPU_MMX (1<<0)
#define X264_CPU_MMX2 (1<<1) /* MMX2 aka MMXEXT aka ISSE */
#define X264_CPU_MMXEXT X264_CPU_MMX2
#define X264_CPU_SSE 0x0000008
#define X264_CPU_SSE2 0x0000010
#define X264_CPU_SSE3 0x0000020
#define X264_CPU_SSSE3 0x0000040
#define X264_CPU_SSE4 0x0000080 /* SSE4.1 */
#define X264_CPU_SSE42 0x0000100 /* SSE4.2 */
#define X264_CPU_LZCNT 0x0000200 /* Phenom support for "leading zero count" instruction. */
#define X264_CPU_AVX 0x0000400 /* AVX support: requires OS support even if YMM registers aren't used. */
#define X264_CPU_XOP 0x0000800 /* AMD XOP */
#define X264_CPU_FMA4 0x0001000 /* AMD FMA4 */
#define X264_CPU_FMA3 0x0002000 /* FMA3 */
#define X264_CPU_AVX2 0x0004000 /* AVX2 */
#define X264_CPU_BMI1 0x0008000 /* BMI1 */
#define X264_CPU_BMI2 0x0010000 /* BMI2 */
#define X264_CPU_SSE (1<<2)
#define X264_CPU_SSE2 (1<<3)
#define X264_CPU_LZCNT (1<<4)
#define X264_CPU_SSE3 (1<<5)
#define X264_CPU_SSSE3 (1<<6)
#define X264_CPU_SSE4 (1<<7) /* SSE4.1 */
#define X264_CPU_SSE42 (1<<8) /* SSE4.2 */
#define X264_CPU_AVX (1<<9) /* Requires OS support even if YMM registers aren't used */
#define X264_CPU_XOP (1<<10) /* AMD XOP */
#define X264_CPU_FMA4 (1<<11) /* AMD FMA4 */
#define X264_CPU_FMA3 (1<<12)
#define X264_CPU_BMI1 (1<<13)
#define X264_CPU_BMI2 (1<<14)
#define X264_CPU_AVX2 (1<<15)
#define X264_CPU_AVX512 (1<<16) /* AVX-512 {F, CD, BW, DQ, VL}, requires OS support */
/* x86 modifiers */
#define X264_CPU_CACHELINE_32 0x0020000 /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_64 0x0040000 /* 32/64 is the size of a cacheline in bytes */
#define X264_CPU_SSE2_IS_SLOW 0x0080000 /* avoid most SSE2 functions on Athlon64 */
#define X264_CPU_SSE2_IS_FAST 0x0100000 /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SLOW_SHUFFLE 0x0200000 /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
#define X264_CPU_STACK_MOD4 0x0400000 /* if stack is only mod4 and not mod16 */
#define X264_CPU_SLOW_CTZ 0x0800000 /* BSR/BSF x86 instructions are really slow on some CPUs */
#define X264_CPU_SLOW_ATOM 0x1000000 /* The Atom is terrible: slow SSE unaligned loads, slow
#define X264_CPU_CACHELINE_32 (1<<17) /* avoid memory loads that span the border between two cachelines */
#define X264_CPU_CACHELINE_64 (1<<18) /* 32/64 is the size of a cacheline in bytes */
#define X264_CPU_SSE2_IS_SLOW (1<<19) /* avoid most SSE2 functions on Athlon64 */
#define X264_CPU_SSE2_IS_FAST (1<<20) /* a few functions are only faster on Core2 and Phenom */
#define X264_CPU_SLOW_SHUFFLE (1<<21) /* The Conroe has a slow shuffle unit (relative to overall SSE performance) */
#define X264_CPU_STACK_MOD4 (1<<22) /* if stack is only mod4 and not mod16 */
#define X264_CPU_SLOW_ATOM (1<<23) /* The Atom is terrible: slow SSE unaligned loads, slow
* SIMD multiplies, slow SIMD variable shifts, slow pshufb,
* cacheline split penalties -- gather everything here that
* isn't shared by other CPUs to avoid making half a dozen
* new SLOW flags. */
#define X264_CPU_SLOW_PSHUFB 0x2000000 /* such as on the Intel Atom */
#define X264_CPU_SLOW_PALIGNR 0x4000000 /* such as on the AMD Bobcat */
#define X264_CPU_SLOW_PSHUFB (1<<24) /* such as on the Intel Atom */
#define X264_CPU_SLOW_PALIGNR (1<<25) /* such as on the AMD Bobcat */
/* PowerPC */
#define X264_CPU_ALTIVEC 0x0000001
@ -227,13 +226,15 @@ static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
#define X264_CSP_I422 0x0005 /* yuv 4:2:2 planar */
#define X264_CSP_YV16 0x0006 /* yvu 4:2:2 planar */
#define X264_CSP_NV16 0x0007 /* yuv 4:2:2, with one y plane and one packed u+v */
#define X264_CSP_V210 0x0008 /* 10-bit yuv 4:2:2 packed in 32 */
#define X264_CSP_I444 0x0009 /* yuv 4:4:4 planar */
#define X264_CSP_YV24 0x000a /* yvu 4:4:4 planar */
#define X264_CSP_BGR 0x000b /* packed bgr 24bits */
#define X264_CSP_BGRA 0x000c /* packed bgr 32bits */
#define X264_CSP_RGB 0x000d /* packed rgb 24bits */
#define X264_CSP_MAX 0x000e /* end of list */
#define X264_CSP_YUYV 0x0008 /* yuyv 4:2:2 packed */
#define X264_CSP_UYVY 0x0009 /* uyvy 4:2:2 packed */
#define X264_CSP_V210 0x000a /* 10-bit yuv 4:2:2 packed in 32 */
#define X264_CSP_I444 0x000b /* yuv 4:4:4 planar */
#define X264_CSP_YV24 0x000c /* yvu 4:4:4 planar */
#define X264_CSP_BGR 0x000d /* packed bgr 24bits */
#define X264_CSP_BGRA 0x000e /* packed bgr 32bits */
#define X264_CSP_RGB 0x000f /* packed rgb 24bits */
#define X264_CSP_MAX 0x0010 /* end of list */
#define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */
@ -563,19 +564,19 @@ void x264_nal_encode( x264_t *h, uint8_t *dst, x264_nal_t *nal );
typedef struct x264_level_t
{
int level_idc;
int mbps; /* max macroblock processing rate (macroblocks/sec) */
int frame_size; /* max frame size (macroblocks) */
int dpb; /* max decoded picture buffer (mbs) */
int bitrate; /* max bitrate (kbit/sec) */
int cpb; /* max vbv buffer (kbit) */
int mv_range; /* max vertical mv component range (pixels) */
int mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
int slice_rate; /* ?? */
int mincr; /* min compression ratio */
int bipred8x8; /* limit bipred to >=8x8 */
int direct8x8; /* limit b_direct to >=8x8 */
int frame_only; /* forbid interlacing */
uint8_t level_idc;
uint32_t mbps; /* max macroblock processing rate (macroblocks/sec) */
uint32_t frame_size; /* max frame size (macroblocks) */
uint32_t dpb; /* max decoded picture buffer (mbs) */
uint32_t bitrate; /* max bitrate (kbit/sec) */
uint32_t cpb; /* max vbv buffer (kbit) */
uint16_t mv_range; /* max vertical mv component range (pixels) */
uint8_t mvs_per_2mb; /* max mvs per 2 consecutive mbs. */
uint8_t slice_rate; /* ?? */
uint8_t mincr; /* min compression ratio */
uint8_t bipred8x8; /* limit bipred to >=8x8 */
uint8_t direct8x8; /* limit b_direct to >=8x8 */
uint8_t frame_only; /* forbid interlacing */
} x264_level_t;
/* all of the levels defined in the standard, terminated by .level_idc=0 */

@ -3,4 +3,4 @@
#define X264_INTERLACED 1
#define X264_CHROMA_FORMAT 0
#define X264_VERSION ""
#define X264_POINTVER "0.148.x"
#define X264_POINTVER "0.152.x"
