#if defined(__linux__) && defined(__ELF__)
|
|
# Emit an empty .note.GNU-stack section (no flags, so not executable).
# GNU toolchains read this as: the object does not need an executable stack.
# Only done for Linux ELF builds, as selected by the guard above.
.section .note.GNU-stack,"",%progbits
|
|
#endif
|
|
|
|
#if defined(__x86_64__)
|
|
|
|
.global s16_mix_in_avx2

.text

#
# s16_mix_in_avx2 - saturating in-place mix of signed 16-bit samples
#
#   dst[i] = saturate_s16(dst[i] + src[i])   for every i in [0, count)
#
# Syntax: GAS, AT&T operand order.
#
# Registers read on entry (this matches the System V AMD64 order for
# three integer arguments; confirm against the C prototype):
#   rdi = dst    buffer that is read and overwritten
#   rsi = src    buffer that is only read
#   rdx = count  number of 16-bit samples, compared as a signed value,
#                so a zero or negative count leaves dst untouched
#
# Neither buffer needs any particular alignment (unaligned loads/stores).
# Returns:  no meaningful value; rax is used as scratch.
# Clobbers: rax, rcx, r8, r9, ymm0, ymm1, flags.
#
# One 256-bit register holds 16 samples, so the bulk of the buffer is
# processed 16 samples per iteration and the last (count % 16) samples
# are processed one at a time. Every vector instruction is VEX-encoded
# and the upper YMM halves are cleared before returning, so neither the
# tail loop nor the caller pays an AVX/SSE transition penalty.
#
        .p2align 4
s16_mix_in_avx2:
        mov     %rdx, %rax
        and     $-16, %rax                      # rax = count rounded down to a multiple of 16
        xor     %ecx, %ecx                      # rcx = index of the next sample to mix

.Ls16_mix_vec_loop:
        cmp     %rax, %rcx
        jge     .Ls16_mix_tail                  # signed on purpose: negative counts do nothing
        vmovdqu (%rdi,%rcx,2), %ymm0            # 16 samples of dst (index scaled by 2 bytes)
        vpaddsw (%rsi,%rcx,2), %ymm0, %ymm0     # saturating add of 16 samples of src
        vmovdqu %ymm0, (%rdi,%rcx,2)
        add     $16, %rcx                       # advance by one full vector of samples
        jmp     .Ls16_mix_vec_loop

.Ls16_mix_tail:
        cmp     %rdx, %rcx
        jge     .Ls16_mix_done
        movzwl  (%rsi,%rcx,2), %r8d             # zero-extending loads: no partial-register merge
        movzwl  (%rdi,%rcx,2), %r9d
        vmovd   %r8d, %xmm0                     # sample sits in word lane 0, other lanes are zero
        vmovd   %r9d, %xmm1
        vpaddsw %xmm0, %xmm1, %xmm1             # reuse the vector saturating add for one sample
        vmovd   %xmm1, %r8d
        mov     %r8w, (%rdi,%rcx,2)             # store only the low 16 bits (lane 0)
        inc     %rcx
        jmp     .Ls16_mix_tail

.Ls16_mix_done:
        vzeroupper                              # leave the upper YMM state clean for SSE callers
        ret
|
|
|
|
#endif
|