diff --git a/daemon/.gitignore b/daemon/.gitignore index fb20bc43d..54e70e7b2 100644 --- a/daemon/.gitignore +++ b/daemon/.gitignore @@ -23,3 +23,6 @@ spandsp_logging.h mvr2s_x64_avx512.S mvr2s_x64_avx2.S mix_buffer.c +mix_in_x64_avx2.S +mix_in_x64_avx512bw.S +mix_in_x64_sse2.S diff --git a/daemon/Makefile b/daemon/Makefile index d3edef69a..0a83827f3 100644 --- a/daemon/Makefile +++ b/daemon/Makefile @@ -87,7 +87,7 @@ SRCS= main.c kernel.c poller.c helpers.c control_tcp.c call.c control_udp.c red LIBSRCS= loglib.c auxlib.c rtplib.c str.c socket.c streambuf.c ssllib.c dtmflib.c mix_buffer.c ifeq ($(with_transcoding),yes) LIBSRCS+= codeclib.strhash.c resample.c -LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S +LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S endif OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o) diff --git a/lib/mix_buffer.c b/lib/mix_buffer.c index 776acb863..ba1198de2 100644 --- a/lib/mix_buffer.c +++ b/lib/mix_buffer.c @@ -23,6 +23,20 @@ struct mix_buffer_ssrc_source { }; + +#if defined(__x86_64__) +// mix_in_x64_sse2.S +mix_in_fn_t s16_mix_in_sse2; + +// mix_in_x64_avx2.S +mix_in_fn_t s16_mix_in_avx2; + +// mix_in_x64_avx512bw.S +mix_in_fn_t s16_mix_in_avx512; +#endif + + + static void s16_mix_in_c(void *restrict dst, const void *restrict src, unsigned int samples) { int16_t *d = dst; const int16_t *s = src; @@ -39,17 +53,28 @@ static void s16_mix_in_c(void *restrict dst, const void *restrict src, unsigned } +#ifndef ASAN_BUILD static mix_in_fn_t *resolve_s16_mix_in(void) { +#if defined(__x86_64__) + if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_AVX512BW)) + return s16_mix_in_avx512; + if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_AVX2)) + return s16_mix_in_avx2; + if (rtpe_has_cpu_flag(RTPE_CPU_FLAG_SSE2)) + return s16_mix_in_sse2; +#endif return s16_mix_in_c; } static mix_in_fn_t s16_mix_in __attribute__ ((ifunc ("resolve_s16_mix_in"))); +#else +#define s16_mix_in s16_mix_in_c +#endif const struct mix_buffer_impl 
impl_s16_c = { .sample_size = sizeof(int16_t), .mix_in = s16_mix_in, }; -// TODO: SIMD-accelerated implementations // must be locked already diff --git a/lib/mix_in_x64_avx2.S b/lib/mix_in_x64_avx2.S new file mode 100644 index 000000000..e994081dd --- /dev/null +++ b/lib/mix_in_x64_avx2.S @@ -0,0 +1,41 @@ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__x86_64__) + +.global s16_mix_in_avx2 + +.text + +# 16 bits in 256 bits = 16 samples at a time +s16_mix_in_avx2: + mov %edx, %eax # sample count is a 32-bit unsigned int: zero-extend, upper half of %rdx is undefined + and $-16, %al # 16 samples at a time + xor %rcx, %rcx +loop: + cmp %rax, %rcx + jae remainder # unsigned compare + vmovdqu (%rdi,%rcx,2), %ymm0 # 16-bit size + vpaddsw (%rsi,%rcx,2), %ymm0, %ymm1 + vmovdqu %ymm1, (%rdi,%rcx,2) # 16-bit size + add $16, %rcx # 16 samples at a time + jmp loop +remainder: + xor %r8, %r8 + xor %r9, %r9 + cmp %edx, %ecx # compare only the defined low 32 bits of the count + jae done # unsigned compare + mov (%rsi,%rcx,2), %r8w # 16-bit size + mov (%rdi,%rcx,2), %r9w # 16-bit size + movd %r8, %xmm0 + movd %r9, %xmm1 + paddsw %xmm0, %xmm1 + movd %xmm1, %r8 + mov %r8w, (%rdi,%rcx,2) # 16-bit size + inc %rcx + jmp remainder +done: + ret + +#endif diff --git a/lib/mix_in_x64_avx512bw.S b/lib/mix_in_x64_avx512bw.S new file mode 100644 index 000000000..9fd291aa0 --- /dev/null +++ b/lib/mix_in_x64_avx512bw.S @@ -0,0 +1,41 @@ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__x86_64__) + +.global s16_mix_in_avx512 + +.text + +# 16 bits in 512 bits = 32 samples at a time +s16_mix_in_avx512: + mov %edx, %eax # sample count is a 32-bit unsigned int: zero-extend, upper half of %rdx is undefined + and $-32, %al # 32 samples at a time + xor %rcx, %rcx +loop: + cmp %rax, %rcx + jae remainder # unsigned compare + vmovdqu16 (%rdi,%rcx,2), %zmm0 # 16-bit size + vpaddsw (%rsi,%rcx,2), %zmm0, %zmm1 + vmovdqu16 %zmm1, (%rdi,%rcx,2) # 16-bit size + add $32, %rcx # 32 samples at a time + jmp loop +remainder: + xor %r8, %r8 + xor %r9, %r9 + cmp %edx, %ecx # compare only the defined low 32 bits of the count + jae done # unsigned compare + mov (%rsi,%rcx,2), %r8w # 16-bit size + mov (%rdi,%rcx,2), %r9w # 16-bit size + movd %r8, %xmm0 
+ movd %r9, %xmm1 + paddsw %xmm0, %xmm1 + movd %xmm1, %r8 + mov %r8w, (%rdi,%rcx,2) # 16-bit size + inc %rcx + jmp remainder +done: + ret + +#endif diff --git a/lib/mix_in_x64_sse2.S b/lib/mix_in_x64_sse2.S new file mode 100644 index 000000000..8d1984aa2 --- /dev/null +++ b/lib/mix_in_x64_sse2.S @@ -0,0 +1,42 @@ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#if defined(__x86_64__) + +.global s16_mix_in_sse2 + +.text + +# 16 bits in 128 bits = 8 samples at a time +s16_mix_in_sse2: + mov %edx, %eax # sample count is a 32-bit unsigned int: zero-extend, upper half of %rdx is undefined + and $-8, %al # 8 samples at a time + xor %rcx, %rcx +loop: + cmp %rax, %rcx + jae remainder # unsigned compare + movdqu (%rdi,%rcx,2), %xmm0 # 16-bit size + movdqu (%rsi,%rcx,2), %xmm1 # 16-bit size + paddsw %xmm0, %xmm1 + movdqu %xmm1, (%rdi,%rcx,2) # 16-bit size + add $8, %rcx # 8 samples at a time + jmp loop +remainder: + xor %r8, %r8 + xor %r9, %r9 + cmp %edx, %ecx # compare only the defined low 32 bits of the count + jae done # unsigned compare + mov (%rsi,%rcx,2), %r8w # 16-bit size + mov (%rdi,%rcx,2), %r9w # 16-bit size + movd %r8, %xmm0 + movd %r9, %xmm1 + paddsw %xmm0, %xmm1 + movd %xmm1, %r8 + mov %r8w, (%rdi,%rcx,2) # 16-bit size + inc %rcx + jmp remainder +done: + ret + +#endif diff --git a/recording-daemon/.gitignore b/recording-daemon/.gitignore index 5b276a259..535dad826 100644 --- a/recording-daemon/.gitignore +++ b/recording-daemon/.gitignore @@ -20,3 +20,6 @@ dtmflib.c *.8 mvr2s_x64_avx512.S mvr2s_x64_avx2.S +mix_in_x64_avx2.S +mix_in_x64_avx512bw.S +mix_in_x64_sse2.S diff --git a/recording-daemon/Makefile b/recording-daemon/Makefile index ff633bbed..4082e7989 100644 --- a/recording-daemon/Makefile +++ b/recording-daemon/Makefile @@ -36,7 +36,7 @@ SRCS= epoll.c garbage.c inotify.c main.c metafile.c stream.c recaux.c packet.c decoder.c output.c mix.c db.c log.c forward.c tag.c poller.c notify.c LIBSRCS= loglib.c auxlib.c rtplib.c codeclib.strhash.c resample.c str.c socket.c streambuf.c ssllib.c \ dtmflib.c -LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S +LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S 
mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(LIBASM:.S=.o) MDS= rtpengine-recording.ronn diff --git a/t/.gitignore b/t/.gitignore index 90c824923..5fab6dab6 100644 --- a/t/.gitignore +++ b/t/.gitignore @@ -79,3 +79,6 @@ mvr2s_x64_avx512.S test-mix-buffer mix_buffer.c audio_player.c +mix_in_x64_avx2.S +mix_in_x64_avx512bw.S +mix_in_x64_sse2.S diff --git a/t/Makefile b/t/Makefile index 47deaff85..790692a21 100644 --- a/t/Makefile +++ b/t/Makefile @@ -82,7 +82,7 @@ DAEMONSRCS+= codec.c call.c ice.c kernel.c media_socket.c stun.c bencode.c polle media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c \ audio_player.c HASHSRCS+= call_interfaces.c control_ng.c sdp.c janus.c -LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S +LIBASM= mvr2s_x64_avx2.S mvr2s_x64_avx512.S mix_in_x64_avx2.S mix_in_x64_avx512bw.S mix_in_x64_sse2.S endif OBJS= $(SRCS:.c=.o) $(LIBSRCS:.c=.o) $(DAEMONSRCS:.c=.o) $(HASHSRCS:.c=.strhash.o) $(LIBASM:.S=.o) @@ -258,7 +258,9 @@ daemon-tests-audio-player-play-media: daemon-test-deps test-bitstr: test-bitstr.o -test-mix-buffer: test-mix-buffer.o $(COMMONOBJS) mix_buffer.o ssrc.o rtp.o crypto.o helpers.o +test-mix-buffer: test-mix-buffer.o $(COMMONOBJS) mix_buffer.o ssrc.o rtp.o crypto.o helpers.o \ + mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o codeclib.strhash.o dtmflib.o \ + mvr2s_x64_avx2.o mvr2s_x64_avx512.o resample.o spandsp_send_fax_pcm: spandsp_send_fax_pcm.o @@ -270,7 +272,7 @@ spandsp_recv_fax_t38: spandsp_recv_fax_t38.o spandsp_raw_fax_tests: spandsp_send_fax_pcm spandsp_recv_fax_pcm spandsp_send_fax_t38 spandsp_recv_fax_t38 -test-amr-decode: test-amr-decode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o +test-amr-decode: test-amr-decode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o resample.o test-amr-encode: test-amr-encode-test.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o @@ -286,7 +288,8 @@ test-stats: test-stats.o 
$(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssr control_ng.strhash.o graphite.o \ streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \ media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o \ - websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o + websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o \ + mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o test-transcode: test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssrc.o call.o ice.o helpers.o \ kernel.o media_socket.o stun.o bencode.o socket.o poller.o dtls.o recording.o statistics.o \ @@ -294,7 +297,8 @@ test-transcode: test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o cod control_ng.strhash.o \ streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \ media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o websocket.o \ - cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o + cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o \ + mix_in_x64_avx2.o mix_in_x64_sse2.o mix_in_x64_avx512bw.o test-resample: test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o mvr2s_x64_avx2.o \ mvr2s_x64_avx512.o