diff --git a/README.md b/README.md index cd176c4c2..e611cc815 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ the following additional features are available: - Injection of DTMF events or PCM DTMF tones into running audio streams - Playback of pre-recorded streams/announcements - Transcoding between T.38 and PCM (G.711 or other audio codecs) +- Silence detection and comfort noise (RFC 3389) payloads *Rtpengine* does not (yet) support: diff --git a/daemon/codec.c b/daemon/codec.c index 8bb266a35..fc49c954b 100644 --- a/daemon/codec.c +++ b/daemon/codec.c @@ -85,6 +85,11 @@ struct dtx_entry { void *ssrc_ptr; // opaque pointer, doesn't hold a reference }; +struct silence_event { /* one contiguous run of silent samples, kept in codec_ssrc_handler.silence_events */ + uint64_t start; /* PTS of the first silent sample (frame PTS plus sample index) */ + uint64_t end; /* PTS of the first non-silent sample that follows; zero while the run is still ongoing */ +}; + struct codec_ssrc_handler { struct ssrc_entry h; // must be first struct codec_handler *handler; @@ -109,6 +114,9 @@ struct codec_ssrc_handler { GQueue dtmf_events; struct dtmf_event dtmf_event; + // silence detection + GQueue silence_events; + uint64_t skip_pts; int rtp_mark:1; @@ -1169,6 +1177,7 @@ void codec_handlers_update(struct call_media *receiver, struct call_media *sink, struct rtp_payload_type *dtmf_pt = NULL; struct rtp_payload_type *reverse_dtmf_pt = NULL; int dtmf_pt_match = __supp_codec_match(receiver, sink, dtmf_payload_type, &dtmf_pt, &reverse_dtmf_pt); + int cn_pt_match = __supp_codec_match(receiver, sink, cn_payload_type, NULL, NULL); // stop transcoding if we've determined that we don't need it if (MEDIA_ISSET(sink, TRANSCODE) && !sink_transcoding) { @@ -1240,8 +1249,8 @@ void codec_handlers_update(struct call_media *receiver, struct call_media *sink, GQueue *dest_codecs = NULL; if (!flags || !flags->always_transcode) { - // we ignore output codec matches if we must transcode DTMF - if (dtmf_pt_match == 1 && MEDIA_ISSET(sink, TRANSCODE)) + // we ignore output codec matches if we must transcode supp codecs + if ((dtmf_pt_match == 1 || cn_pt_match == 1) && MEDIA_ISSET(sink, TRANSCODE)) ; else if (pcm_dtmf_detect) ; @@ -1293,6 
+1302,11 @@ void codec_handlers_update(struct call_media *receiver, struct call_media *sink, if (rtp_payload_type_cmp_nf(pt, dest_pt)) goto transcode; + // do we need silence detection? + if (cn_pt_match == 2 && MEDIA_ISSET(sink, TRANSCODE)) + goto transcode; + + // XXX check format parameters as well ilog(LOG_DEBUG, "Sink supports codec " STR_FORMAT, STR_FMT(&pt->encoding_with_params)); __make_passthrough_gsl(handler, &passthrough_handlers); if (pt->codec_def && pt->codec_def->dtmf) @@ -2198,6 +2212,102 @@ void codec_handlers_stop(GQueue *q) { } + + +static void silence_event_free(void *p) { + g_slice_free1(sizeof(struct silence_event), p); +} + +#define __silence_detect_type(type) \ +static void __silence_detect_ ## type(struct codec_ssrc_handler *ch, AVFrame *frame, type thres) { \ + type *s = (void *) frame->data[0]; \ + struct silence_event *last = g_queue_peek_tail(&ch->silence_events); \ + \ + if (last && last->end) /* last event finished? */ \ + last = NULL; \ + \ + for (unsigned int i = 0; i < frame->nb_samples; i++) { \ + if (s[i] <= thres && s[i] >= -thres) { \ + /* silence: the current sample lies within [-thres, thres] */ \ + if (!last) { \ + /* new event */ \ + last = g_slice_alloc0(sizeof(*last)); \ + last->start = frame->pts + i; \ + g_queue_push_tail(&ch->silence_events, last); \ + } \ + } \ + else { \ + /* not silence */ \ + if (last && !last->end) { \ + /* close off event */ \ + last->end = frame->pts + i; \ + last = NULL; \ + } \ + } \ + } \ +} + +__silence_detect_type(double) +__silence_detect_type(float) +__silence_detect_type(int32_t) +__silence_detect_type(int16_t) + +static void __silence_detect(struct codec_ssrc_handler *ch, AVFrame *frame) { + if (!rtpe_config.silence_detect_int) + return; + if (ch->handler->cn_payload_type < 0) + return; + switch (frame->format) { + case AV_SAMPLE_FMT_DBL: + __silence_detect_double(ch, frame, rtpe_config.silence_detect_double); + break; + case AV_SAMPLE_FMT_FLT: + __silence_detect_float(ch, frame, rtpe_config.silence_detect_double); + break; + case 
AV_SAMPLE_FMT_S32: + __silence_detect_int32_t(ch, frame, rtpe_config.silence_detect_int); + break; + case AV_SAMPLE_FMT_S16: + __silence_detect_int16_t(ch, frame, rtpe_config.silence_detect_int >> 16); /* threshold is stored at 32-bit scale; shift it down for 16-bit samples */ + break; + default: + ilog(LOG_WARN | LOG_FLAG_LIMIT, "Unsupported sample format %i for silence detection", + frame->format); + } +} +/* Returns 1 and overwrites inout with rtpe_config.cn_payload if the packet span [pts, pts + duration) lies within the silence event at the head of events; returns 0 otherwise. Events that end at or before the end of the span are removed from the queue and freed. NOTE(review): assumes inout->s has room for cn_payload.len bytes - confirm against the caller. */ static int is_silence_event(str *inout, GQueue *events, uint64_t pts, uint64_t duration) { + uint64_t end = pts + duration; + + while (events->length) { + struct silence_event *first = g_queue_peek_head(events); + if (first->start > pts) // future event + return 0; + if (!first->end) // ongoing event + goto silence; + if (first->end > end) // event finished with end in the future + goto silence; + // event has ended: remove it + g_queue_pop_head(events); + // does the event fill the entire span? + if (first->end == end) { + silence_event_free(first); + goto silence; + } + // keep going, there might be more + silence_event_free(first); + } + return 0; + +silence: + // replace with CN payload + inout->len = rtpe_config.cn_payload.len; + memcpy(inout->s, rtpe_config.cn_payload.s, inout->len); + return 1; +} + + + + static struct ssrc_entry *__ssrc_handler_transcode_new(void *p) { struct codec_handler *h = p; @@ -2296,6 +2406,7 @@ static void __free_ssrc_handler(void *chp) { dtmf_rx_free(ch->dtmf_dsp); resample_shutdown(&ch->dtmf_resampler); g_queue_clear_full(&ch->dtmf_events, dtmf_event_free); + g_queue_clear_full(&ch->silence_events, silence_event_free); if (ch->dtx_buffer) obj_put(&ch->dtx_buffer->ttq.tt_obj); } @@ -2340,6 +2451,7 @@ static int packet_encoded_rtp(encoder_t *enc, void *u1, void *u2) { unsigned int repeats = 0; int payload_type = -1; + int is_dtmf = dtmf_event_payload(&inout, (uint64_t *) &enc->avpkt.pts, enc->avpkt.duration, &ch->dtmf_event, &ch->dtmf_events); if (is_dtmf) { @@ -2349,6 +2461,10 @@ static int packet_encoded_rtp(encoder_t *enc, void *u1, void *u2) { else if (is_dtmf == 3) repeats = 2; // DTMF end event } 
+ else { + if (is_silence_event(&inout, &ch->silence_events, enc->avpkt.pts, enc->avpkt.duration)) + payload_type = ch->handler->cn_payload_type; + } // ready to send @@ -2469,6 +2585,7 @@ static int packet_decoded_common(decoder_t *decoder, AVFrame *frame, void *u1, v } __dtmf_detect(ch, frame); + __silence_detect(ch, frame); // locking deliberately ignored if (mp->media_out) diff --git a/daemon/main.c b/daemon/main.c index c8c76f0cf..536316b6a 100644 --- a/daemon/main.c +++ b/daemon/main.c @@ -380,6 +380,8 @@ static void options(int *argc, char ***argv) { AUTO_CLEANUP_GBUF(dtmf_udp_ep); AUTO_CLEANUP_GBUF(endpoint_learning); AUTO_CLEANUP_GBUF(dtls_sig); + double silence_detect = 0; + AUTO_CLEANUP_GVBUF(cn_payload); GOptionEntry e[] = { { "table", 't', 0, G_OPTION_ARG_INT, &rtpe_config.kernel_table, "Kernel table to use", "INT" }, @@ -465,6 +467,8 @@ static void options(int *argc, char ***argv) { #ifdef WITH_TRANSCODING { "dtx-delay", 0,0, G_OPTION_ARG_INT, &rtpe_config.dtx_delay, "Delay in milliseconds to trigger DTX handling","INT"}, { "max-dtx", 0,0, G_OPTION_ARG_INT, &rtpe_config.max_dtx, "Maximum duration of DTX handling", "INT"}, + { "silence-detect",0,0, G_OPTION_ARG_DOUBLE, &silence_detect, "Audio level threshold in percent for silence detection","FLOAT"}, + { "cn-payload",0,0, G_OPTION_ARG_STRING_ARRAY,&cn_payload, "Comfort noise parameters to replace silence with","INT INT INT ..."}, #endif { NULL, } @@ -684,6 +688,34 @@ static void options(int *argc, char ***argv) { if (rtpe_config.jb_length < 0) die("Invalid negative jitter buffer size"); + + if (silence_detect > 0) { + rtpe_config.silence_detect_double = silence_detect / 100.0; + rtpe_config.silence_detect_int = silence_detect >= 50.0 ? INT32_MAX : (uint32_t) ((silence_detect / 100.0) * UINT32_MAX); /* saturate at INT32_MAX: the value is handed to int32_t/int16_t thresholds, and converting an out-of-range double to an integer is undefined */ + } + + if (!cn_payload) + str_init_dup(&rtpe_config.cn_payload, "\x20"); + else { + int len = g_strv_length(cn_payload); + if (len < 1) + die("Invalid CN payload specified"); + rtpe_config.cn_payload.s = malloc(len); + if (!rtpe_config.cn_payload.s) /* the loop below writes into this buffer */ + die("Out of memory allocating CN payload"); + for (int i = 0; i < len; 
i++) { + char *endp; + long p = strtol(cn_payload[i], &endp, 0); + if (endp == cn_payload[i] || *endp != '\0') + die("Invalid CN payload specified"); + if (p < 0 || p > 254) + die("Invalid CN payload specified"); + if (i == 0 && p > 127) + die("Invalid CN payload specified"); + rtpe_config.cn_payload.s[i] = p; + } + rtpe_config.cn_payload.len = len; + } } void fill_initial_rtpe_cfg(struct rtpengine_config* ini_rtpe_cfg) { diff --git a/daemon/rtpengine.pod b/daemon/rtpengine.pod index a2dd9cfc6..a2e73ea78 100644 --- a/daemon/rtpengine.pod +++ b/daemon/rtpengine.pod @@ -745,6 +745,51 @@ received within this time frame, then DTX processing will stop. Can be set to zero or negative to disable and keep DTX processing on indefinitely. Defaults to 30 seconds. +=item B<--silence-detect=>I<FLOAT> + +Enable silence detection and specify threshold in percent. This option is +applicable to transcoded streams only and defaults to zero (disabled). + +When enabled, silence detection will be performed on all transcoded audio +streams. The threshold specified here is the sensitivity for detecting silence: +higher thresholds result in more audio to be detected as silence, while lower +thresholds result in less audio to be detected as silence. The threshold is +specified as percent between zero and 100. If set to 100, then all audio would +be detected as silence; if set to 50, then any audio that is quieter than 50% +of the maximum volume would be detected as silence; and so on. Setting it to +zero disables silence detection. To only detect silence that is very near or +equal to absolute silence, set this value to a low number such as 0.01. (For +certain codecs such as PCMA, a higher minimum threshold is required to detect +complete silence, as their compressed payloads don't decode to actual silence +but instead have a residual DC offset. For PCMA the minimum value is 0.013.) + +Audio that is detected as silence will be replaced by comfort noise as +specified by the B<cn-payload> option (see below). 
Currently this is applicable +only to RTP peers that have advertised support for the B<CN> RTP payload type, +in which case the silence audio frames will be replaced by B<CN> RTP frames. + +=item B<--cn-payload=>I<INT> + +Specify one comfort noise parameter. This option can be given multiple times +and the format follows RFC 3389. When specified at the command line, list the +B<--cn-payload=> option multiple times, each one specifying a single CN +parameter. When used in the config file, list the option only a single time and +list multiple CN parameters separated by semicolons (e.g. +I<cn-payload = 10;20;30>). + +The first CN payload value given is the noise level, specified as -dBov as per +RFC 3389. This means that a noise level of zero corresponds to maximum volume, +while higher numbers correspond to lower volumes. The highest allowable number +is 127, corresponding to -127 dBov, which is near silence. + +Subsequent CN payload values carry spectral information (reflection +coefficients) as per RFC 3389. Allowable values for each coefficient are +between 0 and 254. Specifying spectral information is optional and the number +of coefficients listed (model order) is variable. + +The default values are 32 (-32 dBov) for the noise level and no spectral +information. 
+ =back =head1 INTERFACES diff --git a/include/main.h b/include/main.h index ab8d6f53c..df6df7407 100644 --- a/include/main.h +++ b/include/main.h @@ -108,6 +108,9 @@ struct rtpengine_config { int http_threads; int dtx_delay; int max_dtx; + double silence_detect_double; + uint32_t silence_detect_int; + str cn_payload; }; diff --git a/t/auto-daemon-tests.pl b/t/auto-daemon-tests.pl index 59b48a520..f975eb1a0 100755 --- a/t/auto-daemon-tests.pl +++ b/t/auto-daemon-tests.pl @@ -10,7 +10,7 @@ use NGCP::Rtpclient::ICE; autotest_start(qw(--config-file=none -t -1 -i 203.0.113.1 -i 2001:db8:4321::1 - -n 2223 -c 12345 -f -L 7 -E -u 2222)) + -n 2223 -c 12345 -f -L 7 -E -u 2222 --silence-detect=1)) or die; @@ -684,7 +684,7 @@ o=- 1545997027 1 IN IP4 198.51.101.1 s=tester t=0 0 m=audio 3000 RTP/AVP 0 -c=IN IP4 198.51.100.1 +c=IN IP4 198.51.101.1 a=sendrecv ---------------------------------- v=0 @@ -721,13 +721,82 @@ a=rtcp:PORT SDP snd($sock_a, $port_b, rtp(0, 1000, 3000, 0x1234, "\x00" x 160)); -rcv($sock_b, $port_a, rtpm(0, 1000, 3000, 0x1234, "\x00" x 160)); +rcv($sock_b, $port_a, rtpm(0, 1000, 3000, -1, "\x00" x 160)); +snd($sock_b, $port_a, rtp(0, 2000, 4000, 0x3456, "\x00" x 160)); +($ssrc) = rcv($sock_a, $port_b, rtpm(0, 2000, 4000, -1, "\x00" x 160)); +snd($sock_b, $port_a, rtp(13, 2001, 4160, 0x3456, "\x12\x23\x23\x34\x56")); +rcv($sock_a, $port_b, rtpm(0, 2001, 4160, $ssrc, 
"\xce\x56\x69\xcc\x61\xca\x63\xd2\x66\x57\xe2\x47\x65\x59\x6a\x74\x5d\x4a\x68\xe9\x60\x4a\x63\x4b\xf4\x43\x4b\x48\x48\x52\x39\x57\x37\x4c\x39\x4c\x48\x3b\x43\x47\x44\x57\x48\xf5\x3e\x59\x3e\x52\x3b\x53\x3d\x53\x3b\x41\x5b\x38\x4a\x4b\x35\x48\x4a\x3e\x52\x50\x4b\x46\xfd\x3e\xf1\x3a\xd6\x35\x54\x5d\x3a\x58\x45\x42\x3d\x3e\x4c\x42\x3a\x58\x3c\x50\x3b\x6e\x36\x60\x3e\x3d\x3b\x41\x3a\x47\x35\x48\x35\x4b\x3e\x3d\x47\x3a\x3d\x39\x4f\x40\x42\x4a\x47\x3d\x6b\x42\x5a\x75\x53\x45\x5a\x4b\x4f\x48\x59\x48\x78\x43\x77\x4c\x42\x59\x47\x46\x3e\x67\x44\x3a\x67\x4b\x3f\x51\x48\x44\x3e\x54\x37\x6c\x45\x45\x3f\x6e\x3a\x68\x49\x4e\x3f\x47\x4b\x3e\xf3\x39")); +snd($sock_b, $port_a, rtp(0, 2002, 4320, 0x3456, "\x00" x 160)); +rcv($sock_a, $port_b, rtpm(0, 2002, 4320, $ssrc, "\x00" x 160)); +# test silence detection +snd($sock_a, $port_b, rtp(0, 1001, 3160, 0x1234, "\x00" x 160)); +($ssrc) = rcv($sock_b, $port_a, rtpm(0, 1001, 3160, -1, "\x00" x 160)); +snd($sock_a, $port_b, rtp(0, 1002, 3320, 0x1234, "\xff" x 160)); +rcv($sock_b, $port_a, rtpm(13, 1002, 3320, $ssrc, "\x20")); + + + +# reverse of the above, sockets/ports swapped + +($sock_b, $sock_a) = new_call([qw(198.51.101.1 6002)], [qw(198.51.101.3 7002)]); + +($port_b) = offer('accept CN', + { ICE => 'remove', replace => ['origin'], flags => ['always transcode'] }, < 'remove', replace => ['origin'] }, <