TT#101653 add silence detection option

Change-Id: I14940fcabdef475ed5dbe74d9af7ebc2af10311b
5 years ago · 73e4ef42eb
--- a/README.md
+++ b/README.md
@ -54,6 +54,7 @@ the following additional features are available:
 - Injection of DTMF events or PCM DTMF tones into running audio streams
 - Playback of pre-recorded streams/announcements
 - Transcoding between T.38 and PCM (G.711 or other audio codecs)
 - Silence detection and comfort noise (RFC 3389) payloads

 *Rtpengine* does not (yet) support:

--- a/daemon/codec.c
+++ b/daemon/codec.c
@ -85,6 +85,11 @@ struct dtx_entry {
 	void *ssrc_ptr; // opaque pointer, doesn't hold a reference
 };

 // One stretch of detected silence, queued per SSRC handler by the silence
 // detector and consumed when encoded packets are about to be sent.
 struct silence_event {
 	uint64_t start; // frame PTS plus sample index of the first silent sample
 	uint64_t end; // frame PTS plus sample index of the first non-silent sample; zero while still ongoing
 };

 struct codec_ssrc_handler {
 	struct ssrc_entry h; // must be first
 	struct codec_handler *handler;
@ -109,6 +114,9 @@ struct codec_ssrc_handler {
 	GQueue dtmf_events;
 	struct dtmf_event dtmf_event;

 	// silence detection
 	GQueue silence_events;

 	uint64_t skip_pts;

 	int rtp_mark:1;
@ -1169,6 +1177,7 @@ void codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 	struct rtp_payload_type *dtmf_pt = NULL;
 	struct rtp_payload_type *reverse_dtmf_pt = NULL;
 	int dtmf_pt_match = __supp_codec_match(receiver, sink, dtmf_payload_type, &dtmf_pt, &reverse_dtmf_pt);
 	int cn_pt_match = __supp_codec_match(receiver, sink, cn_payload_type, NULL, NULL);

 	// stop transcoding if we've determined that we don't need it
 	if (MEDIA_ISSET(sink, TRANSCODE) && !sink_transcoding) {
@ -1240,8 +1249,8 @@ void codec_handlers_update(struct call_media *receiver, struct call_media *sink,

 		GQueue *dest_codecs = NULL;
 		if (!flags || !flags->always_transcode) {
 			// we ignore output codec matches if we must transcode DTMF
 			if (dtmf_pt_match == 1 && MEDIA_ISSET(sink, TRANSCODE))
 			// we ignore output codec matches if we must transcode supp codecs
 			if ((dtmf_pt_match == 1 || cn_pt_match == 1) && MEDIA_ISSET(sink, TRANSCODE))
 				;
 			else if (pcm_dtmf_detect)
 				;
@ -1293,6 +1302,11 @@ void codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 			if (rtp_payload_type_cmp_nf(pt, dest_pt))
 				goto transcode;

 			// do we need silence detection?
 			if (cn_pt_match == 2 && MEDIA_ISSET(sink, TRANSCODE))
 				goto transcode;

 			// XXX check format parameters as well
 			ilog(LOG_DEBUG, "Sink supports codec " STR_FORMAT, STR_FMT(&pt->encoding_with_params));
 			__make_passthrough_gsl(handler, &passthrough_handlers);
 			if (pt->codec_def && pt->codec_def->dtmf)
@ -2198,6 +2212,102 @@ void codec_handlers_stop(GQueue *q) {
 }




 // Releases one slice-allocated silence event. Takes a void pointer so it can
 // double as the destroy callback for g_queue_clear_full().
 static void silence_event_free(void *p) {
 	struct silence_event *ev = p;
 	g_slice_free1(sizeof(*ev), ev);
 }

 // Generates __silence_detect_<type>() for one PCM sample type.
 //
 // The generated function scans the samples in frame->data[0] and records
 // stretches of silence in ch->silence_events. A sample counts as silent when
 // it lies within [-thres, +thres]. A new event is opened at the first silent
 // sample (start = frame->pts + sample index) and closed at the first
 // non-silent sample (end = frame->pts + sample index). An event that is still
 // open at the end of the frame keeps end == 0 and is continued by the next
 // call.
 #define __silence_detect_type(type) \
 static void __silence_detect_ ## type(struct codec_ssrc_handler *ch, AVFrame *frame, type thres) { \
 	type *s = (void *) frame->data[0]; \
 	struct silence_event *last = g_queue_peek_tail(&ch->silence_events); \
 \
 	if (last && last->end) /* last event finished? */ \
 		last = NULL; \
 \
 	for (unsigned int i = 0; i < frame->nb_samples; i++) { \
 		/* both bounds must be tested against the current sample */ \
 		if (s[i] <= thres && s[i] >= -thres) { \
 			/* silence */ \
 			if (!last) { \
 				/* new event */ \
 				last = g_slice_alloc0(sizeof(*last)); \
 				last->start = frame->pts + i; \
 				g_queue_push_tail(&ch->silence_events, last); \
 			} \
 		} \
 		else { \
 			/* not silence */ \
 			if (last && !last->end) { \
 				/* close off event */ \
 				last->end = frame->pts + i; \
 				last = NULL; \
 			} \
 		} \
 	} \
 }

 __silence_detect_type(double)
 __silence_detect_type(float)
 __silence_detect_type(int32_t)
 __silence_detect_type(int16_t)

 // Runs silence detection on one decoded frame, dispatching on the frame's
 // sample format to the matching typed detector.
 //
 // Does nothing when rtpe_config.silence_detect_int is zero or when the
 // handler's cn_payload_type is negative (presumably: no CN payload type
 // available for this handler -- confirm against the handler setup code).
 //
 // The integer threshold is stored as uint32_t. It is clamped to the maximum
 // of the signed sample type before being passed on, because an unclamped
 // value above that maximum would wrap to a negative threshold and no sample
 // would ever be classified as silence.
 static void __silence_detect(struct codec_ssrc_handler *ch, AVFrame *frame) {
 	if (!rtpe_config.silence_detect_int)
 		return;
 	if (ch->handler->cn_payload_type < 0)
 		return;
 	switch (frame->format) {
 		case AV_SAMPLE_FMT_DBL:
 			__silence_detect_double(ch, frame, rtpe_config.silence_detect_double);
 			break;
 		case AV_SAMPLE_FMT_FLT:
 			__silence_detect_float(ch, frame, rtpe_config.silence_detect_double);
 			break;
 		case AV_SAMPLE_FMT_S32: {
 			uint32_t thres = rtpe_config.silence_detect_int;
 			if (thres > INT32_MAX)
 				thres = INT32_MAX;
 			__silence_detect_int32_t(ch, frame, (int32_t) thres);
 			break;
 		}
 		case AV_SAMPLE_FMT_S16: {
 			// scale the 32-bit threshold down to the 16-bit sample range
 			uint32_t thres = rtpe_config.silence_detect_int >> 16;
 			if (thres > INT16_MAX)
 				thres = INT16_MAX;
 			__silence_detect_int16_t(ch, frame, (int16_t) thres);
 			break;
 		}
 		default:
 			ilog(LOG_WARN | LOG_FLAG_LIMIT, "Unsupported sample format %i for silence detection",
 					frame->format);
 			break;
 	}
 }
 // Decides whether the packet spanning [pts, pts + duration) falls into a
 // recorded silence event. Events at the head of the queue that ended at or
 // before the end of the span are removed and freed along the way.
 //
 // Returns 1 if the span counts as silence, in which case the contents of
 // `inout` are overwritten with the configured CN payload; returns 0 otherwise
 // and leaves `inout` untouched.
 // NOTE(review): assumes the buffer behind inout->s can hold the CN payload --
 // confirm against the caller.
 static int is_silence_event(str *inout, GQueue *events, uint64_t pts, uint64_t duration) {
 	const uint64_t span_end = pts + duration;
 	int covered = 0;

 	while (!covered && events->length) {
 		struct silence_event *ev = g_queue_peek_head(events);

 		// event lies in the future: nothing applies to this span
 		if (ev->start > pts)
 			return 0;

 		// still ongoing, or ends beyond this span: silence, keep the event
 		if (!ev->end || ev->end > span_end) {
 			covered = 1;
 			break;
 		}

 		// event is over: drop it, and check whether it filled the whole span
 		g_queue_pop_head(events);
 		covered = (ev->end == span_end);
 		silence_event_free(ev);
 		// otherwise keep going, there might be more
 	}

 	if (!covered)
 		return 0;

 	// replace with CN payload
 	inout->len = rtpe_config.cn_payload.len;
 	memcpy(inout->s, rtpe_config.cn_payload.s, inout->len);
 	return 1;
 }




 static struct ssrc_entry *__ssrc_handler_transcode_new(void *p) {
 	struct codec_handler *h = p;

@ -2296,6 +2406,7 @@ static void __free_ssrc_handler(void *chp) {
 		dtmf_rx_free(ch->dtmf_dsp);
 	resample_shutdown(&ch->dtmf_resampler);
 	g_queue_clear_full(&ch->dtmf_events, dtmf_event_free);
 	g_queue_clear_full(&ch->silence_events, silence_event_free);
 	if (ch->dtx_buffer)
 		obj_put(&ch->dtx_buffer->ttq.tt_obj);
 }
@ -2340,6 +2451,7 @@ static int packet_encoded_rtp(encoder_t *enc, void *u1, void *u2) {

 		unsigned int repeats = 0;
 		int payload_type = -1;

 		int is_dtmf = dtmf_event_payload(&inout, (uint64_t *) &enc->avpkt.pts, enc->avpkt.duration,
 				&ch->dtmf_event, &ch->dtmf_events);
 		if (is_dtmf) {
@ -2349,6 +2461,10 @@ static int packet_encoded_rtp(encoder_t *enc, void *u1, void *u2) {
 			else if (is_dtmf == 3)
 				repeats = 2; // DTMF end event
 		}
 		else {
 			if (is_silence_event(&inout, &ch->silence_events, enc->avpkt.pts, enc->avpkt.duration))
 				payload_type = ch->handler->cn_payload_type;
 		}

 		// ready to send

@ -2469,6 +2585,7 @@ static int packet_decoded_common(decoder_t *decoder, AVFrame *frame, void *u1, v
 	}

 	__dtmf_detect(ch, frame);
 	__silence_detect(ch, frame);

 	// locking deliberately ignored
 	if (mp->media_out)
--- a/daemon/main.c
+++ b/daemon/main.c
@ -380,6 +380,8 @@ static void options(int *argc, char ***argv) {
 	AUTO_CLEANUP_GBUF(dtmf_udp_ep);
 	AUTO_CLEANUP_GBUF(endpoint_learning);
 	AUTO_CLEANUP_GBUF(dtls_sig);
 	double silence_detect = 0;
 	AUTO_CLEANUP_GVBUF(cn_payload);

 	GOptionEntry e[] = {
 		{ "table",	't', 0, G_OPTION_ARG_INT,	&rtpe_config.kernel_table,		"Kernel table to use",		"INT"		},
@ -465,6 +467,8 @@ static void options(int *argc, char ***argv) {
 #ifdef WITH_TRANSCODING
 		{ "dtx-delay",	0,0,	G_OPTION_ARG_INT,	&rtpe_config.dtx_delay,	"Delay in milliseconds to trigger DTX handling","INT"},
 		{ "max-dtx",	0,0,	G_OPTION_ARG_INT,	&rtpe_config.max_dtx,	"Maximum duration of DTX handling",	"INT"},
 		{ "silence-detect",0,0,	G_OPTION_ARG_DOUBLE,	&silence_detect,	"Audio level threshold in percent for silence detection","FLOAT"},
 		{ "cn-payload",0,0,	G_OPTION_ARG_STRING_ARRAY,&cn_payload,		"Comfort noise parameters to replace silence with","INT INT INT ..."},
 #endif

 		{ NULL, }
@ -684,6 +688,32 @@ static void options(int *argc, char ***argv) {

 	if (rtpe_config.jb_length < 0)
 		die("Invalid negative jitter buffer size");

 	if (silence_detect > 0) {
 		rtpe_config.silence_detect_double = silence_detect / 100.0;
 		rtpe_config.silence_detect_int = (int) ((silence_detect / 100.0) * UINT32_MAX);
 	}

 	if (!cn_payload)
 		str_init_dup(&rtpe_config.cn_payload, "\x20");
 	else {
 		int len = g_strv_length(cn_payload);
 		if (len < 1)
 			die("Invalid CN payload specified");
 		rtpe_config.cn_payload.s = malloc(len);
 		for (int i = 0; i < len; i++) {
 			char *endp;
 			long p = strtol(cn_payload[i], &endp, 0);
 			if (endp == cn_payload[i] || *endp != '\0')
 				die("Invalid CN payload specified");
 			if (p < 0 || p > 254)
 				die("Invalid CN payload specified");
 			if (i == 0 && p > 127)
 				die("Invalid CN payload specified");
 			rtpe_config.cn_payload.s[i] = p;
 		}
 		rtpe_config.cn_payload.len = len;
 	}
 }

 void fill_initial_rtpe_cfg(struct rtpengine_config* ini_rtpe_cfg) {
--- a/daemon/rtpengine.pod
+++ b/daemon/rtpengine.pod
@ -745,6 +745,51 @@ received within this time frame, then DTX processing will stop. Can be set to
 zero or negative to disable and keep DTX processing on indefinitely. Defaults
 to 30 seconds.

 =item B<--silence-detect=>I<FLOAT>

 Enable silence detection and specify the threshold in percent. This option is
 applicable to transcoded streams only and defaults to zero (disabled).

 When enabled, silence detection will be performed on all transcoded audio
 streams. The threshold specified here is the sensitivity for detecting silence:
 higher thresholds result in more audio being detected as silence, while lower
 thresholds result in less audio being detected as silence. The threshold is
 specified as percent between zero and 100. If set to 100, then all audio would
 be detected as silence; if set to 50, then any audio that is quieter than 50%
 of the maximum volume would be detected as silence; and so on. Setting it to
 zero disables silence detection. To only detect silence that is very near or
 equal to absolute silence, set this value to a low number such as 0.01. (For
 certain codecs such as PCMA, a higher minimum threshold is required to detect
 complete silence, as their compressed payloads don't decode to actual silence
 but instead have a residual DC offset. For PCMA the minimum value is 0.013.)

 Audio that is detected as silence will be replaced by comfort noise as
 specified by the B<cn-payload> option (see below). Currently this is applicable
 only to RTP peers that have advertised support for the B<CN> RTP payload type,
 in which case the silence audio frames will be replaced by B<CN> RTP frames.

 =item B<--cn-payload=>I<INT>

 Specify one comfort noise parameter. This option can be given multiple times
 and the format follows RFC 3389. When specified at the command line, list the
 B<--cn-payload=> option multiple times, each one specifying a single CN
 parameter. When used in the config file, list the option only a single time and
 list multiple CN parameters separated by semicolons (e.g.
 I<cn-payload = 20;40;60>).

 The first CN payload value given is the noise level, specified as -dBov as per
 RFC 3389. This means that a noise level of zero corresponds to maximum volume,
 while higher numbers correspond to lower volumes. The highest allowable number
 is 127, corresponding to -127 dBov, which is near silence.

 Subsequent CN payload values carry spectral information (reflection
 coefficients) as per RFC 3389. Allowable values for each coefficient are
 between 0 and 254. Specifying spectral information is optional and the number
 of coefficients listed (model order) is variable.

 The default values are 32 (-32 dBov) for the noise level and no spectral
 information.

 =back

 =head1 INTERFACES
--- a/include/main.h
+++ b/include/main.h
@ -108,6 +108,9 @@ struct rtpengine_config {
 	int			http_threads;
 	int			dtx_delay;
 	int			max_dtx;
 	double			silence_detect_double;
 	uint32_t		silence_detect_int;
 	str			cn_payload;
 };


--- a/t/auto-daemon-tests.pl
+++ b/t/auto-daemon-tests.pl
@ -10,7 +10,7 @@ use NGCP::Rtpclient::ICE;


 autotest_start(qw(--config-file=none -t -1 -i 203.0.113.1 -i 2001:db8:4321::1
 			-n 2223 -c 12345 -f -L 7 -E -u 2222))
 			-n 2223 -c 12345 -f -L 7 -E -u 2222 --silence-detect=1))
 		or die;


@ -684,7 +684,7 @@ o=- 1545997027 1 IN IP4 198.51.101.1
 s=tester
 t=0 0
 m=audio 3000 RTP/AVP 0
 c=IN IP4 198.51.100.1
 c=IN IP4 198.51.101.1
 a=sendrecv
 ----------------------------------
 v=0
@ -721,13 +721,82 @@ a=rtcp:PORT
 SDP

 snd($sock_a, $port_b, rtp(0, 1000, 3000, 0x1234, "\x00" x 160));
 rcv($sock_b, $port_a, rtpm(0, 1000, 3000, 0x1234, "\x00" x 160));
 rcv($sock_b, $port_a, rtpm(0, 1000, 3000, -1, "\x00" x 160));
 snd($sock_b, $port_a, rtp(0, 2000, 4000, 0x3456, "\x00" x 160));
 ($ssrc) = rcv($sock_a, $port_b, rtpm(0, 2000, 4000, -1, "\x00" x 160));
 snd($sock_b, $port_a, rtp(13, 2001, 4160, 0x3456, "\x12\x23\x23\x34\x56"));
 rcv($sock_a, $port_b, rtpm(0, 2001, 4160, $ssrc, "\xce\x56\x69\xcc\x61\xca\x63\xd2\x66\x57\xe2\x47\x65\x59\x6a\x74\x5d\x4a\x68\xe9\x60\x4a\x63\x4b\xf4\x43\x4b\x48\x48\x52\x39\x57\x37\x4c\x39\x4c\x48\x3b\x43\x47\x44\x57\x48\xf5\x3e\x59\x3e\x52\x3b\x53\x3d\x53\x3b\x41\x5b\x38\x4a\x4b\x35\x48\x4a\x3e\x52\x50\x4b\x46\xfd\x3e\xf1\x3a\xd6\x35\x54\x5d\x3a\x58\x45\x42\x3d\x3e\x4c\x42\x3a\x58\x3c\x50\x3b\x6e\x36\x60\x3e\x3d\x3b\x41\x3a\x47\x35\x48\x35\x4b\x3e\x3d\x47\x3a\x3d\x39\x4f\x40\x42\x4a\x47\x3d\x6b\x42\x5a\x75\x53\x45\x5a\x4b\x4f\x48\x59\x48\x78\x43\x77\x4c\x42\x59\x47\x46\x3e\x67\x44\x3a\x67\x4b\x3f\x51\x48\x44\x3e\x54\x37\x6c\x45\x45\x3f\x6e\x3a\x68\x49\x4e\x3f\x47\x4b\x3e\xf3\x39"));
 snd($sock_b, $port_a, rtp(0, 2002, 4320, 0x3456, "\x00" x 160));
 rcv($sock_a, $port_b, rtpm(0, 2002, 4320, $ssrc, "\x00" x 160));
 # test silence detection
 snd($sock_a, $port_b, rtp(0, 1001, 3160, 0x1234, "\x00" x 160));
 ($ssrc) = rcv($sock_b, $port_a, rtpm(0, 1001, 3160, -1, "\x00" x 160));
 snd($sock_a, $port_b, rtp(0, 1002, 3320, 0x1234, "\xff" x 160));
 rcv($sock_b, $port_a, rtpm(13, 1002, 3320, $ssrc, "\x20"));



 # reverse of the above, sockets/ports swapped

 ($sock_b, $sock_a) = new_call([qw(198.51.101.1 6002)], [qw(198.51.101.3 7002)]);

 ($port_b) = offer('accept CN',
 	{ ICE => 'remove', replace => ['origin'], flags => ['always transcode'] }, <<SDP);
 v=0
 o=- 1545997027 1 IN IP4 198.51.101.1
 s=tester
 t=0 0
 m=audio 6002 RTP/AVP 0 13
 c=IN IP4 198.51.101.1
 a=sendrecv
 ----------------------------------
 v=0
 o=- 1545997027 1 IN IP4 203.0.113.1
 s=tester
 t=0 0
 m=audio PORT RTP/AVP 0 13
 c=IN IP4 203.0.113.1
 a=rtpmap:0 PCMU/8000
 a=rtpmap:13 CN/8000
 a=sendrecv
 a=rtcp:PORT
 SDP

 ($port_a) = answer('accept CN',
 	{ ICE => 'remove', replace => ['origin'] }, <<SDP);
 v=0
 o=- 1545997027 1 IN IP4 198.51.101.1
 s=tester
 t=0 0
 m=audio 7002 RTP/AVP 0
 c=IN IP4 198.51.101.3
 a=sendrecv
 ----------------------------------
 v=0
 o=- 1545997027 1 IN IP4 203.0.113.1
 s=tester
 t=0 0
 m=audio PORT RTP/AVP 0 13
 c=IN IP4 203.0.113.1
 a=rtpmap:0 PCMU/8000
 a=rtpmap:13 CN/8000
 a=sendrecv
 a=rtcp:PORT
 SDP

 snd($sock_a, $port_b, rtp(0, 1000, 3000, 0x1234, "\x00" x 160));
 rcv($sock_b, $port_a, rtpm(0, 1000, 3000, -1, "\x00" x 160));
 snd($sock_b, $port_a, rtp(0, 2000, 4000, 0x3456, "\x00" x 160));
 ($ssrc) = rcv($sock_a, $port_b, rtpm(0, 2000, 4000, -1, "\x00" x 160));
 snd($sock_b, $port_a, rtp(13, 2001, 4160, 0x3456, "\x12\x23\x23\x34\x56"));
 rcv($sock_a, $port_b, rtpm(0, 2001, 4160, $ssrc, "\xce\x56\x69\xcc\x61\xca\x63\xd2\x66\x57\xe2\x47\x65\x59\x6a\x74\x5d\x4a\x68\xe9\x60\x4a\x63\x4b\xf4\x43\x4b\x48\x48\x52\x39\x57\x37\x4c\x39\x4c\x48\x3b\x43\x47\x44\x57\x48\xf5\x3e\x59\x3e\x52\x3b\x53\x3d\x53\x3b\x41\x5b\x38\x4a\x4b\x35\x48\x4a\x3e\x52\x50\x4b\x46\xfd\x3e\xf1\x3a\xd6\x35\x54\x5d\x3a\x58\x45\x42\x3d\x3e\x4c\x42\x3a\x58\x3c\x50\x3b\x6e\x36\x60\x3e\x3d\x3b\x41\x3a\x47\x35\x48\x35\x4b\x3e\x3d\x47\x3a\x3d\x39\x4f\x40\x42\x4a\x47\x3d\x6b\x42\x5a\x75\x53\x45\x5a\x4b\x4f\x48\x59\x48\x78\x43\x77\x4c\x42\x59\x47\x46\x3e\x67\x44\x3a\x67\x4b\x3f\x51\x48\x44\x3e\x54\x37\x6c\x45\x45\x3f\x6e\x3a\x68\x49\x4e\x3f\x47\x4b\x3e\xf3\x39"));
 snd($sock_b, $port_a, rtp(0, 2002, 4320, 0x3456, "\x00" x 160));
 rcv($sock_a, $port_b, rtpm(0, 2002, 4320, $ssrc, "\x00" x 160));
 # test silence detection
 snd($sock_a, $port_b, rtp(0, 1001, 3160, 0x1234, "\x00" x 160));
 ($ssrc) = rcv($sock_b, $port_a, rtpm(0, 1001, 3160, -1, "\x00" x 160));
 snd($sock_a, $port_b, rtp(0, 1002, 3320, 0x1234, "\xff" x 160));
 rcv($sock_b, $port_a, rtpm(13, 1002, 3320, $ssrc, "\x20"));