diff --git a/daemon/Makefile b/daemon/Makefile
index ffaa13a6d..9940ba92c 100644
--- a/daemon/Makefile
+++ b/daemon/Makefile
@@ -82,7 +82,7 @@ SRCS=		main.c kernel.c poller.c aux.c control_tcp.c call.c control_udp.c redis.c
 		crypto.c rtp.c call_interfaces.strhash.c dtls.c log.c cli.c graphite.c ice.c \
 		media_socket.c homer.c recording.c statistics.c cdr.c ssrc.c iptables.c tcp_listener.c \
 		codec.c load.c dtmf.c timerthread.c media_player.c jitter_buffer.c t38.c websocket.c \
-		mqtt.c janus.strhash.c
+		mqtt.c janus.strhash.c audio_player.c
 LIBSRCS=	loglib.c auxlib.c rtplib.c str.c socket.c streambuf.c ssllib.c dtmflib.c mix_buffer.c
 ifeq ($(with_transcoding),yes)
 LIBSRCS+=	codeclib.strhash.c resample.c
diff --git a/daemon/audio_player.c b/daemon/audio_player.c
new file mode 100644
index 000000000..f3f79926e
--- /dev/null
+++ b/daemon/audio_player.c
@@ -0,0 +1,204 @@
+#ifdef WITH_TRANSCODING
+
+#include "audio_player.h"
+#include "call.h"
+#include "media_player.h"
+#include "mix_buffer.h"
+#include "codec.h"
+
+
+struct audio_player { // audio player state, stored in call_media->audio_player
+	struct media_player *mp; // media player used for output; its run_func is audio_player_run()
+	struct mix_buffer mb; // written by audio_player_add_frame(), read by audio_player_run()
+	struct timeval last_run; // rtpe_now as of audio_player_start() or the latest audio_player_run()
+
+	unsigned int ptime_us; // packet duration in microseconds (ptime in ms * 1000)
+	unsigned int ptime; // in samples
+
+	unsigned long long pts; // timestamp given to media_player_add_packet(); advanced by ptime after each packet
+};
+
+
+// installed as mp->run_func by audio_player_setup(): reads ap->ptime samples from the mix buffer and passes them to media_player_add_packet(). call is locked in R and mp is locked
+static bool audio_player_run(struct media_player *mp) {
+	if (!mp || !mp->media)
+		return false;
+
+	struct audio_player *ap = mp->media->audio_player;
+	if (!ap || !ap->ptime_us) // no player attached, or it has not been set up
+		return false;
+
+	ap->last_run = rtpe_now; // equals mp->next_run
+
+	unsigned int size; // filled in by mix_buffer_read_fast()
+	void *buf = mix_buffer_read_fast(&ap->mb, ap->ptime, &size);
+	if (!buf) { // fast path unavailable: read into a temporary stack buffer instead
+		buf = g_alloca(size); // NOTE(review): assumes mix_buffer_read_fast() sets `size` even when it returns NULL - confirm
+		mix_buffer_read_slow(&ap->mb, buf, ap->ptime);
+	}
+
+	media_player_add_packet(mp, buf, size, ap->ptime_us, ap->pts);
+	ap->pts += ap->ptime; // next packet starts one ptime (in samples) later
+
+	return false; // always false, on every path
+}
+
+// Creates the audio player of `m` for output codec `dst_pt`, or resets an existing one whose parameters changed; size_ms == 0 selects rtpe_config.audio_buffer_length. Returns true if the player is usable, false on bad parameters or on setup failure (the latter also frees the player). Call locked in W
+bool audio_player_setup(struct call_media *m, const struct rtp_payload_type *dst_pt,
+		unsigned int size_ms, unsigned int delay_ms)
+{
+	if (!dst_pt)
+		return false;
+	unsigned int bufsize_ms = size_ms;
+	if (!bufsize_ms)
+		bufsize_ms = rtpe_config.audio_buffer_length;
+	if (!bufsize_ms) // neither the caller nor the config supplied a buffer length
+		return false;
+
+	unsigned int clockrate = fraction_mult(dst_pt->clock_rate, &dst_pt->codec_def->default_clockrate_fact);
+
+	unsigned int ptime_ms = m->ptime;
+	if (!ptime_ms)
+		ptime_ms = 20; // fallback when the media has no ptime set
+	unsigned int ptime_us = ptime_ms * 1000;
+	unsigned int ptime_smp = ptime_ms * clockrate / 1000; // in samples
+
+	// TODO: shortcut this to avoid the detour of avframe -> avpacket -> avframe (all in s16)
+	// TODO: determine dest sample format from created encoder
+	struct rtp_payload_type src_pt = {
+		.payload_type = -1,
+		.encoding = STR_CONST_INIT("PCM-S16LE"), // XXX support flp
+		.channels = dst_pt->channels,
+		.clock_rate = clockrate,
+		.ptime = ptime_ms,
+	};
+
+	struct audio_player *ap;
+	struct media_player *mp = NULL;
+
+	// check if objects exist and parameters are still the same
+
+	if ((ap = m->audio_player) && (mp = ap->mp)) {
+		if (!media_player_pt_match(mp, &src_pt, dst_pt))
+			{ /* do reset below */ }
+		else if (ap->ptime != ptime_smp || ap->ptime_us != ptime_us) // chained on purpose: a payload type mismatch alone must force the reset
+			{ /* do reset below */ }
+		else // everything matched
+			return true;
+
+		ilogs(transcoding, LOG_DEBUG, "Resetting audio player for new parameters");
+	}
+	else
+		ilogs(transcoding, LOG_DEBUG, "Creating new audio player");
+
+	// create ap and mp objects, or reset them if needed
+
+	if (ap) {
+		mix_buffer_destroy(&ap->mb);
+		ZERO(ap->mb);
+	}
+	else
+		ap = m->audio_player = g_slice_alloc0(sizeof(*m->audio_player));
+
+	if (mp)
+		media_player_stop(mp);
+	else
+		mp = ap->mp = media_player_new(m->monologue);
+	if (!mp)
+		goto error;
+
+	// set everything up
+
+	src_pt.codec_def = codec_find_by_av(AV_CODEC_ID_PCM_S16LE); // XXX shortcut this?
+
+	mp->run_func = audio_player_run;
+
+	ap->ptime_us = ptime_us;
+	ap->ptime = ptime_smp;
+
+	if (media_player_setup(mp, &src_pt, dst_pt)) // non-zero return is treated as failure
+		goto error;
+
+	bufsize_ms = MAX(bufsize_ms, ptime_ms * 2); // make sure the buf size is at least 2 frames
+
+	mix_buffer_init(&ap->mb, AV_SAMPLE_FMT_S16, clockrate, dst_pt->channels, bufsize_ms, delay_ms);
+
+	return true;
+
+error:
+	audio_player_free(m);
+	return false;
+}
+
+
+// attaches the audio player's media player to `m` and, unless a run is already scheduled, schedules the first run one ptime from now. call locked in W
+void audio_player_start(struct call_media *m) {
+	struct audio_player *ap;
+
+	if (!m || !(ap = m->audio_player)) // nothing to start without a media or a player
+		return;
+
+	struct media_player *mp = ap->mp;
+	if (!mp)
+		return;
+
+	media_player_set_media(mp, m); // done on every call, also when the player is already running
+
+	if (mp->next_run.tv_sec) // already running?
+		return;
+
+	ilogs(transcoding, LOG_DEBUG, "Starting audio player");
+
+	ap->last_run = rtpe_now;
+
+	mp->next_run = rtpe_now; // first run is due ptime_us after now
+	timeval_add_usec(&mp->next_run, ap->ptime_us);
+	timerthread_obj_schedule_abs(&mp->tt_obj, &mp->next_run);
+
+}
+
+// writes a decoded frame's samples into the mix buffer under `ssrc`; failure is only logged. NOTE(review): `frame` is not freed here - confirm who owns it
+void audio_player_add_frame(struct audio_player *ap, uint32_t ssrc, AVFrame *frame) {
+	if (mix_buffer_write(&ap->mb, ssrc, frame->extended_data[0], frame->nb_samples))
+		return;
+	ilogs(transcoding, LOG_WARN | LOG_FLAG_LIMIT, "Failed to add samples to mix buffer");
+}
+
+// stops the media's audio player, if any, and hands its media player to media_player_put(); the audio_player object itself stays attached to the media
+void audio_player_stop(struct call_media *m) {
+	struct audio_player *player = m->audio_player;
+	if (player) {
+		ilogs(transcoding, LOG_DEBUG, "Stopping audio player");
+		media_player_stop(player->mp);
+		media_player_put(&player->mp);
+	}
+}
+
+// true only if the media has an audio player with a media player whose next_run.tv_sec is non-zero
+bool audio_player_is_active(struct call_media *m) {
+	const struct audio_player *player = m->audio_player;
+	if (!player)
+		return false;
+
+	const struct media_player *out = player->mp;
+	// a non-zero next_run.tv_sec is what audio_player_start() takes to mean "already running"
+	return out && out->next_run.tv_sec;
+}
+
+// compares `pt` against the dest_pt of the player's codec handler. no NULL checks: m->audio_player, its mp and mp->coder.handler must be set (codec.c calls audio_player_is_active() first)
+bool audio_player_pt_match(struct call_media *m, const struct rtp_payload_type *pt) {
+	return rtp_payload_type_eq_exact(&m->audio_player->mp->coder.handler->dest_pt, pt);
+}
+
+// destroys the media's audio player (mix buffer, media player reference, the object itself) and clears m->audio_player; no-op if there is none
+void audio_player_free(struct call_media *m) {
+	struct audio_player *player = m->audio_player;
+	if (player) {
+		mix_buffer_destroy(&player->mb);
+		media_player_put(&player->mp);
+		g_slice_free1(sizeof(*player), player);
+		m->audio_player = NULL;
+	}
+}
+
+#endif
diff --git a/daemon/call.c b/daemon/call.c
index 2e74b1b99..4919a9491 100644
--- a/daemon/call.c
+++ b/daemon/call.c
@@ -50,6 +50,7 @@
 #include "mqtt.h"
 #include "janus.h"
 #include "dtmf.h"
+#include "audio_player.h"
 
 
 struct iterator_helper {
@@ -2815,6 +2816,7 @@ static void __update_init_subscribers(struct call_monologue *ml, GQueue *streams
 
 		recording_setup_media(media);
 		t38_gateway_start(media->t38_gateway);
+		audio_player_start(media);
 
 		if (mqtt_publish_scope() == MPS_MEDIA)
 			mqtt_timer_start(&media->mqtt_timer, media->call, media);
@@ -3657,6 +3659,7 @@ static void __call_cleanup(struct call *c) {
 		ice_shutdown(&md->ice_agent);
 		media_stop(md);
 		t38_gateway_put(&md->t38_gateway);
+		audio_player_free(md);
 	}
 
 	for (GList *l = c->monologues.head; l; l = l->next) {
@@ -4597,6 +4600,7 @@ int call_get_mono_dialogue(struct call_monologue *dialogue[2], struct call *call
 
 static void media_stop(struct call_media *m) {
 	t38_gateway_stop(m->t38_gateway);
+	audio_player_stop(m);
 	codec_handlers_stop(&m->codec_handlers_store);
 	rtcp_timer_stop(&m->rtcp_timer);
 	mqtt_timer_stop(&m->mqtt_timer);
diff --git a/daemon/call_interfaces.c b/daemon/call_interfaces.c
index c743aeef8..9a6b2522b 100644
--- a/daemon/call_interfaces.c
+++ b/daemon/call_interfaces.c
@@ -1079,6 +1079,14 @@ static void call_ng_flags_flags(struct sdp_ng_flags *out, str *s, void *dummy) {
 		case CSH_LOOKUP("no-passthrough"):
 			out->passthrough_off = 1;
 			break;
+		case CSH_LOOKUP("player"):
+		case CSH_LOOKUP("audio-player"):
+			out->audio_player = AP_TRANSCODING;
+			break;
+		case CSH_LOOKUP("no-player"):
+		case CSH_LOOKUP("no-audio-player"):
+			out->audio_player = AP_OFF;
+			break;
 		case CSH_LOOKUP("no-jitter-buffer"):
 			out->disable_jb = 1;
 			break;
@@ -1516,6 +1524,37 @@ static void call_ng_main_flags(struct sdp_ng_flags *out, str *key, bencode_item_
 							STR_FMT(&s));
 			}
 			break;
+		case CSH_LOOKUP("player"):
+		case CSH_LOOKUP("audio-player"):
+			switch (__csh_lookup(&s)) {
+				case CSH_LOOKUP("default"):
+					out->audio_player = AP_DEFAULT;
+					break;
+				case CSH_LOOKUP("on"):
+				case CSH_LOOKUP("yes"):
+				case CSH_LOOKUP("enable"):
+				case CSH_LOOKUP("enabled"):
+				case CSH_LOOKUP("transcode"):
+				case CSH_LOOKUP("transcoding"):
+					out->audio_player = AP_TRANSCODING;
+					break;
+				case CSH_LOOKUP("no"):
+				case CSH_LOOKUP("off"):
+				case CSH_LOOKUP("disable"):
+				case CSH_LOOKUP("disabled"):
+					out->audio_player = AP_OFF;
+					break;
+				case CSH_LOOKUP("force"):
+				case CSH_LOOKUP("forced"):
+				case CSH_LOOKUP("always"):
+				case CSH_LOOKUP("everything"):
+					out->audio_player = AP_FORCE;
+					break;
+				default:
+					ilog(LOG_WARN, "Unknown 'audio-player' flag encountered: '" STR_FORMAT "'",
+							STR_FMT(&s));
+			}
+			break;
 		case CSH_LOOKUP("transport protocol"):
 		case CSH_LOOKUP("transport-protocol"):
 			if (!str_cmp(&s, "accept"))
diff --git a/daemon/codec.c b/daemon/codec.c
index 91fd00257..947390520 100644
--- a/daemon/codec.c
+++ b/daemon/codec.c
@@ -17,6 +17,7 @@
 #include "timerthread.h"
 #include "log_funcs.h"
 #include "mqtt.h"
+#include "audio_player.h"
 #ifdef WITH_TRANSCODING
 #include "fix_frame_channel_layout.h"
 #endif
@@ -234,6 +235,7 @@ static codec_handler_func handler_func_dtmf;
 static codec_handler_func handler_func_t38;
 
 static struct ssrc_entry *__ssrc_handler_transcode_new(void *p);
+static struct ssrc_entry *__ssrc_handler_decode_new(void *p);
 static struct ssrc_entry *__ssrc_handler_new(void *p);
 static void __ssrc_handler_stop(void *p, void *dummy);
 static void __free_ssrc_handler(void *);
@@ -246,6 +248,7 @@ static int packet_decode(struct codec_ssrc_handler *, struct codec_ssrc_handler
 static int packet_encoded_rtp(encoder_t *enc, void *u1, void *u2);
 static int packet_decoded_fifo(decoder_t *decoder, AVFrame *frame, void *u1, void *u2);
 static int packet_decoded_direct(decoder_t *decoder, AVFrame *frame, void *u1, void *u2);
+static int packet_decoded_audio_player(decoder_t *decoder, AVFrame *frame, void *u1, void *u2);
 
 static void codec_touched(struct codec_store *cs, struct rtp_payload_type *pt);
 
@@ -513,6 +516,12 @@ static void __make_transcoder(struct codec_handler *handler, struct rtp_payload_
 	__make_transcoder_full(handler, dest, output_transcoders, dtmf_payload_type, pcm_dtmf_detect,
 			cn_payload_type, packet_decoded_fifo, __ssrc_handler_transcode_new);
 }
+static void __make_audio_player_decoder(struct codec_handler *handler, struct rtp_payload_type *dest,
+		bool pcm_dtmf_detect)
+{ // like __make_transcoder(), but with no output transcoders, no DTMF/CN payload types (-1), and decoded frames going to packet_decoded_audio_player
+	__make_transcoder_full(handler, dest, NULL, -1, pcm_dtmf_detect, -1, packet_decoded_audio_player,
+			__ssrc_handler_decode_new);
+}
 
 // used for generic playback (audio_player, t38_gateway)
 struct codec_handler *codec_handler_make_playback(const struct rtp_payload_type *src_pt,
@@ -543,6 +552,12 @@ struct codec_handler *codec_handler_make_media_player(const struct rtp_payload_t
 	struct codec_handler *h = codec_handler_make_playback(src_pt, dst_pt, last_ts, media, ssrc);
 	if (!h)
 		return NULL;
+	if (audio_player_is_active(media)) {
+		h->packet_decoded = packet_decoded_audio_player;
+		if (!audio_player_pt_match(media, dst_pt))
+			ilogs(codec, LOG_WARN, "Codec mismatch between audio player and media player (wanted: "
+					STR_FORMAT ")", STR_FMT(&dst_pt->encoding_with_params));
+	}
 	return h;
 }
 struct codec_handler *codec_handler_make_dummy(const struct rtp_payload_type *dst_pt, struct call_media *media)
@@ -794,6 +809,10 @@ static void __generator_stop(struct call_media *media) {
 		t38_gateway_put(&media->t38_gateway);
 	}
 }
+static void __generator_stop_all(struct call_media *media) { // __generator_stop() plus stopping the media's audio player
+	__generator_stop(media);
+	audio_player_stop(media);
+}
 
 static void __t38_options_from_flags(struct t38_options *t_opts, const struct sdp_ng_flags *flags) {
 #define t38_opt(name) t_opts->name = flags ? flags->t38_ ## name : 0
@@ -1016,8 +1035,8 @@ bool codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 	}
 	// everything else is unsupported: pass through
 	if (proto_is_not_rtp(receiver->protocol)) {
-		__generator_stop(receiver);
-		__generator_stop(sink);
+		__generator_stop_all(receiver);
+		__generator_stop_all(sink);
 		codec_handlers_stop(&receiver->codec_handlers_store);
 		return false;
 	}
@@ -1041,6 +1060,17 @@ bool codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 	receiver->dtmf_count = 0;
 	GSList *passthrough_handlers = NULL;
 
+	// default choice of audio player usage is based on whether it was in use previously,
+	// overridden by signalling flags, overridden by global option
+	bool use_audio_player = !!MEDIA_ISSET(sink, AUDIO_PLAYER);
+
+	if (flags && flags->audio_player == AP_FORCE)
+		use_audio_player = true;
+	else if (flags && flags->audio_player == AP_OFF)
+		use_audio_player = false;
+	else if (rtpe_config.use_audio_player == UAP_ALWAYS)
+		use_audio_player = true;
+
 	// first gather info about what we can send
 	AUTO_CLEANUP_NULL(GHashTable *supplemental_sinks, __g_hash_table_destroy);
 	struct rtp_payload_type *pref_dest_codec = NULL;
@@ -1068,7 +1098,7 @@ bool codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 		do_dtmf_detect = true;
 
 	// do we have to force everything through the transcoding engine even if codecs match?
-	bool force_transcoding = do_pcm_dtmf_blocking || do_dtmf_blocking;
+	bool force_transcoding = do_pcm_dtmf_blocking || do_dtmf_blocking || use_audio_player;
 	if (sink->monologue->inject_dtmf)
 		force_transcoding = true;
 
@@ -1160,6 +1190,7 @@ bool codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 				STR_FMT(&pt->encoding_with_params),
 				STR_FMT(&sink_pt->encoding_with_full_params), sink_pt->payload_type);
 
+sink_pt_fixed:;
 		// we have found a usable output codec. gather matching output supp codecs
 		struct rtp_payload_type *sink_dtmf_pt = __supp_payload_type(supplemental_sinks,
 				sink_pt->clock_rate, "telephone-event");
@@ -1290,7 +1321,27 @@ bool codec_handlers_update(struct call_media *receiver, struct call_media *sink,
 		__make_passthrough_gsl(handler, &passthrough_handlers, sink_dtmf_pt, sink_cn_pt);
 		goto next;
 
-transcode:;
+transcode:
+		// enable audio player if not explicitly disabled
+		if (rtpe_config.use_audio_player == UAP_TRANSCODING && (!flags || flags->audio_player != AP_OFF))
+			use_audio_player = true;
+		else if (flags && flags->audio_player == AP_TRANSCODING)
+			use_audio_player = true;
+
+		if (use_audio_player) {
+			// when using the audio player, everything must decode to the same
+			// format that is appropriate for the audio player
+			if (sink_pt != pref_dest_codec && pref_dest_codec) {
+				ilogs(codec, LOG_DEBUG, "Switching sink codec for " STR_FORMAT " to "
+						STR_FORMAT " (%i) due to usage of audio player",
+				STR_FMT(&pt->encoding_with_params),
+				STR_FMT(&pref_dest_codec->encoding_with_full_params),
+				pref_dest_codec->payload_type);
+				sink_pt = pref_dest_codec;
+				force_transcoding = true;
+				goto sink_pt_fixed;
+			}
+		}
 		// look up the reverse side of this payload type, which is the decoder to our
 		// encoder. if any codec options such as bitrate were set during an offer,
 		// they're in the decoder PT. copy them to the encoder PT.
@@ -1305,9 +1356,12 @@ transcode:;
 			}
 		}
 		is_transcoding = true;
-		__make_transcoder(handler, sink_pt, output_transcoders,
-				sink_dtmf_pt ? sink_dtmf_pt->payload_type : -1,
-				pcm_dtmf_detect, sink_cn_pt ? sink_cn_pt->payload_type : -1);
+		if (!use_audio_player)
+			__make_transcoder(handler, sink_pt, output_transcoders,
+					sink_dtmf_pt ? sink_dtmf_pt->payload_type : -1,
+					pcm_dtmf_detect, sink_cn_pt ? sink_cn_pt->payload_type : -1);
+		else
+			__make_audio_player_decoder(handler, sink_pt, pcm_dtmf_detect);
 		// for DTMF delay: we pretend that there is no output DTMF payload type (sink_dtmf_pt == NULL)
 		// so that DTMF is converted to audio (so it can be replaced with silence). we still want
 		// to output DTMF event packets when we can though, so we need to remember the DTMF payload
@@ -1319,37 +1373,66 @@ next:
 		l = l->next;
 	}
 
+	if (!use_audio_player) {
+		MEDIA_CLEAR(sink, AUDIO_PLAYER);
+		audio_player_stop(sink);
+	}
+	else
+		MEDIA_SET(sink, AUDIO_PLAYER);
+
 	if (is_transcoding) {
-		// we have to translate RTCP packets
-		receiver->rtcp_handler = rtcp_transcode_handler;
+		MEDIA_SET(receiver, TRANSCODE);
 
-		for (GList *l = receiver->codecs.codec_prefs.head; l; ) {
-			struct rtp_payload_type *pt = l->data;
+		if (!use_audio_player) {
+			// we have to translate RTCP packets
+			receiver->rtcp_handler = rtcp_transcode_handler;
 
-			if (pt->codec_def) {
-				// supported
-				l = l->next;
-				continue;
+			for (GList *l = receiver->codecs.codec_prefs.head; l; ) {
+				struct rtp_payload_type *pt = l->data;
+
+				if (pt->codec_def) {
+					// supported
+					l = l->next;
+					continue;
+				}
+
+				ilogs(codec, LOG_DEBUG, "Stripping unsupported codec " STR_FORMAT
+						" due to active transcoding",
+						STR_FMT(&pt->encoding));
+				codec_touched(&receiver->codecs, pt);
+				l = __codec_store_delete_link(l, &receiver->codecs);
 			}
 
-			ilogs(codec, LOG_DEBUG, "Stripping unsupported codec " STR_FORMAT
-					" due to active transcoding",
-					STR_FMT(&pt->encoding));
-			codec_touched(&receiver->codecs, pt);
-			l = __codec_store_delete_link(l, &receiver->codecs);
+
+			// at least some payload types will be transcoded, which will result in SSRC
+			// change. for payload types which we don't actually transcode, we still
+			// must substitute the SSRC
+			while (passthrough_handlers) {
+				struct codec_handler *handler = passthrough_handlers->data;
+				__make_passthrough_ssrc(handler);
+				passthrough_handlers = g_slist_delete_link(passthrough_handlers,
+						passthrough_handlers);
+
+			}
 		}
+		else {
+			receiver->rtcp_handler = rtcp_sink_handler;
+			MEDIA_CLEAR(receiver, RTCP_GEN);
 
+			// change all passthrough handlers also to transcoders
+			while (passthrough_handlers) {
+				struct codec_handler *handler = passthrough_handlers->data;
+				__make_audio_player_decoder(handler, pref_dest_codec, false);
+				passthrough_handlers = g_slist_delete_link(passthrough_handlers,
+						passthrough_handlers);
 
-		// at least some payload types will be transcoded, which will result in SSRC
-		// change. for payload types which we don't actually transcode, we still
-		// must substitute the SSRC
-		while (passthrough_handlers) {
-			struct codec_handler *handler = passthrough_handlers->data;
-			__make_passthrough_ssrc(handler);
-			passthrough_handlers = g_slist_delete_link(passthrough_handlers, passthrough_handlers);
+			}
 
+			audio_player_setup(sink, pref_dest_codec, rtpe_config.audio_buffer_length,
+					rtpe_config.audio_buffer_delay);
 		}
 	}
+
 	g_slist_free(passthrough_handlers);
 
 	if (MEDIA_ISSET(receiver, RTCP_GEN)) {
@@ -1361,9 +1444,6 @@ next:
 		__codec_rtcp_timer(sink);
 	}
 
-	if (is_transcoding)
-		MEDIA_SET(receiver, TRANSCODE);
-
 	return is_transcoding;
 }
 
@@ -3537,6 +3617,32 @@ static struct ssrc_entry *__ssrc_handler_transcode_new(void *p) {
 
 	return &ch->h;
 
+err:
+	obj_put(&ch->h);
+	return NULL;
+}
+static struct ssrc_entry *__ssrc_handler_decode_new(void *p) {
+	struct codec_handler *h = p;
+
+	ilogs(codec, LOG_DEBUG, "Creating SSRC decoder for %s/%u/%i",
+			h->source_pt.codec_def->rtpname, h->source_pt.clock_rate,
+			h->source_pt.channels);
+
+	struct codec_ssrc_handler *ch = obj_alloc0("codec_ssrc_handler", sizeof(*ch), __free_ssrc_handler);
+	ch->handler = h;
+	ch->ptime = h->dest_pt.ptime;
+
+	format_t dest_format = {
+		.clockrate = h->dest_pt.clock_rate,
+		.channels = h->dest_pt.channels,
+		.format = AV_SAMPLE_FMT_S16,
+	};
+
+	if (!__ssrc_handler_decode_common(ch, h, &dest_format))
+		goto err;
+
+	return &ch->h;
+
 err:
 	obj_put(&ch->h);
 	return NULL;
@@ -3794,6 +3900,24 @@ static int packet_decoded_fifo(decoder_t *decoder, AVFrame *frame, void *u1, voi
 static int packet_decoded_direct(decoder_t *decoder, AVFrame *frame, void *u1, void *u2) {
 	return packet_decoded_common(decoder, frame, u1, u2, encoder_input_data);
 }
+static int packet_decoded_audio_player(decoder_t *decoder, AVFrame *frame, void *u1, void *u2) { // decoder callback: adds the decoded frame to the audio player of mp->media_out
+	struct codec_ssrc_handler *ch = u1; // u1 carries the SSRC handler, u2 the media packet
+	struct media_packet *mp = u2;
+
+	ilogs(transcoding, LOG_DEBUG, "RTP media decoded for audio player: TS %llu, samples %u",
+			(unsigned long long) frame->pts, frame->nb_samples);
+
+	struct call_media *m = mp->media_out;
+	if (!m || !m->audio_player) {
+		// discard XXX log?
+		return 0; // NOTE(review): `frame` is not released on this path - confirm who owns it
+	}
+
+	audio_player_add_frame(m->audio_player, ch->h.ssrc, frame); // mixed in under this handler's SSRC
+	// XXX error checking/reporting
+
+	return 0; // always 0, also when the frame was discarded
+}
 
 static int __rtp_decode(struct codec_ssrc_handler *ch, struct codec_ssrc_handler *input_ch,
 		struct transcode_packet *packet, struct media_packet *mp)
diff --git a/daemon/main.c b/daemon/main.c
index 05ac50718..b3192a180 100644
--- a/daemon/main.c
+++ b/daemon/main.c
@@ -92,6 +92,8 @@ struct rtpengine_config rtpe_config = {
 	.dtx_shift = 5,
 	.dtx_buffer = 10,
 	.dtx_lag = 100,
+	.audio_buffer_delay = 5,
+	.audio_buffer_length = 500,
 	.mqtt_port = 1883,
 	.mqtt_keepalive = 30,
 	.mqtt_publish_interval = 5000,
@@ -448,6 +450,7 @@ static void options(int *argc, char ***argv) {
 #endif
 	AUTO_CLEANUP_GBUF(mos);
 	AUTO_CLEANUP_GBUF(dcc);
+	AUTO_CLEANUP_GBUF(use_audio_player);
 
 	rwlock_lock_w(&rtpe_config.config_lock);
 
@@ -555,6 +558,9 @@ static void options(int *argc, char ***argv) {
 		{ "silence-detect",0,0,	G_OPTION_ARG_DOUBLE,	&silence_detect,	"Audio level threshold in percent for silence detection","FLOAT"},
 		{ "cn-payload",0,0,	G_OPTION_ARG_STRING_ARRAY,&cn_payload,		"Comfort noise parameters to replace silence with","INT INT INT ..."},
 		{ "player-cache",0,0,	G_OPTION_ARG_NONE,	&rtpe_config.player_cache,"Cache media files for playback in memory",NULL},
+		{ "audio-buffer-length",0,0,	G_OPTION_ARG_INT,&rtpe_config.audio_buffer_length,"Length in milliseconds of audio buffer","INT"},
+		{ "audio-buffer-delay",0,0,	G_OPTION_ARG_INT,&rtpe_config.audio_buffer_delay,"Initial delay in milliseconds for buffered audio","INT"},
+		{ "audio-player",0,0,	G_OPTION_ARG_STRING,	&use_audio_player,	"When to enable the internal audio player","on-demand|play-media|transcoding|always"},
 #endif
 #ifdef HAVE_MQTT
 		{ "mqtt-host",0,0,	G_OPTION_ARG_STRING,	&rtpe_config.mqtt_host,	"Mosquitto broker host or address",	"HOST|IP"},
@@ -832,6 +838,30 @@ static void options(int *argc, char ***argv) {
 			die("Invalid --amr-dtx ('%s')", amr_dtx);
 	}
 
+	if (use_audio_player) {
+		if (!strcasecmp(use_audio_player, "on-demand")
+				|| !strcasecmp(use_audio_player, "on demand")
+				|| !strcasecmp(use_audio_player, "off")
+				|| !strcasecmp(use_audio_player, "no")
+				|| !strcasecmp(use_audio_player, "never"))
+			rtpe_config.use_audio_player = UAP_ON_DEMAND;
+		else if (!strcasecmp(use_audio_player, "play-media")
+				|| !strcasecmp(use_audio_player, "play media")
+				|| !strcasecmp(use_audio_player, "media player")
+				|| !strcasecmp(use_audio_player, "media-player"))
+			rtpe_config.use_audio_player = UAP_PLAY_MEDIA;
+		else if (!strcasecmp(use_audio_player, "transcoding")
+				|| !strcasecmp(use_audio_player, "transcode"))
+			rtpe_config.use_audio_player = UAP_TRANSCODING;
+		else if (!strcasecmp(use_audio_player, "always")
+				|| !strcasecmp(use_audio_player, "everything")
+				|| !strcasecmp(use_audio_player, "force")
+				|| !strcasecmp(use_audio_player, "forced"))
+			rtpe_config.use_audio_player = UAP_ALWAYS;
+		else
+			die("Invalid --audio-player option ('%s')", use_audio_player);
+	}
+
 	if (!rtpe_config.software_id)
 		rtpe_config.software_id = g_strdup_printf("rtpengine-%s", RTPENGINE_VERSION);
 	g_strcanon(rtpe_config.software_id, "QWERTYUIOPASDFGHJKLZXCVBNMqwertyuiopasdfghjklzxcvbnm1234567890-", '-');
diff --git a/daemon/rtpengine.pod b/daemon/rtpengine.pod
index 34e281fc0..3c576b82a 100644
--- a/daemon/rtpengine.pod
+++ b/daemon/rtpengine.pod
@@ -952,6 +952,65 @@ option enabled.
 
 RTP data is cached and retained in memory for the lifetime of the process.
 
+=item B<--audio-buffer-length=>I<INT>
+
+Set the buffer length used by the audio player (see below) in milliseconds. The
+default is 500 milliseconds.
+
+The buffer must be long enough to accommodate at least two frames of audio from
+all contributing sources, which means at least 40 ms or 60 ms for most cases.
+If media playback (via the B<play media> command) is desired, then the buffer
+must be able to accommodate at least one full frame from the source media file,
+whose length can vary depending on the format of the source media file. For 8
+kHz B<.wav> files this is 256 ms (2048 samples). Therefore 500 ms is the
+recommended value.
+
+=item B<--audio-buffer-delay=>I<INT>
+
+Initial delay for new sources contributing to an audio buffer (used by the
+audio player, see below) in milliseconds. The default is 5 ms.
+
+The initial delay is meant to compensate for varying inter-arrival times of
+media packets (jitter). If set too low, intermittent high jitter will result in
+gaps in the output audio. If set too high, output audio will have an
+unnecessary latency added to it.
+
+=item B<--audio-player=>B<on-demand>|B<play-media>|B<transcoding>|B<always>
+
+Define when to enable the audio player if not explicitly instructed otherwise.
+The default setting is B<on-demand>.
+
+Enabling the audio player for a party to a call makes B<rtpengine> produce its
+own audio RTP stream (instead of just forwarding an audio stream received from
+elsewhere). The audio is generated from a circular audio buffer (see above) and
+all contributing audio sources are mixed into that one audio buffer.
+Contributing audio sources are audio streams received from elsewhere (that
+would otherwise simply be forwarded) and audio produced by the B<play media>
+command.
+
+With this set to B<on-demand>, the audio player is enabled only if explicitly
+requested by the user for a particular call via the B<audio-player=> option
+used in a signalling message.
+
+When set to B<play-media>, the audio player is enabled only while media
+playback via the B<play media> command is active. After media playback is
+finished, the audio player is again disabled and audio goes back to simply
+being forwarded.
+
+Setting this option to B<transcoding> leaves the audio player disabled unless
+any sort of transcoding is required for a call.
+
+With a setting of B<always>, the audio player is enabled for all calls, unless
+explicitly disabled via the B<audio-player=> option used in a signalling
+message. This forces all audio through the transcoding engine, even if input
+and output codecs are the same.
+
+Audio player usage can be changed on a call-by-call basis by including the
+B<audio-player=> option in a signalling message. This option supports the
+values B<transcoding> and B<always>, which result in the behaviour described
+just above, and B<off> which forces the audio player to be disabled regardless
+of this setting.
+
 =item B<--poller-per-thread>
 
 Enable 'poller per thread' functionality: for every worker thread (see the
diff --git a/docs/ng_control_protocol.md b/docs/ng_control_protocol.md
index 8eb153730..712327784 100644
--- a/docs/ng_control_protocol.md
+++ b/docs/ng_control_protocol.md
@@ -119,6 +119,16 @@ Optionally included keys are:
 	body. The default is to auto-detect the address family if possible (if the receiving end is known
 	already) or otherwise to leave it unchanged.
 
+* `audio-player`
+
+    Contains a string value of either `default`, `transcoding`, `off`, or `always`.
+
+    The values `transcoding` and `always` result in the behaviour described
+    under the `audio-player` config option in the manual, and override the
+    global setting from the config file. The value `off` disables usage of the
+    audio player regardless of the global config setting. The option `default`
+    results in the behaviour mandated by the global config setting.
+
 * `delay-buffer`
 
 	Takes an integer as value. When set to non-zero, enables the delay
diff --git a/include/audio_player.h b/include/audio_player.h
new file mode 100644
index 000000000..af8bdfc08
--- /dev/null
+++ b/include/audio_player.h
@@ -0,0 +1,42 @@
+#ifndef _AUDIO_PLAYER_H_
+#define _AUDIO_PLAYER_H_
+
+#ifdef WITH_TRANSCODING
+
+#include <stdbool.h>
+#include <libavutil/frame.h>
+#include <stdint.h>
+
+
+/*
+ * Similar to the existing media_player, but instead of simply producing
+ * its own standalone output media stream, the audio_player takes over the
+ * entire media stream flowing to the receiver, including media forwarded
+ * from the opposite side of the call, as well as media produced by the
+ * media_player.
+ */
+
+struct audio_player;
+struct call_media;
+struct rtp_payload_type;
+
+bool audio_player_setup(struct call_media *, const struct rtp_payload_type *,
+		unsigned int size_ms, unsigned int delay_ms);
+void audio_player_free(struct call_media *);
+
+void audio_player_start(struct call_media *);
+void audio_player_stop(struct call_media *);
+bool audio_player_is_active(struct call_media *);
+bool audio_player_pt_match(struct call_media *, const struct rtp_payload_type *);
+
+void audio_player_add_frame(struct audio_player *, uint32_t ssrc, AVFrame *);
+
+#else
+
+INLINE void audio_player_start(struct call_media *m) { } // no-op stub used when WITH_TRANSCODING is not defined
+INLINE void audio_player_free(struct call_media *m) { } // no-op stub used when WITH_TRANSCODING is not defined
+INLINE void audio_player_stop(struct call_media *m) { } // no-op stub used when WITH_TRANSCODING is not defined
+
+#endif
+
+#endif
diff --git a/include/call.h b/include/call.h
index e5382f6ab..65cdf3d24 100644
--- a/include/call.h
+++ b/include/call.h
@@ -188,6 +188,7 @@ enum {
 #define MEDIA_FLAG_ECHO				0x10000000
 #define MEDIA_FLAG_BLACKHOLE			0x20000000
 #define MEDIA_FLAG_REORDER_FORCED		0x40000000
+#define MEDIA_FLAG_AUDIO_PLAYER			0x80000000
 #define MEDIA_FLAG_LEGACY_OSRTP			SHARED_FLAG_LEGACY_OSRTP
 #define MEDIA_FLAG_LEGACY_OSRTP_REV		SHARED_FLAG_LEGACY_OSRTP_REV
 
@@ -253,6 +254,7 @@ struct codec_tracker;
 struct rtcp_timer;
 struct mqtt_timer;
 struct janus_session;
+struct audio_player;
 
 
 typedef bencode_buffer_t call_buffer_t;
@@ -433,6 +435,7 @@ struct call_media {
 	struct mqtt_timer	*mqtt_timer;			/* master lock for scheduling purposes */
 	//struct codec_handler	*dtmf_injector;
 	struct t38_gateway	*t38_gateway;
+	struct audio_player	*audio_player;
 	struct codec_handler	*t38_handler;
 
 	unsigned int		buffer_delay;
diff --git a/include/call_interfaces.h b/include/call_interfaces.h
index c90325896..083bf8186 100644
--- a/include/call_interfaces.h
+++ b/include/call_interfaces.h
@@ -109,6 +109,12 @@ struct sdp_ng_flags {
 	long long duration;
 	long long pause;
 	long long start_pos;
+	enum {
+		AP_DEFAULT = 0,
+		AP_OFF,
+		AP_TRANSCODING,
+		AP_FORCE,
+	} audio_player:2;
 	unsigned int asymmetric:1,
 	             protocol_accept:1,
 	             no_redis_update:1,
diff --git a/include/main.h b/include/main.h
index 40e5592aa..e836f91c9 100644
--- a/include/main.h
+++ b/include/main.h
@@ -132,6 +132,14 @@ struct rtpengine_config {
 	uint32_t		silence_detect_int;
 	str			cn_payload;
 	int			player_cache;
+	int			audio_buffer_length;
+	int			audio_buffer_delay;
+	enum {
+		UAP_ON_DEMAND = 0,
+		UAP_PLAY_MEDIA,
+		UAP_TRANSCODING,
+		UAP_ALWAYS,
+	}			use_audio_player;
 	char			*software_id;
 	int			poller_per_thread;
 	char			*mqtt_host;
diff --git a/t/.gitignore b/t/.gitignore
index d0ff95e3d..c7283ca96 100644
--- a/t/.gitignore
+++ b/t/.gitignore
@@ -78,3 +78,4 @@ mvr2s_x64_avx2.S
 mvr2s_x64_avx512.S
 test-mix-buffer
 mix_buffer.c
+audio_player.c
diff --git a/t/Makefile b/t/Makefile
index 1d8bfa56f..36caba389 100644
--- a/t/Makefile
+++ b/t/Makefile
@@ -78,7 +78,8 @@ LIBSRCS+=	codeclib.strhash.c resample.c socket.c streambuf.c dtmflib.c
 DAEMONSRCS+=	codec.c call.c ice.c kernel.c media_socket.c stun.c bencode.c poller.c \
 		dtls.c recording.c statistics.c rtcp.c redis.c iptables.c graphite.c \
 		cookie_cache.c udp_listener.c homer.c load.c cdr.c dtmf.c timerthread.c \
-		media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c
+		media_player.c jitter_buffer.c t38.c tcp_listener.c mqtt.c websocket.c cli.c \
+		audio_player.c
 HASHSRCS+=	call_interfaces.c control_ng.c sdp.c janus.c
 LIBASM=		mvr2s_x64_avx2.S mvr2s_x64_avx512.S
 endif
@@ -267,7 +268,7 @@ test-stats:	test-stats.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssr
 	control_ng.strhash.o graphite.o \
 	streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
 	media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o \
-	websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o
+	websocket.o cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o
 
 test-transcode:	test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o codec.o ssrc.o call.o ice.o aux.o \
 	kernel.o media_socket.o stun.o bencode.o socket.o poller.o dtls.o recording.o statistics.o \
@@ -275,7 +276,7 @@ test-transcode:	test-transcode.o $(COMMONOBJS) codeclib.strhash.o resample.o cod
 	control_ng.strhash.o \
 	streambuf.o cookie_cache.o udp_listener.o homer.o load.o cdr.o dtmf.o timerthread.o \
 	media_player.o jitter_buffer.o dtmflib.o t38.o tcp_listener.o mqtt.o janus.strhash.o websocket.o \
-	cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o
+	cli.o mvr2s_x64_avx2.o mvr2s_x64_avx512.o audio_player.o mix_buffer.o
 
 test-resample:	test-resample.o $(COMMONOBJS) codeclib.strhash.o resample.o dtmflib.o mvr2s_x64_avx2.o \
 	mvr2s_x64_avx512.o
diff --git a/utils/rtpengine-ng-client b/utils/rtpengine-ng-client
index db16a3adc..70a249044 100755
--- a/utils/rtpengine-ng-client
+++ b/utils/rtpengine-ng-client
@@ -84,6 +84,7 @@ my @string_opts = qw(
 	frequency
 	blob
 	sdp
+	audio-player
 );
 
 my @int_opts = qw(