|
|
@ -24,6 +24,8 @@ |
|
|
|
*/ |
|
|
|
*/ |
|
|
|
|
|
|
|
|
|
|
|
#include <flite/flite.h> |
|
|
|
#include <flite/flite.h> |
|
|
|
|
|
|
|
#include "libavutil/audio_fifo.h" |
|
|
|
|
|
|
|
#include "libavutil/avstring.h" |
|
|
|
#include "libavutil/channel_layout.h" |
|
|
|
#include "libavutil/channel_layout.h" |
|
|
|
#include "libavutil/file.h" |
|
|
|
#include "libavutil/file.h" |
|
|
|
#include "libavutil/opt.h" |
|
|
|
#include "libavutil/opt.h" |
|
|
@ -39,11 +41,14 @@ typedef struct FliteContext { |
|
|
|
char *voice_str; |
|
|
|
char *voice_str; |
|
|
|
char *textfile; |
|
|
|
char *textfile; |
|
|
|
char *text; |
|
|
|
char *text; |
|
|
|
cst_wave *wave; |
|
|
|
char *text_p; |
|
|
|
int16_t *wave_samples; |
|
|
|
char *text_saveptr; |
|
|
|
int wave_nb_samples; |
|
|
|
int nb_channels; |
|
|
|
|
|
|
|
int sample_rate; |
|
|
|
|
|
|
|
AVAudioFifo *fifo; |
|
|
|
int list_voices; |
|
|
|
int list_voices; |
|
|
|
cst_voice *voice; |
|
|
|
cst_voice *voice; |
|
|
|
|
|
|
|
cst_audio_streaming_info *asi; |
|
|
|
struct voice_entry *voice_entry; |
|
|
|
struct voice_entry *voice_entry; |
|
|
|
int64_t pts; |
|
|
|
int64_t pts; |
|
|
|
int frame_nb_samples; ///< number of samples per frame
|
|
|
|
int frame_nb_samples; ///< number of samples per frame
|
|
|
@ -140,10 +145,30 @@ static int select_voice(struct voice_entry **entry_ret, const char *voice_name, |
|
|
|
return AVERROR(EINVAL); |
|
|
|
return AVERROR(EINVAL); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int audio_stream_chunk_by_word(const cst_wave *wave, int start, int size, |
|
|
|
|
|
|
|
int last, cst_audio_streaming_info *asi) |
|
|
|
|
|
|
|
{ |
|
|
|
|
|
|
|
FliteContext *flite = asi->userdata; |
|
|
|
|
|
|
|
void *const ptr[8] = { &wave->samples[start] }; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
flite->nb_channels = wave->num_channels; |
|
|
|
|
|
|
|
flite->sample_rate = wave->sample_rate; |
|
|
|
|
|
|
|
if (!flite->fifo) { |
|
|
|
|
|
|
|
flite->fifo = av_audio_fifo_alloc(AV_SAMPLE_FMT_S16, flite->nb_channels, size); |
|
|
|
|
|
|
|
if (!flite->fifo) |
|
|
|
|
|
|
|
return CST_AUDIO_STREAM_STOP; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
av_audio_fifo_write(flite->fifo, ptr, size); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return CST_AUDIO_STREAM_CONT; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static av_cold int init(AVFilterContext *ctx) |
|
|
|
static av_cold int init(AVFilterContext *ctx) |
|
|
|
{ |
|
|
|
{ |
|
|
|
FliteContext *flite = ctx->priv; |
|
|
|
FliteContext *flite = ctx->priv; |
|
|
|
int ret = 0; |
|
|
|
int ret = 0; |
|
|
|
|
|
|
|
char *text; |
|
|
|
|
|
|
|
|
|
|
|
if (flite->list_voices) { |
|
|
|
if (flite->list_voices) { |
|
|
|
list_voices(ctx, "\n"); |
|
|
|
list_voices(ctx, "\n"); |
|
|
@ -197,10 +222,21 @@ static av_cold int init(AVFilterContext *ctx) |
|
|
|
return AVERROR(EINVAL); |
|
|
|
return AVERROR(EINVAL); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
/* synth all the file data in block */ |
|
|
|
flite->asi = new_audio_streaming_info(); |
|
|
|
flite->wave = flite_text_to_wave(flite->text, flite->voice); |
|
|
|
if (!flite->asi) |
|
|
|
flite->wave_samples = flite->wave->samples; |
|
|
|
return AVERROR_BUG; |
|
|
|
flite->wave_nb_samples = flite->wave->num_samples; |
|
|
|
|
|
|
|
|
|
|
|
flite->asi->asc = audio_stream_chunk_by_word; |
|
|
|
|
|
|
|
flite->asi->userdata = flite; |
|
|
|
|
|
|
|
feat_set(flite->voice->features, "streaming_info", audio_streaming_info_val(flite->asi)); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
flite->text_p = flite->text; |
|
|
|
|
|
|
|
if (!(text = av_strtok(flite->text_p, "\n", &flite->text_saveptr))) |
|
|
|
|
|
|
|
return AVERROR(EINVAL); |
|
|
|
|
|
|
|
flite->text_p = NULL; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
flite_text_to_speech(text, flite->voice, "none"); |
|
|
|
|
|
|
|
|
|
|
|
return 0; |
|
|
|
return 0; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -216,8 +252,7 @@ static av_cold void uninit(AVFilterContext *ctx) |
|
|
|
} |
|
|
|
} |
|
|
|
pthread_mutex_unlock(&flite_mutex); |
|
|
|
pthread_mutex_unlock(&flite_mutex); |
|
|
|
} |
|
|
|
} |
|
|
|
delete_wave(flite->wave); |
|
|
|
av_audio_fifo_free(flite->fifo); |
|
|
|
flite->wave = NULL; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
static int query_formats(AVFilterContext *ctx) |
|
|
|
static int query_formats(AVFilterContext *ctx) |
|
|
@ -230,13 +265,13 @@ static int query_formats(AVFilterContext *ctx) |
|
|
|
AVFilterFormats *sample_rates = NULL; |
|
|
|
AVFilterFormats *sample_rates = NULL; |
|
|
|
AVChannelLayout chlayout = { 0 }; |
|
|
|
AVChannelLayout chlayout = { 0 }; |
|
|
|
|
|
|
|
|
|
|
|
av_channel_layout_default(&chlayout, flite->wave->num_channels); |
|
|
|
av_channel_layout_default(&chlayout, flite->nb_channels); |
|
|
|
|
|
|
|
|
|
|
|
if ((ret = ff_add_channel_layout (&chlayouts , &chlayout )) < 0 || |
|
|
|
if ((ret = ff_add_channel_layout (&chlayouts , &chlayout )) < 0 || |
|
|
|
(ret = ff_set_common_channel_layouts (ctx , chlayouts )) < 0 || |
|
|
|
(ret = ff_set_common_channel_layouts (ctx , chlayouts )) < 0 || |
|
|
|
(ret = ff_add_format (&sample_formats, AV_SAMPLE_FMT_S16 )) < 0 || |
|
|
|
(ret = ff_add_format (&sample_formats, AV_SAMPLE_FMT_S16 )) < 0 || |
|
|
|
(ret = ff_set_common_formats (ctx , sample_formats )) < 0 || |
|
|
|
(ret = ff_set_common_formats (ctx , sample_formats )) < 0 || |
|
|
|
(ret = ff_add_format (&sample_rates , flite->wave->sample_rate)) < 0 || |
|
|
|
(ret = ff_add_format (&sample_rates , flite->sample_rate )) < 0 || |
|
|
|
(ret = ff_set_common_samplerates (ctx , sample_rates )) < 0) |
|
|
|
(ret = ff_set_common_samplerates (ctx , sample_rates )) < 0) |
|
|
|
return ret; |
|
|
|
return ret; |
|
|
|
|
|
|
|
|
|
|
@ -248,12 +283,13 @@ static int config_props(AVFilterLink *outlink) |
|
|
|
AVFilterContext *ctx = outlink->src; |
|
|
|
AVFilterContext *ctx = outlink->src; |
|
|
|
FliteContext *flite = ctx->priv; |
|
|
|
FliteContext *flite = ctx->priv; |
|
|
|
|
|
|
|
|
|
|
|
outlink->sample_rate = flite->wave->sample_rate; |
|
|
|
outlink->sample_rate = flite->sample_rate; |
|
|
|
outlink->time_base = (AVRational){1, flite->wave->sample_rate}; |
|
|
|
outlink->time_base = (AVRational){1, flite->sample_rate}; |
|
|
|
|
|
|
|
|
|
|
|
av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n", |
|
|
|
av_log(ctx, AV_LOG_VERBOSE, "voice:%s fmt:%s sample_rate:%d\n", |
|
|
|
flite->voice_str, |
|
|
|
flite->voice_str, |
|
|
|
av_get_sample_fmt_name(outlink->format), outlink->sample_rate); |
|
|
|
av_get_sample_fmt_name(outlink->format), outlink->sample_rate); |
|
|
|
|
|
|
|
|
|
|
|
return 0; |
|
|
|
return 0; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -261,28 +297,36 @@ static int activate(AVFilterContext *ctx) |
|
|
|
{ |
|
|
|
{ |
|
|
|
AVFilterLink *outlink = ctx->outputs[0]; |
|
|
|
AVFilterLink *outlink = ctx->outputs[0]; |
|
|
|
FliteContext *flite = ctx->priv; |
|
|
|
FliteContext *flite = ctx->priv; |
|
|
|
int nb_samples = FFMIN(flite->wave_nb_samples, flite->frame_nb_samples); |
|
|
|
|
|
|
|
AVFrame *samplesref; |
|
|
|
AVFrame *samplesref; |
|
|
|
|
|
|
|
int nb_samples; |
|
|
|
|
|
|
|
|
|
|
|
if (!ff_outlink_frame_wanted(outlink)) |
|
|
|
if (!ff_outlink_frame_wanted(outlink)) |
|
|
|
return FFERROR_NOT_READY; |
|
|
|
return FFERROR_NOT_READY; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
nb_samples = FFMIN(av_audio_fifo_size(flite->fifo), flite->frame_nb_samples); |
|
|
|
if (!nb_samples) { |
|
|
|
if (!nb_samples) { |
|
|
|
|
|
|
|
char *text; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!(text = av_strtok(flite->text_p, "\n", &flite->text_saveptr))) { |
|
|
|
ff_outlink_set_status(outlink, AVERROR_EOF, flite->pts); |
|
|
|
ff_outlink_set_status(outlink, AVERROR_EOF, flite->pts); |
|
|
|
return 0; |
|
|
|
return 0; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
flite_text_to_speech(text, flite->voice, "none"); |
|
|
|
|
|
|
|
ff_filter_set_ready(ctx, 100); |
|
|
|
|
|
|
|
return 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
samplesref = ff_get_audio_buffer(outlink, nb_samples); |
|
|
|
samplesref = ff_get_audio_buffer(outlink, nb_samples); |
|
|
|
if (!samplesref) |
|
|
|
if (!samplesref) |
|
|
|
return AVERROR(ENOMEM); |
|
|
|
return AVERROR(ENOMEM); |
|
|
|
|
|
|
|
|
|
|
|
memcpy(samplesref->data[0], flite->wave_samples, |
|
|
|
av_audio_fifo_read(flite->fifo, (void **)samplesref->extended_data, |
|
|
|
nb_samples * flite->wave->num_channels * 2); |
|
|
|
nb_samples); |
|
|
|
|
|
|
|
|
|
|
|
samplesref->pts = flite->pts; |
|
|
|
samplesref->pts = flite->pts; |
|
|
|
samplesref->sample_rate = flite->wave->sample_rate; |
|
|
|
samplesref->sample_rate = flite->sample_rate; |
|
|
|
flite->pts += nb_samples; |
|
|
|
flite->pts += nb_samples; |
|
|
|
flite->wave_samples += nb_samples * flite->wave->num_channels; |
|
|
|
|
|
|
|
flite->wave_nb_samples -= nb_samples; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return ff_filter_frame(outlink, samplesref); |
|
|
|
return ff_filter_frame(outlink, samplesref); |
|
|
|
} |
|
|
|
} |
|
|
|