|
|
|
@@ -30,6 +30,35 @@ option java_outer_classname = "AudioConfigProto";
 option java_package = "com.google.cloud.dialogflow.v2beta1";
 option objc_class_prefix = "DF";
 
+// Hints for the speech recognizer to help with recognition in a specific
+// conversation state.
+message SpeechContext {
+  // Optional. A list of strings containing words and phrases that the speech
+  // recognizer should recognize with higher likelihood.
+  //
+  // This list can be used to:
+  // * improve accuracy for words and phrases you expect the user to say,
+  //   e.g. typical commands for your Dialogflow agent
+  // * add additional words to the speech recognizer vocabulary
+  // * ...
+  //
+  // See the [Cloud Speech
+  // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
+  // limits.
+  repeated string phrases = 1;
+
+  // Optional. Boost for this context compared to other contexts:
+  //
+  // * If the boost is positive, Dialogflow will increase the probability that
+  //   the phrases in this context are recognized over similar sounding phrases.
+  // * If the boost is unspecified or non-positive, Dialogflow will not apply
+  //   any boost.
+  //
+  // Dialogflow recommends that you use boosts in the range (0, 20] and that you
+  // find a value that fits your use case with binary search.
+  float boost = 2;
+}
+
 // Audio encoding of the audio content sent in the conversational query request.
 // Refer to the
 // [Cloud Speech API
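As a rough illustration of the SpeechContext fields added above, the sketch below builds one with the generated Python client; it assumes google.cloud.dialogflow_v2beta1 exposes the message under the same name, and the phrase list and boost value are placeholders, not recommendations.

    from google.cloud import dialogflow_v2beta1

    # Phrases the recognizer should favor, with a boost kept inside the
    # recommended (0, 20] range from the field comment above.
    speech_context = dialogflow_v2beta1.SpeechContext(
        phrases=["book a room", "check out tomorrow"],
        boost=10.0,
    )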
|
|
|
@@ -79,33 +108,29 @@ enum AudioEncoding {
   AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
 }
 
-// Hints for the speech recognizer to help with recognition in a specific
-// conversation state.
-message SpeechContext {
-  // Optional. A list of strings containing words and phrases that the speech
-  // recognizer should recognize with higher likelihood.
-  //
-  // This list can be used to:
-  // * improve accuracy for words and phrases you expect the user to say,
-  //   e.g. typical commands for your Dialogflow agent
-  // * add additional words to the speech recognizer vocabulary
-  // * ...
-  //
-  // See the [Cloud Speech
-  // documentation](https://cloud.google.com/speech-to-text/quotas) for usage
-  // limits.
-  repeated string phrases = 1;
+// Information for a word recognized by the speech recognizer.
+message SpeechWordInfo {
+  // The word this info is for.
+  string word = 3;
 
-  // Optional. Boost for this context compared to other contexts:
-  //
-  // * If the boost is positive, Dialogflow will increase the probability that
-  //   the phrases in this context are recognized over similar sounding phrases.
-  // * If the boost is unspecified or non-positive, Dialogflow will not apply
-  //   any boost.
+  // Time offset relative to the beginning of the audio that corresponds to the
+  // start of the spoken word. This is an experimental feature and the accuracy
+  // of the time offset can vary.
+  google.protobuf.Duration start_offset = 1;
+
+  // Time offset relative to the beginning of the audio that corresponds to the
+  // end of the spoken word. This is an experimental feature and the accuracy of
+  // the time offset can vary.
+  google.protobuf.Duration end_offset = 2;
+
+  // The Speech confidence between 0.0 and 1.0 for this word. A higher number
+  // indicates an estimated greater likelihood that the recognized word is
+  // correct. The default of 0.0 is a sentinel value indicating that confidence
+  // was not set.
   //
-  // Dialogflow recommends that you use boosts in the range (0, 20] and that you
-  // find a value that fits your use case with binary search.
-  float boost = 2;
+  // This field is not guaranteed to be fully stable over time for the same
+  // audio input. Users should also not rely on it to always be provided.
+  float confidence = 4;
 }
 
 // Variant of the specified [Speech model][google.cloud.dialogflow.v2beta1.InputAudioConfig.model] to use.
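To make the SpeechWordInfo offset and confidence semantics concrete, here is a hedged sketch that fills one in by hand (in practice the recognizer populates it); it assumes the generated Python client plus google.protobuf Duration types, and all values are illustrative.

    from google.cloud import dialogflow_v2beta1
    from google.protobuf import duration_pb2

    # Offsets are relative to the start of the audio; a confidence of 0.0
    # would mean "not set", per the comments above.
    word_info = dialogflow_v2beta1.SpeechWordInfo(
        word="hello",
        start_offset=duration_pb2.Duration(seconds=1, nanos=200000000),
        end_offset=duration_pb2.Duration(seconds=1, nanos=550000000),
        confidence=0.87,
    )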
|
|
|
@@ -151,31 +176,6 @@ enum SpeechModelVariant {
   USE_ENHANCED = 3;
 }
 
-// Information for a word recognized by the speech recognizer.
-message SpeechWordInfo {
-  // The word this info is for.
-  string word = 3;
-
-  // Time offset relative to the beginning of the audio that corresponds to the
-  // start of the spoken word. This is an experimental feature and the accuracy
-  // of the time offset can vary.
-  google.protobuf.Duration start_offset = 1;
-
-  // Time offset relative to the beginning of the audio that corresponds to the
-  // end of the spoken word. This is an experimental feature and the accuracy of
-  // the time offset can vary.
-  google.protobuf.Duration end_offset = 2;
-
-  // The Speech confidence between 0.0 and 1.0 for this word. A higher number
-  // indicates an estimated greater likelihood that the recognized word is
-  // correct. The default of 0.0 is a sentinel value indicating that confidence
-  // was not set.
-  //
-  // This field is not guaranteed to be fully stable over time for the same
-  // audio input. Users should also not rely on it to always be provided.
-  float confidence = 4;
-}
-
 // Instructs the speech recognizer on how to process the audio content.
 message InputAudioConfig {
   // Required. Audio encoding of the audio content to process.
@@ -249,6 +249,21 @@ message InputAudioConfig {
   bool single_utterance = 8;
 }
 
+// Description of which voice to use for speech synthesis.
+message VoiceSelectionParams {
+  // Optional. The name of the voice. If not set, the service will choose a
+  // voice based on the other parameters such as language_code and
+  // [ssml_gender][google.cloud.dialogflow.v2beta1.VoiceSelectionParams.ssml_gender].
+  string name = 1;
+
+  // Optional. The preferred gender of the voice. If not set, the service will
+  // choose a voice based on the other parameters such as language_code and
+  // [name][google.cloud.dialogflow.v2beta1.VoiceSelectionParams.name]. Note that this is only a preference, not requirement. If a
+  // voice of the appropriate gender is not available, the synthesizer should
+  // substitute a voice with a different gender rather than failing the request.
+  SsmlVoiceGender ssml_gender = 2;
+}
+
 // Gender of the voice as described in
 // [SSML voice element](https://www.w3.org/TR/speech-synthesis11/#edef_voice).
 enum SsmlVoiceGender {
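A minimal sketch of the VoiceSelectionParams message added above, again assuming the generated Python client; leaving name unset and stating only a gender preference mirrors the comment that the gender is a preference rather than a hard requirement.

    from google.cloud import dialogflow_v2beta1

    # Only a gender preference is given; the synthesizer may substitute a
    # different gender rather than fail the request.
    voice = dialogflow_v2beta1.VoiceSelectionParams(
        ssml_gender=dialogflow_v2beta1.SsmlVoiceGender.SSML_VOICE_GENDER_NEUTRAL,
    )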
|
|
|
@@ -266,21 +281,6 @@ enum SsmlVoiceGender {
   SSML_VOICE_GENDER_NEUTRAL = 3;
 }
 
-// Description of which voice to use for speech synthesis.
-message VoiceSelectionParams {
-  // Optional. The name of the voice. If not set, the service will choose a
-  // voice based on the other parameters such as language_code and
-  // [ssml_gender][google.cloud.dialogflow.v2beta1.VoiceSelectionParams.ssml_gender].
-  string name = 1;
-
-  // Optional. The preferred gender of the voice. If not set, the service will
-  // choose a voice based on the other parameters such as language_code and
-  // [name][google.cloud.dialogflow.v2beta1.VoiceSelectionParams.name]. Note that this is only a preference, not requirement. If a
-  // voice of the appropriate gender is not available, the synthesizer should
-  // substitute a voice with a different gender rather than failing the request.
-  SsmlVoiceGender ssml_gender = 2;
-}
-
 // Configuration of how speech should be synthesized.
 message SynthesizeSpeechConfig {
   // Optional. Speaking rate/speed, in the range [0.25, 4.0]. 1.0 is the normal
@@ -313,6 +313,24 @@ message SynthesizeSpeechConfig {
   VoiceSelectionParams voice = 4;
 }
 
+// Instructs the speech synthesizer how to generate the output audio content.
+// If this audio config is supplied in a request, it overrides all existing
+// text-to-speech settings applied to the agent.
+message OutputAudioConfig {
+  // Required. Audio encoding of the synthesized audio content.
+  OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
+
+  // The synthesis sample rate (in hertz) for this audio. If not
+  // provided, then the synthesizer will use the default sample rate based on
+  // the audio encoding. If this is different from the voice's natural sample
+  // rate, then the synthesizer will honor this request by converting to the
+  // desired sample rate (which might result in worse audio quality).
+  int32 sample_rate_hertz = 2;
+
+  // Configuration of how speech should be synthesized.
+  SynthesizeSpeechConfig synthesize_speech_config = 3;
+}
+
 // Audio encoding of the output audio format in Text-To-Speech.
 enum OutputAudioEncoding {
   // Not specified.
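A hedged sketch of the OutputAudioConfig message added above, assuming the generated Python client; only the required audio_encoding is strictly needed, and the sample rate and voice name are placeholder values chosen for illustration.

    from google.cloud import dialogflow_v2beta1

    # audio_encoding is REQUIRED; sample_rate_hertz and the nested
    # synthesize_speech_config are optional per the comments above.
    output_audio_config = dialogflow_v2beta1.OutputAudioConfig(
        audio_encoding=dialogflow_v2beta1.OutputAudioEncoding.OUTPUT_AUDIO_ENCODING_OGG_OPUS,
        sample_rate_hertz=16000,
        synthesize_speech_config=dialogflow_v2beta1.SynthesizeSpeechConfig(
            voice=dialogflow_v2beta1.VoiceSelectionParams(name="en-US-Standard-C"),
        ),
    )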
|
|
|
@@ -331,21 +349,3 @@ enum OutputAudioEncoding {
   // than MP3 while using approximately the same bitrate.
   OUTPUT_AUDIO_ENCODING_OGG_OPUS = 3;
 }
-
-// Instructs the speech synthesizer how to generate the output audio content.
-// If this audio config is supplied in a request, it overrides all existing
-// text-to-speech settings applied to the agent.
-message OutputAudioConfig {
-  // Required. Audio encoding of the synthesized audio content.
-  OutputAudioEncoding audio_encoding = 1 [(google.api.field_behavior) = REQUIRED];
-
-  // The synthesis sample rate (in hertz) for this audio. If not
-  // provided, then the synthesizer will use the default sample rate based on
-  // the audio encoding. If this is different from the voice's natural sample
-  // rate, then the synthesizer will honor this request by converting to the
-  // desired sample rate (which might result in worse audio quality).
-  int32 sample_rate_hertz = 2;
-
-  // Configuration of how speech should be synthesized.
-  SynthesizeSpeechConfig synthesize_speech_config = 3;
-}