feat: Voice Activity Detection: adding speech event time and speech event type

PiperOrigin-RevId: 511839326
pull/784/head
Google APIs 2 years ago committed by Copybara-Service
parent a563815f12
commit f04b13639d
  1. 91
      google/cloud/speech/v1/cloud_speech.proto
  2. 97
      google/cloud/speech/v1p1beta1/cloud_speech.proto

@ -36,7 +36,8 @@ option objc_class_prefix = "GCS";
// Service that implements Google Cloud Speech API.
service Speech {
option (google.api.default_host) = "speech.googleapis.com";
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";
option (google.api.oauth_scopes) =
"https://www.googleapis.com/auth/cloud-platform";
// Performs synchronous speech recognition: receive results after all audio
// has been sent and processed.
@ -54,7 +55,8 @@ service Speech {
// a `LongRunningRecognizeResponse` message.
// For more information on asynchronous speech recognition, see the
// [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
rpc LongRunningRecognize(LongRunningRecognizeRequest)
returns (google.longrunning.Operation) {
option (google.api.http) = {
post: "/v1/speech:longrunningrecognize"
body: "*"
@ -68,8 +70,8 @@ service Speech {
// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
}
rpc StreamingRecognize(stream StreamingRecognizeRequest)
returns (stream StreamingRecognizeResponse) {}
}
// The top-level message sent by the client for the `Recognize` method.
@ -93,7 +95,8 @@ message LongRunningRecognizeRequest {
RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED];
// Optional. Specifies an optional destination for the recognition results.
TranscriptOutputConfig output_config = 4 [(google.api.field_behavior) = OPTIONAL];
TranscriptOutputConfig output_config = 4
[(google.api.field_behavior) = OPTIONAL];
}
// Specifies an optional destination for the recognition results.
@ -134,6 +137,15 @@ message StreamingRecognizeRequest {
// Provides information to the recognizer that specifies how to process the
// request.
message StreamingRecognitionConfig {
// Events that a timeout can be set on for voice activity.
message VoiceActivityTimeout {
// Duration to timeout the stream if no speech begins.
google.protobuf.Duration speech_start_timeout = 1;
// Duration to timeout the stream after speech ends.
google.protobuf.Duration speech_end_timeout = 2;
}
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED];
@ -166,6 +178,15 @@ message StreamingRecognitionConfig {
// the `is_final=false` flag).
// If `false` or omitted, only `is_final=true` result(s) are returned.
bool interim_results = 3;
// If `true`, responses with voice activity speech events will be returned as
// they are detected.
bool enable_voice_activity_events = 5;
// If set, the server will automatically close the stream after the specified
// duration has elapsed after the last SPEECH_ACTIVITY speech event has been
// sent. The field `enable_voice_activity_events` must also be set to true.
VoiceActivityTimeout voice_activity_timeout = 6;
}
// Provides information to the recognizer that specifies how to process the
@ -193,7 +214,8 @@ message RecognitionConfig {
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
// encoding configuration must match the encoding described in the audio
// header; otherwise the request returns an
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
// code.
enum AudioEncoding {
// Not specified.
ENCODING_UNSPECIFIED = 0;
@ -246,7 +268,8 @@ message RecognitionConfig {
// Encoding of audio data sent in all `RecognitionAudio` messages.
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
// for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
AudioEncoding encoding = 1;
// Sample rate in Hertz of the audio data sent in all
@ -255,7 +278,8 @@ message RecognitionConfig {
// source to 16000 Hz. If that's not possible, use the native sample rate of
// the audio source (instead of re-sampling).
// This field is optional for FLAC and WAV audio files, but is
// required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
// required for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1.RecognitionConfig.AudioEncoding].
int32 sample_rate_hertz = 2;
// The number of channels in the input audio data.
@ -454,10 +478,8 @@ message SpeakerDiarizationConfig {
int32 max_speaker_count = 3;
// Output only. Unused.
int32 speaker_tag = 5 [
deprecated = true,
(google.api.field_behavior) = OUTPUT_ONLY
];
int32 speaker_tag = 5
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];
}
// Description of audio data to be recognized.
@ -619,8 +641,8 @@ message SpeechContext {
// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
// See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
@ -635,8 +657,9 @@ message RecognitionAudio {
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
// [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
// For more information, see [Request
// URIs](https://cloud.google.com/storage/docs/reference-uris).
string uri = 2;
}
}
@ -701,8 +724,8 @@ message LongRunningRecognizeMetadata {
// Time of the most recent processing update.
google.protobuf.Timestamp last_update_time = 3;
// Output only. The URI of the audio file being transcribed. Empty if the audio was sent
// as byte content.
// Output only. The URI of the audio file being transcribed. Empty if the
// audio was sent as byte content.
string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
}
@ -769,6 +792,23 @@ message StreamingRecognizeResponse {
// until the server closes the gRPC connection. This event is only sent if
// `single_utterance` was set to `true`, and is not used otherwise.
END_OF_SINGLE_UTTERANCE = 1;
// This event indicates that the server has detected the beginning of human
// voice activity in the stream. This event can be returned multiple times
// if speech starts and stops repeatedly throughout the stream. This event
// is only sent if `enable_voice_activity_events` is set to true.
SPEECH_ACTIVITY_BEGIN = 2;
// This event indicates that the server has detected the end of human voice
// activity in the stream. This event can be returned multiple times if
// speech starts and stops repeatedly throughout the stream. This event is
// only sent if `enable_voice_activity_events` is set to true.
SPEECH_ACTIVITY_END = 3;
// This event indicates that the user-set timeout for speech activity begin
// or end has been exceeded. Upon receiving this event, the client is
// expected to send a half close. Further audio will not be processed.
SPEECH_ACTIVITY_TIMEOUT = 4;
}
// If set, returns a [google.rpc.Status][google.rpc.Status] message that
@ -784,6 +824,9 @@ message StreamingRecognizeResponse {
// Indicates the type of speech event.
SpeechEventType speech_event_type = 4;
// Time offset between the beginning of the audio and event emission.
google.protobuf.Duration speech_event_time = 8;
// When available, billed audio seconds for the stream.
// Set only if this is the last response in the stream.
google.protobuf.Duration total_billed_time = 5;
@ -828,9 +871,9 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}
@ -851,9 +894,9 @@ message SpeechRecognitionResult {
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

@ -36,7 +36,8 @@ option objc_class_prefix = "GCS";
// Service that implements Google Cloud Speech API.
service Speech {
option (google.api.default_host) = "speech.googleapis.com";
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";
option (google.api.oauth_scopes) =
"https://www.googleapis.com/auth/cloud-platform";
// Performs synchronous speech recognition: receive results after all audio
// has been sent and processed.
@ -54,7 +55,8 @@ service Speech {
// a `LongRunningRecognizeResponse` message.
// For more information on asynchronous speech recognition, see the
// [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
rpc LongRunningRecognize(LongRunningRecognizeRequest)
returns (google.longrunning.Operation) {
option (google.api.http) = {
post: "/v1p1beta1/speech:longrunningrecognize"
body: "*"
@ -68,8 +70,8 @@ service Speech {
// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
}
rpc StreamingRecognize(stream StreamingRecognizeRequest)
returns (stream StreamingRecognizeResponse) {}
}
// The top-level message sent by the client for the `Recognize` method.
@ -93,7 +95,8 @@ message LongRunningRecognizeRequest {
RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED];
// Optional. Specifies an optional destination for the recognition results.
TranscriptOutputConfig output_config = 4 [(google.api.field_behavior) = OPTIONAL];
TranscriptOutputConfig output_config = 4
[(google.api.field_behavior) = OPTIONAL];
}
// Specifies an optional destination for the recognition results.
@ -134,6 +137,15 @@ message StreamingRecognizeRequest {
// Provides information to the recognizer that specifies how to process the
// request.
message StreamingRecognitionConfig {
// Events that a timeout can be set on for voice activity.
message VoiceActivityTimeout {
// Duration to timeout the stream if no speech begins.
google.protobuf.Duration speech_start_timeout = 1;
// Duration to timeout the stream after speech ends.
google.protobuf.Duration speech_end_timeout = 2;
}
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED];
@ -166,6 +178,15 @@ message StreamingRecognitionConfig {
// the `is_final=false` flag).
// If `false` or omitted, only `is_final=true` result(s) are returned.
bool interim_results = 3;
// If `true`, responses with voice activity speech events will be returned as
// they are detected.
bool enable_voice_activity_events = 5;
// If set, the server will automatically close the stream after the specified
// duration has elapsed after the last SPEECH_ACTIVITY speech event has been
// sent. The field `enable_voice_activity_events` must also be set to true.
VoiceActivityTimeout voice_activity_timeout = 6;
}
// Provides information to the recognizer that specifies how to process the
@ -193,7 +214,8 @@ message RecognitionConfig {
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
// encoding configuration must match the encoding described in the audio
// header; otherwise the request returns an
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
// code.
enum AudioEncoding {
// Not specified.
ENCODING_UNSPECIFIED = 0;
@ -252,7 +274,8 @@ message RecognitionConfig {
// Encoding of audio data sent in all `RecognitionAudio` messages.
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
// for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
AudioEncoding encoding = 1;
// Sample rate in Hertz of the audio data sent in all
@ -261,7 +284,8 @@ message RecognitionConfig {
// source to 16000 Hz. If that's not possible, use the native sample rate of
// the audio source (instead of re-sampling).
// This field is optional for FLAC and WAV audio files, but is
// required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
// required for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
int32 sample_rate_hertz = 2;
// The number of channels in the input audio data.
@ -477,10 +501,8 @@ message SpeakerDiarizationConfig {
int32 max_speaker_count = 3;
// Output only. Unused.
int32 speaker_tag = 5 [
deprecated = true,
(google.api.field_behavior) = OUTPUT_ONLY
];
int32 speaker_tag = 5
[deprecated = true, (google.api.field_behavior) = OUTPUT_ONLY];
}
// Description of audio data to be recognized.
@ -646,8 +668,8 @@ message SpeechContext {
// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
// See [content limits](https://cloud.google.com/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
@ -662,8 +684,9 @@ message RecognitionAudio {
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
// [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
// For more information, see [Request
// URIs](https://cloud.google.com/storage/docs/reference-uris).
string uri = 2;
}
}
@ -728,12 +751,14 @@ message LongRunningRecognizeMetadata {
// Time of the most recent processing update.
google.protobuf.Timestamp last_update_time = 3;
// Output only. The URI of the audio file being transcribed. Empty if the audio was sent
// as byte content.
// Output only. The URI of the audio file being transcribed. Empty if the
// audio was sent as byte content.
string uri = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
// Output only. A copy of the TranscriptOutputConfig if it was set in the request.
TranscriptOutputConfig output_config = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
// Output only. A copy of the TranscriptOutputConfig if it was set in the
// request.
TranscriptOutputConfig output_config = 5
[(google.api.field_behavior) = OUTPUT_ONLY];
}
// `StreamingRecognizeResponse` is the only message returned to the client by
@ -799,6 +824,23 @@ message StreamingRecognizeResponse {
// until the server closes the gRPC connection. This event is only sent if
// `single_utterance` was set to `true`, and is not used otherwise.
END_OF_SINGLE_UTTERANCE = 1;
// This event indicates that the server has detected the beginning of human
// voice activity in the stream. This event can be returned multiple times
// if speech starts and stops repeatedly throughout the stream. This event
// is only sent if `enable_voice_activity_events` is set to true.
SPEECH_ACTIVITY_BEGIN = 2;
// This event indicates that the server has detected the end of human voice
// activity in the stream. This event can be returned multiple times if
// speech starts and stops repeatedly throughout the stream. This event is
// only sent if `enable_voice_activity_events` is set to true.
SPEECH_ACTIVITY_END = 3;
// This event indicates that the user-set timeout for speech activity begin
// or end has been exceeded. Upon receiving this event, the client is
// expected to send a half close. Further audio will not be processed.
SPEECH_ACTIVITY_TIMEOUT = 4;
}
// If set, returns a [google.rpc.Status][google.rpc.Status] message that
@ -814,6 +856,9 @@ message StreamingRecognizeResponse {
// Indicates the type of speech event.
SpeechEventType speech_event_type = 4;
// Time offset between the beginning of the audio and event emission.
google.protobuf.Duration speech_event_time = 8;
// When available, billed audio seconds for the stream.
// Set only if this is the last response in the stream.
google.protobuf.Duration total_billed_time = 5;
@ -858,9 +903,9 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
string language_code = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
}
@ -881,9 +926,9 @@ message SpeechRecognitionResult {
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
// Output only. The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt)
// language tag of the language in this result. This language code was
// detected to have the most likelihood of being spoken in the audio.
string language_code = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
}

Loading…
Cancel
Save