Synchronize new proto/yaml changes.

PiperOrigin-RevId: 271102771
pull/582/head
Google APIs 6 years ago committed by Copybara-Service
parent 9dc1d37b6b
commit a5d0708f07
  1. 2
      google/cloud/speech/artman_speech_v1.yaml
  2. 3
      google/cloud/speech/artman_speech_v1p1beta1.yaml
  3. 50
      google/cloud/speech/speech_v1p1beta1.yaml
  4. 214
      google/cloud/speech/v1p1beta1/cloud_speech.proto
  5. 90
      google/cloud/speech/v1p1beta1/speech_gapic.legacy.yaml
  6. 52
      google/cloud/speech/v1p1beta1/speech_gapic.yaml
  7. 35
      google/cloud/speech/v1p1beta1/speech_grpc_service_config.json
  8. 36
      google/cloud/speech/v1p1beta1/speech_v1p1beta1.yaml

@ -6,7 +6,7 @@ common:
- name: google-common-protos
src_proto_paths:
- v1
service_yaml: speech_v1.yaml
service_yaml: v1/speech_v1.yaml
gapic_yaml: v1/speech_gapic.yaml
samples: v1/samples
proto_package: google.cloud.speech.v1

@ -6,9 +6,10 @@ common:
- name: google-common-protos
src_proto_paths:
- v1p1beta1
service_yaml: speech_v1p1beta1.yaml
service_yaml: v1p1beta1/speech_v1p1beta1.yaml
gapic_yaml: v1p1beta1/speech_gapic.yaml
samples: v1p1beta1/samples
proto_package: google.cloud.speech.v1p1beta1
artifacts:
- name: gapic_config
type: GAPIC_CONFIG

@ -1,50 +0,0 @@
type: google.api.Service
config_version: 3
name: speech.googleapis.com
title: Cloud Speech API
apis:
- name: google.cloud.speech.v1p1beta1.Speech
documentation:
summary: Converts audio to text by applying powerful neural network models.
overview: |-
# Introduction
Google Cloud Speech API provides speech recognition as a service.
backend:
rules:
- selector: google.longrunning.Operations.ListOperations
deadline: 200.0
- selector: google.longrunning.Operations.GetOperation
deadline: 200.0
- selector: google.longrunning.Operations.WaitOperation
deadline: 200.0
- selector: google.cloud.speech.v1p1beta1.Speech.Recognize
deadline: 200.0
- selector: google.cloud.speech.v1p1beta1.Speech.LongRunningRecognize
deadline: 200.0
- selector: google.cloud.speech.v1p1beta1.Speech.StreamingRecognize
deadline: 905.0
http:
rules:
- selector: google.longrunning.Operations.ListOperations
get: /v1/operations
additional_bindings:
- get: /v1beta1/operations
- selector: google.longrunning.Operations.GetOperation
get: '/v1/operations/{name=*}'
additional_bindings:
- get: '/v1beta1/operations/{name=*}'
- get: '/v1p1beta1/operations/{name=*}'
authentication:
rules:
- selector: '*'
oauth:
canonical_scopes: |-
https://www.googleapis.com/auth/cloud-platform

@ -1,4 +1,4 @@
// Copyright 2018 Google LLC.
// Copyright 2019 Google LLC.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -18,10 +18,11 @@ syntax = "proto3";
package google.cloud.speech.v1p1beta1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/any.proto";
import "google/protobuf/duration.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
@ -30,9 +31,13 @@ option go_package = "google.golang.org/genproto/googleapis/cloud/speech/v1p1beta
option java_multiple_files = true;
option java_outer_classname = "SpeechProto";
option java_package = "com.google.cloud.speech.v1p1beta1";
option objc_class_prefix = "GCS";
// Service that implements Google Cloud Speech API.
service Speech {
option (google.api.default_host) = "speech.googleapis.com";
option (google.api.oauth_scopes) = "https://www.googleapis.com/auth/cloud-platform";
// Performs synchronous speech recognition: receive results after all audio
// has been sent and processed.
rpc Recognize(RecognizeRequest) returns (RecognizeResponse) {
@ -40,52 +45,59 @@ service Speech {
post: "/v1p1beta1/speech:recognize"
body: "*"
};
option (google.api.method_signature) = "config,audio";
}
// Performs asynchronous speech recognition: receive results via the
// google.longrunning.Operations interface. Returns either an
// `Operation.error` or an `Operation.response` which contains
// a `LongRunningRecognizeResponse` message.
rpc LongRunningRecognize(LongRunningRecognizeRequest)
returns (google.longrunning.Operation) {
// For more information on asynchronous speech recognition, see the
// [how-to](https://cloud.google.com/speech-to-text/docs/async-recognize).
rpc LongRunningRecognize(LongRunningRecognizeRequest) returns (google.longrunning.Operation) {
option (google.api.http) = {
post: "/v1p1beta1/speech:longrunningrecognize"
body: "*"
};
option (google.api.method_signature) = "config,audio";
option (google.longrunning.operation_info) = {
response_type: "LongRunningRecognizeResponse"
metadata_type: "LongRunningRecognizeMetadata"
};
}
// Performs bidirectional streaming speech recognition: receive results while
// sending audio. This method is only available via the gRPC API (not REST).
rpc StreamingRecognize(stream StreamingRecognizeRequest)
returns (stream StreamingRecognizeResponse) {}
rpc StreamingRecognize(stream StreamingRecognizeRequest) returns (stream StreamingRecognizeResponse) {
}
}
// The top-level message sent by the client for the `Recognize` method.
message RecognizeRequest {
// *Required* Provides information to the recognizer that specifies how to
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1;
RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED];
// *Required* The audio data to be recognized.
RecognitionAudio audio = 2;
// Required. The audio data to be recognized.
RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED];
}
// The top-level message sent by the client for the `LongRunningRecognize`
// method.
message LongRunningRecognizeRequest {
// *Required* Provides information to the recognizer that specifies how to
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1;
RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED];
// *Required* The audio data to be recognized.
RecognitionAudio audio = 2;
// Required. The audio data to be recognized.
RecognitionAudio audio = 2 [(google.api.field_behavior) = REQUIRED];
}
// The top-level message sent by the client for the `StreamingRecognize` method.
// Multiple `StreamingRecognizeRequest` messages are sent. The first message
// must contain a `streaming_config` message and must not contain `audio` data.
// All subsequent messages must contain `audio` data and must not contain a
// `streaming_config` message.
// must contain a `streaming_config` message and must not contain
// `audio_content`. All subsequent messages must contain `audio_content` and
// must not contain a `streaming_config` message.
message StreamingRecognizeRequest {
// The streaming request, which is either a streaming config or audio content.
oneof streaming_request {
@ -99,9 +111,9 @@ message StreamingRecognizeRequest {
// `StreamingRecognizeRequest` message must not contain `audio_content` data
// and all subsequent `StreamingRecognizeRequest` messages must contain
// `audio_content` data. The audio bytes must be encoded as specified in
// `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
// pure binary representation (not base64). See
// [content limits](/speech-to-text/quotas#content).
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
bytes audio_content = 2;
}
}
@ -109,11 +121,11 @@ message StreamingRecognizeRequest {
// Provides information to the recognizer that specifies how to process the
// request.
message StreamingRecognitionConfig {
// *Required* Provides information to the recognizer that specifies how to
// Required. Provides information to the recognizer that specifies how to
// process the request.
RecognitionConfig config = 1;
RecognitionConfig config = 1 [(google.api.field_behavior) = REQUIRED];
// *Optional* If `false` or omitted, the recognizer will perform continuous
// If `false` or omitted, the recognizer will perform continuous
// recognition (continuing to wait for and process audio even if the user
// pauses speaking) until the client closes the input stream (gRPC API) or
// until the maximum time limit has been reached. May return multiple
@ -126,7 +138,7 @@ message StreamingRecognitionConfig {
// `true`.
bool single_utterance = 2;
// *Optional* If `true`, interim results (tentative hypotheses) may be
// If `true`, interim results (tentative hypotheses) may be
// returned as they become available (these interim results are indicated with
// the `is_final=false` flag).
// If `false` or omitted, only `is_final=true` result(s) are returned.
@ -138,13 +150,15 @@ message StreamingRecognitionConfig {
message RecognitionConfig {
// The encoding of the audio data sent in the request.
//
// All encodings support only 1 channel (mono) audio.
// All encodings support only 1 channel (mono) audio, unless the
// `audio_channel_count` and `enable_separate_recognition_per_channel` fields
// are set.
//
// For best results, the audio source should be captured and transmitted using
// a lossless encoding (`FLAC` or `LINEAR16`). The accuracy of the speech
// recognition can be reduced if lossy codecs are used to capture or transmit
// audio, particularly if background noise is present. Lossy codecs include
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, and `SPEEX_WITH_HEADER_BYTE`.
// `MULAW`, `AMR`, `AMR_WB`, `OGG_OPUS`, `SPEEX_WITH_HEADER_BYTE`, and `MP3`.
//
// The `FLAC` and `WAV` audio file formats include a header that describes the
// included audio content. You can request recognition for `WAV` files that
@ -155,8 +169,7 @@ message RecognitionConfig {
// an `AudioEncoding` when you send `FLAC` or `WAV` audio, the
// encoding configuration must match the encoding described in the audio
// header; otherwise the request returns an
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error
// code.
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT] error code.
enum AudioEncoding {
// Not specified.
ENCODING_UNSPECIFIED = 0;
@ -209,8 +222,7 @@ message RecognitionConfig {
// Encoding of audio data sent in all `RecognitionAudio` messages.
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
// for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
AudioEncoding encoding = 1;
// Sample rate in Hertz of the audio data sent in all
@ -218,12 +230,11 @@ message RecognitionConfig {
// 16000 is optimal. For best results, set the sampling rate of the audio
// source to 16000 Hz. If that's not possible, use the native sample rate of
// the audio source (instead of re-sampling).
// This field is optional for `FLAC` and `WAV` audio files and required
// for all other audio formats. For details, see
// [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
// This field is optional for FLAC and WAV audio files, but is
// required for all other audio formats. For details, see [AudioEncoding][google.cloud.speech.v1p1beta1.RecognitionConfig.AudioEncoding].
int32 sample_rate_hertz = 2;
// *Optional* The number of channels in the input audio data.
// The number of channels in the input audio data.
// ONLY set this for MULTI-CHANNEL recognition.
// Valid values for LINEAR16 and FLAC are `1`-`8`.
// Valid values for OGG_OPUS are `1`-`254`.
@ -234,7 +245,7 @@ message RecognitionConfig {
// `enable_separate_recognition_per_channel` to 'true'.
int32 audio_channel_count = 7;
// This needs to be set to true explicitly and `audio_channel_count` > 1
// This needs to be set to `true` explicitly and `audio_channel_count` > 1
// to get each channel recognized separately. The recognition result will
// contain a `channel_tag` field to state which channel that result belongs
// to. If this is not true, we will only recognize the first channel. The
@ -242,28 +253,29 @@ message RecognitionConfig {
// `audio_channel_count` multiplied by the length of the audio.
bool enable_separate_recognition_per_channel = 12;
// *Required* The language of the supplied audio as a
// Required. The language of the supplied audio as a
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag.
// Example: "en-US".
// See [Language Support](/speech-to-text/docs/languages)
// for a list of the currently supported language codes.
string language_code = 3;
// See [Language
// Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
// of the currently supported language codes.
string language_code = 3 [(google.api.field_behavior) = REQUIRED];
// *Optional* A list of up to 3 additional
// A list of up to 3 additional
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tags,
// listing possible alternative languages of the supplied audio.
// See [Language Support](/speech-to-text/docs/languages)
// for a list of the currently supported language codes.
// If alternative languages are listed, recognition result will contain
// recognition in the most likely language detected including the main
// language_code. The recognition result will include the language tag
// of the language detected in the audio.
// Note: This feature is only supported for Voice Command and Voice Search
// use cases and performance may vary for other use cases (e.g., phone call
// See [Language
// Support](https://cloud.google.com/speech-to-text/docs/languages) for a list
// of the currently supported language codes. If alternative languages are
// listed, recognition result will contain recognition in the most likely
// language detected including the main language_code. The recognition result
// will include the language tag of the language detected in the audio. Note:
// This feature is only supported for Voice Command and Voice Search use cases
// and performance may vary for other use cases (e.g., phone call
// transcription).
repeated string alternative_language_codes = 18;
// *Optional* Maximum number of recognition hypotheses to be returned.
// Maximum number of recognition hypotheses to be returned.
// Specifically, the maximum number of `SpeechRecognitionAlternative` messages
// within each `SpeechRecognitionResult`.
// The server may return fewer than `max_alternatives`.
@ -271,30 +283,31 @@ message RecognitionConfig {
// one. If omitted, will return a maximum of one.
int32 max_alternatives = 4;
// *Optional* If set to `true`, the server will attempt to filter out
// If set to `true`, the server will attempt to filter out
// profanities, replacing all but the initial character in each filtered word
// with asterisks, e.g. "f***". If set to `false` or omitted, profanities
// won't be filtered out.
bool profanity_filter = 5;
// *Optional* array of
// [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext]. A means to
// provide context to assist the speech recognition. For more information, see
// [Phrase Hints](/speech-to-text/docs/basics#phrase-hints).
// Array of [SpeechContext][google.cloud.speech.v1p1beta1.SpeechContext].
// A means to provide context to assist the speech recognition. For more
// information, see
// [speech
// adaptation](https://cloud.google.com/speech-to-text/docs/context-strength).
repeated SpeechContext speech_contexts = 6;
// *Optional* If `true`, the top result includes a list of words and
// If `true`, the top result includes a list of words and
// the start and end time offsets (timestamps) for those words. If
// `false`, no word-level time offset information is returned. The default is
// `false`.
bool enable_word_time_offsets = 8;
// *Optional* If `true`, the top result includes a list of words and the
// If `true`, the top result includes a list of words and the
// confidence for those words. If `false`, no word-level confidence
// information is returned. The default is `false`.
bool enable_word_confidence = 15;
// *Optional* If 'true', adds punctuation to recognition result hypotheses.
// If 'true', adds punctuation to recognition result hypotheses.
// This feature is only available in select languages. Setting this for
// requests in other languages has no effect at all.
// The default 'false' value does not add punctuation to result hypotheses.
@ -303,19 +316,18 @@ message RecognitionConfig {
// premium feature.
bool enable_automatic_punctuation = 11;
// *Optional* If 'true', enables speaker detection for each recognized word in
// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
// Note: Use diarization_config instead.
bool enable_speaker_diarization = 16 [deprecated = true];
// *Optional*
// If set, specifies the estimated number of speakers in the conversation.
// Defaults to '2'. Ignored unless enable_speaker_diarization is set to true.
// Note: Use diarization_config instead.
int32 diarization_speaker_count = 17 [deprecated = true];
// *Optional* Config to enable speaker diarization and set additional
// Config to enable speaker diarization and set additional
// parameters to make diarization better suited for your application.
// Note: When this is enabled, we send all the words from the beginning of the
// audio for the top alternative in every consecutive STREAMING response.
@ -325,10 +337,10 @@ message RecognitionConfig {
// in the top alternative of the FINAL SpeechRecognitionResult.
SpeakerDiarizationConfig diarization_config = 19;
// *Optional* Metadata regarding this request.
// Metadata regarding this request.
RecognitionMetadata metadata = 9;
// *Optional* Which model to select for the given request. Select the model
// Which model to select for the given request. Select the model
// best suited to your domain to get best results. If a model is not
// explicitly specified, then we auto-select a model based on the parameters
// in the RecognitionConfig.
@ -362,7 +374,7 @@ message RecognitionConfig {
// </table>
string model = 13;
// *Optional* Set to true to use an enhanced model for speech recognition.
// Set to true to use an enhanced model for speech recognition.
// If `use_enhanced` is set to true and the `model` field is not set, then
// an appropriate enhanced model is chosen if an enhanced model exists for
// the audio.
@ -373,23 +385,18 @@ message RecognitionConfig {
bool use_enhanced = 14;
}
// *Optional* Config to enable speaker diarization.
// Config to enable speaker diarization.
message SpeakerDiarizationConfig {
// *Optional* If 'true', enables speaker detection for each recognized word in
// If 'true', enables speaker detection for each recognized word in
// the top alternative of the recognition result using a speaker_tag provided
// in the WordInfo.
bool enable_speaker_diarization = 1;
// Note: Set min_speaker_count = max_speaker_count to fix the number of
// speakers to be detected in the audio.
// *Optional*
// Minimum number of speakers in the conversation. This range gives you more
// flexibility by allowing the system to automatically determine the correct
// number of speakers. If not set, the default value is 2.
int32 min_speaker_count = 2;
// *Optional*
// Maximum number of speakers in the conversation. This range gives you more
// flexibility by allowing the system to automatically determine the correct
// number of speakers. If not set, the default value is 6.
@ -520,7 +527,7 @@ message RecognitionMetadata {
// Obfuscated (privacy-protected) ID of the user, to identify number of
// unique users using the service.
int64 obfuscated_id = 9;
int64 obfuscated_id = 9 [deprecated = true];
// Description of the content. E.g., "Recordings of federal supreme court
// hearings from 2012".
@ -530,12 +537,12 @@ message RecognitionMetadata {
// Provides "hints" to the speech recognizer to favor specific words and phrases
// in the results.
message SpeechContext {
// *Optional* A list of strings containing words and phrases "hints" so that
// A list of strings containing words and phrases "hints" so that
// the speech recognition is more likely to recognize them. This can be used
// to improve the accuracy for specific words and phrases, for example, if
// specific commands are typically spoken by the user. This can also be used
// to add additional words to the vocabulary of the recognizer. See
// [usage limits](/speech-to-text/quotas#content).
// [usage limits](https://cloud.google.com/speech-to-text/quotas#content).
//
// List items can also be set to classes for groups of words that represent
// common concepts that occur in natural language. For example, rather than
@ -557,14 +564,14 @@ message SpeechContext {
// Contains audio data in the encoding specified in the `RecognitionConfig`.
// Either `content` or `uri` must be supplied. Supplying both or neither
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT].
// See [content limits](/speech-to-text/quotas#content).
// returns [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]. See
// [content limits](https://cloud.google.com/speech-to-text/quotas#content).
message RecognitionAudio {
// The audio source, which is either inline content or a Google Cloud
// Storage uri.
oneof audio_source {
// The audio data bytes encoded as specified in
// `RecognitionConfig`. Note: as with all bytes fields, protobuffers use a
// `RecognitionConfig`. Note: as with all bytes fields, proto buffers use a
// pure binary representation, whereas JSON representations use base64.
bytes content = 1;
@ -573,9 +580,8 @@ message RecognitionAudio {
// Currently, only Google Cloud Storage URIs are
// supported, which must be specified in the following format:
// `gs://bucket_name/object_name` (other URI formats return
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]).
// For more information, see [Request
// URIs](https://cloud.google.com/storage/docs/reference-uris).
// [google.rpc.Code.INVALID_ARGUMENT][google.rpc.Code.INVALID_ARGUMENT]). For more information, see
// [Request URIs](https://cloud.google.com/storage/docs/reference-uris).
string uri = 2;
}
}
@ -584,7 +590,7 @@ message RecognitionAudio {
// contains the result as zero or more sequential `SpeechRecognitionResult`
// messages.
message RecognizeResponse {
// Output only. Sequential list of transcription results corresponding to
// Sequential list of transcription results corresponding to
// sequential portions of audio.
repeated SpeechRecognitionResult results = 2;
}
@ -595,7 +601,7 @@ message RecognizeResponse {
// returned by the `GetOperation` call of the `google::longrunning::Operations`
// service.
message LongRunningRecognizeResponse {
// Output only. Sequential list of transcription results corresponding to
// Sequential list of transcription results corresponding to
// sequential portions of audio.
repeated SpeechRecognitionResult results = 2;
}
@ -680,44 +686,44 @@ message StreamingRecognizeResponse {
END_OF_SINGLE_UTTERANCE = 1;
}
// Output only. If set, returns a [google.rpc.Status][google.rpc.Status]
// message that specifies the error for the operation.
// If set, returns a [google.rpc.Status][google.rpc.Status] message that
// specifies the error for the operation.
google.rpc.Status error = 1;
// Output only. This repeated list contains zero or more results that
// This repeated list contains zero or more results that
// correspond to consecutive portions of the audio currently being processed.
// It contains zero or one `is_final=true` result (the newly settled portion),
// followed by zero or more `is_final=false` results (the interim results).
repeated StreamingRecognitionResult results = 2;
// Output only. Indicates the type of speech event.
// Indicates the type of speech event.
SpeechEventType speech_event_type = 4;
}
// A streaming speech recognition result corresponding to a portion of the audio
// that is currently being processed.
message StreamingRecognitionResult {
// Output only. May contain one or more recognition hypotheses (up to the
// May contain one or more recognition hypotheses (up to the
// maximum specified in `max_alternatives`).
// These alternatives are ordered in terms of accuracy, with the top (first)
// alternative being the most probable, as ranked by the recognizer.
repeated SpeechRecognitionAlternative alternatives = 1;
// Output only. If `false`, this `StreamingRecognitionResult` represents an
// If `false`, this `StreamingRecognitionResult` represents an
// interim result that may change. If `true`, this is the final time the
// speech service will return this particular `StreamingRecognitionResult`,
// the recognizer will not return any further hypotheses for this portion of
// the transcript and corresponding audio.
bool is_final = 2;
// Output only. An estimate of the likelihood that the recognizer will not
// An estimate of the likelihood that the recognizer will not
// change its guess about this interim result. Values range from 0.0
// (completely unstable) to 1.0 (completely stable).
// This field is only provided for interim results (`is_final=false`).
// The default of 0.0 is a sentinel value indicating `stability` was not set.
float stability = 3;
// Output only. Time offset of the end of this result relative to the
// Time offset of the end of this result relative to the
// beginning of the audio.
google.protobuf.Duration result_end_time = 4;
@ -726,16 +732,15 @@ message StreamingRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 5;
// Output only. The
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
// language in this result. This language code was detected to have the most
// likelihood of being spoken in the audio.
// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 6;
}
// A speech recognition result corresponding to a portion of the audio.
message SpeechRecognitionResult {
// Output only. May contain one or more recognition hypotheses (up to the
// May contain one or more recognition hypotheses (up to the
// maximum specified in `max_alternatives`).
// These alternatives are ordered in terms of accuracy, with the top (first)
// alternative being the most probable, as ranked by the recognizer.
@ -746,19 +751,18 @@ message SpeechRecognitionResult {
// For audio_channel_count = N, its output values can range from '1' to 'N'.
int32 channel_tag = 2;
// Output only. The
// [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag of the
// language in this result. This language code was detected to have the most
// likelihood of being spoken in the audio.
// The [BCP-47](https://www.rfc-editor.org/rfc/bcp/bcp47.txt) language tag
// of the language in this result. This language code was detected to have
// the most likelihood of being spoken in the audio.
string language_code = 5;
}
// Alternative hypotheses (a.k.a. n-best list).
message SpeechRecognitionAlternative {
// Output only. Transcript text representing the words that the user spoke.
// Transcript text representing the words that the user spoke.
string transcript = 1;
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
// The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative of a non-streaming
// result, or of a streaming result where `is_final=true`.
@ -767,7 +771,7 @@ message SpeechRecognitionAlternative {
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 2;
// Output only. A list of word-specific information for each recognized word.
// A list of word-specific information for each recognized word.
// Note: When `enable_speaker_diarization` is true, you will see all the words
// from the beginning of the audio.
repeated WordInfo words = 3;
@ -775,7 +779,7 @@ message SpeechRecognitionAlternative {
// Word-specific information for recognized words.
message WordInfo {
// Output only. Time offset relative to the beginning of the audio,
// Time offset relative to the beginning of the audio,
// and corresponding to the start of the spoken word.
// This field is only set if `enable_word_time_offsets=true` and only
// in the top hypothesis.
@ -783,7 +787,7 @@ message WordInfo {
// vary.
google.protobuf.Duration start_time = 1;
// Output only. Time offset relative to the beginning of the audio,
// Time offset relative to the beginning of the audio,
// and corresponding to the end of the spoken word.
// This field is only set if `enable_word_time_offsets=true` and only
// in the top hypothesis.
@ -791,10 +795,10 @@ message WordInfo {
// vary.
google.protobuf.Duration end_time = 2;
// Output only. The word corresponding to this set of information.
// The word corresponding to this set of information.
string word = 3;
// Output only. The confidence estimate between 0.0 and 1.0. A higher number
// The confidence estimate between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. This field is set only for the top alternative of a non-streaming
// result, or of a streaming result where `is_final=true`.
@ -803,7 +807,7 @@ message WordInfo {
// The default of 0.0 is a sentinel value indicating `confidence` was not set.
float confidence = 4;
// Output only. A distinct integer value is assigned for every speaker within
// A distinct integer value is assigned for every speaker within
// the audio. This field specifies which one of those speakers was detected to
// have spoken this word. Value ranges from '1' to diarization_speaker_count.
// speaker_tag is set if enable_speaker_diarization = 'true' and only in the

@ -0,0 +1,90 @@
type: com.google.api.codegen.ConfigProto
config_schema_version: 1.0.0
language_settings:
java:
package_name: com.google.cloud.speech.v1p1beta1
python:
package_name: google.cloud.speech_v1p1beta1.gapic
go:
package_name: cloud.google.com/go/speech/apiv1p1beta1
csharp:
package_name: Google.Cloud.Speech.V1P1Beta1
ruby:
package_name: Google::Cloud::Speech::V1p1beta1
php:
package_name: Google\Cloud\Speech\V1p1beta1
nodejs:
package_name: speech.v1p1beta1
domain_layer_location: google-cloud
interfaces:
- name: google.cloud.speech.v1p1beta1.Speech
smoke_test:
method: Recognize
init_fields:
- config.language_code="en-US"
- config.sample_rate_hertz=44100
- config.encoding=FLAC
- audio.uri="gs://gapic-toolkit/hello.flac"
collections: []
retry_codes_def:
- name: idempotent
retry_codes:
- DEADLINE_EXCEEDED
- UNAVAILABLE
- name: non_idempotent
retry_codes: []
retry_params_def:
- name: default
initial_retry_delay_millis: 100
retry_delay_multiplier: 1.3
max_retry_delay_millis: 60000
initial_rpc_timeout_millis: 1000000
rpc_timeout_multiplier: 1
max_rpc_timeout_millis: 1000000
total_timeout_millis: 5000000
methods:
- name: Recognize
flattening:
groups:
- parameters:
- config
- audio
required_fields:
- config
- audio
sample_code_init_fields:
- config.encoding=FLAC
- config.sample_rate_hertz=44100
- config.language_code="en-US"
- audio.uri=gs://bucket_name/file_name.flac
retry_codes_name: idempotent
retry_params_name: default
timeout_millis: 1000000
- name: LongRunningRecognize
flattening:
groups:
- parameters:
- config
- audio
required_fields:
- config
- audio
sample_code_init_fields:
- config.encoding=FLAC
- config.sample_rate_hertz=44100
- config.language_code="en-US"
- audio.uri=gs://bucket_name/file_name.flac
retry_codes_name: non_idempotent
retry_params_name: default
timeout_millis: 60000
long_running:
return_type: google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse
metadata_type: google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata
initial_poll_delay_millis: 20000
poll_delay_multiplier: 1.5
max_poll_delay_millis: 45000
total_poll_timeout_millis: 86400000
- name: StreamingRecognize
retry_codes_name: idempotent
retry_params_name: default
timeout_millis: 1000000

@ -1,5 +1,5 @@
type: com.google.api.codegen.ConfigProto
config_schema_version: 1.0.0
config_schema_version: 2.0.0
language_settings:
java:
package_name: com.google.cloud.speech.v1p1beta1
@ -25,66 +25,16 @@ interfaces:
- config.sample_rate_hertz=44100
- config.encoding=FLAC
- audio.uri="gs://gapic-toolkit/hello.flac"
collections: []
retry_codes_def:
- name: idempotent
retry_codes:
- DEADLINE_EXCEEDED
- UNAVAILABLE
- name: non_idempotent
retry_codes: []
retry_params_def:
- name: default
initial_retry_delay_millis: 100
retry_delay_multiplier: 1.3
max_retry_delay_millis: 60000
initial_rpc_timeout_millis: 1000000
rpc_timeout_multiplier: 1
max_rpc_timeout_millis: 1000000
total_timeout_millis: 5000000
methods:
- name: Recognize
flattening:
groups:
- parameters:
- config
- audio
required_fields:
- config
- audio
sample_code_init_fields:
- config.encoding=FLAC
- config.sample_rate_hertz=44100
- config.language_code="en-US"
- audio.uri=gs://bucket_name/file_name.flac
retry_codes_name: idempotent
retry_params_name: default
timeout_millis: 1000000
- name: LongRunningRecognize
flattening:
groups:
- parameters:
- config
- audio
required_fields:
- config
- audio
sample_code_init_fields:
- config.encoding=FLAC
- config.sample_rate_hertz=44100
- config.language_code="en-US"
- audio.uri=gs://bucket_name/file_name.flac
retry_codes_name: non_idempotent
retry_params_name: default
timeout_millis: 60000
long_running:
return_type: google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse
metadata_type: google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata
initial_poll_delay_millis: 20000
poll_delay_multiplier: 1.5
max_poll_delay_millis: 45000
total_poll_timeout_millis: 86400000
- name: StreamingRecognize
retry_codes_name: idempotent
retry_params_name: default
timeout_millis: 1000000

@ -0,0 +1,35 @@
{
"methodConfig": [
{
"name": [
{
"service": "google.cloud.speech.v1p1beta1.Speech",
"method": "Recognize"
},
{
"service": "google.cloud.speech.v1p1beta1.Speech",
"method": "StreamingRecognize"
}
],
"timeout": "5000s",
"retryPolicy": {
"initialBackoff": "0.100s",
"maxBackoff": "60s",
"backoffMultiplier": 1.3,
"retryableStatusCodes": [
"DEADLINE_EXCEEDED",
"UNAVAILABLE"
]
}
},
{
"name": [
{
"service": "google.cloud.speech.v1p1beta1.Speech",
"method": "LongRunningRecognize"
}
],
"timeout": "5000s"
}
]
}

@ -0,0 +1,36 @@
type: google.api.Service
config_version: 3
name: speech.googleapis.com
title: Cloud Speech-to-Text API
apis:
- name: google.cloud.speech.v1p1beta1.Speech
types:
- name: google.cloud.speech.v1p1beta1.LongRunningRecognizeMetadata
- name: google.cloud.speech.v1p1beta1.LongRunningRecognizeResponse
documentation:
summary: Converts audio to text by applying powerful neural network models.
overview: |-
# Introduction
Google Cloud Speech API provides speech recognition as a service.
backend:
rules:
- selector: 'google.cloud.speech.v1p1beta1.Speech.*'
deadline: 355.0
- selector: 'google.longrunning.Operations.*'
deadline: 355.0
authentication:
rules:
- selector: 'google.cloud.speech.v1p1beta1.Speech.*'
oauth:
canonical_scopes: |-
https://www.googleapis.com/auth/cloud-platform
- selector: 'google.longrunning.Operations.*'
oauth:
canonical_scopes: |-
https://www.googleapis.com/auth/cloud-platform
Loading…
Cancel
Save