// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1/arrow.proto";
import "google/cloud/bigquery/storage/v1/avro.proto";
import "google/cloud/bigquery/storage/v1/protobuf.proto";
import "google/cloud/bigquery/storage/v1/stream.proto";
import "google/cloud/bigquery/storage/v1/table.proto";
import "google/protobuf/timestamp.proto";
import "google/protobuf/wrappers.proto";
import "google/rpc/status.proto";

// Per-language code generation settings.
option csharp_namespace = "Google.Cloud.BigQuery.Storage.V1";
option go_package = "google.golang.org/genproto/googleapis/cloud/bigquery/storage/v1;storage";
option java_multiple_files = true;
option java_outer_classname = "StorageProto";
option java_package = "com.google.cloud.bigquery.storage.v1";
option php_namespace = "Google\\Cloud\\BigQuery\\Storage\\V1";

// Declares the `bigquery.googleapis.com/Table` resource type and its name
// pattern; `resource_reference` annotations in this file refer to it.
option (google.api.resource_definition) = {
  type: "bigquery.googleapis.com/Table"
  pattern: "projects/{project}/datasets/{dataset}/tables/{table}"
};

// BigQuery Read API.
//
// The Read API is used to read data from BigQuery.
service BigQueryRead {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a new read session. A read session partitions the contents of a
  // BigQuery table into one or more streams, each of which can then be used
  // to read data from the table. The session also fixes properties of the
  // data to be read, such as the list of columns or a push-down filter
  // describing which rows are returned.
  //
  // Any given row can be read by at most one stream. Once the caller has
  // reached the end of every stream in the session, all the data in the
  // table has been read.
  //
  // Data is assigned to streams so that roughly the same number of rows can
  // be read from each one. Because the server-side unit for assigning data
  // is collections of rows, the API does not guarantee that every stream
  // returns the same number of rows. Additionally, the limits are enforced
  // on the number of pre-filtered rows, so some filters can lead to lopsided
  // assignments.
  //
  // Read sessions expire automatically 6 hours after they are created and
  // need no manual clean-up by the caller.
  rpc CreateReadSession(CreateReadSessionRequest) returns (ReadSession) {
    option (google.api.http) = {
      post: "/v1/{read_session.table=projects/*/datasets/*/tables/*}"
      body: "*"
    };
    option (google.api.method_signature) =
        "parent,read_session,max_stream_count";
  }

  // Reads rows from the stream in the format prescribed by the ReadSession.
  // Each response carries one or more table rows, up to a maximum of 100 MiB
  // per response; read requests that attempt to read individual rows larger
  // than 100 MiB will fail.
  //
  // Each request also returns a set of stream statistics reflecting the
  // current state of the stream.
  rpc ReadRows(ReadRowsRequest) returns (stream ReadRowsResponse) {
    option (google.api.http) = {
      get: "/v1/{read_stream=projects/*/locations/*/sessions/*/streams/*}"
    };
    option (google.api.method_signature) = "read_stream,offset";
  }

  // Splits a given `ReadStream` into two `ReadStream` objects, referred to as
  // the primary and the residual streams of the split. The original
  // `ReadStream` can still be read from in the same manner as before. Both
  // returned `ReadStream` objects can also be read from, and the rows
  // returned by the two child streams together will be the same as the rows
  // read from the original stream.
  //
  // Moreover, the two child streams are allocated back-to-back in the
  // original `ReadStream`. Concretely, for streams original, primary, and
  // residual, it is guaranteed that original[0-j] = primary[0-j] and
  // original[j-n] = residual[0-m] once the streams have been read to
  // completion.
  rpc SplitReadStream(SplitReadStreamRequest)
      returns (SplitReadStreamResponse) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/sessions/*/streams/*}"
    };
  }
}
// BigQuery Write API.
//
// The Write API is used to write data to BigQuery.
//
// For supplementary information about the Write API, see:
// https://cloud.google.com/bigquery/docs/write-api
service BigQueryWrite {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/bigquery.insertdata,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a write stream to the given table.
  //
  // In addition, every table has a special stream named '_default' to which
  // data can be written. That stream does not need to be created with
  // CreateWriteStream, and it can be used simultaneously by any number of
  // clients. Data written to it is considered committed as soon as an
  // acknowledgement is received.
  rpc CreateWriteStream(CreateWriteStreamRequest) returns (WriteStream) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/datasets/*/tables/*}"
      body: "write_stream"
    };
    option (google.api.method_signature) = "parent,write_stream";
  }

  // Appends data to the given stream.
  //
  // If `offset` is specified, it is checked against the end of the stream.
  // The server returns `OUT_OF_RANGE` in `AppendRowsResponse` if an attempt
  // is made to append to an offset beyond the current end of the stream, or
  // `ALREADY_EXISTS` if the user provides an `offset` that has already been
  // written to. The user can retry with an adjusted offset within the same
  // RPC connection. If `offset` is not specified, the append happens at the
  // end of the stream.
  //
  // The response contains an optional offset at which the append happened.
  // No offset information is returned for appends to a default stream.
  //
  // Responses are received in the same order in which requests are sent.
  // There will be one response for each successful inserted request.
  // Responses may optionally embed error information if the originating
  // AppendRequest was not successfully processed.
  //
  // When successfully appended data becomes visible in the table is governed
  // by the type of stream:
  //
  // * For COMMITTED streams (which includes the default stream), data is
  //   visible immediately upon successful append.
  //
  // * For BUFFERED streams, data is made visible via a subsequent `FlushRows`
  //   rpc which advances a cursor to a newer offset in the stream.
  //
  // * For PENDING streams, data is not made visible until the stream itself
  //   is finalized (via the `FinalizeWriteStream` rpc), and the stream is
  //   explicitly committed via the `BatchCommitWriteStreams` rpc.
  //
  // Note: For users coding against the gRPC api directly, it may be
  // necessary to supply the x-goog-request-params system parameter
  // with `write_stream=<full_write_stream_name>`.
  //
  // More information about system parameters:
  // https://cloud.google.com/apis/docs/system-parameters
  rpc AppendRows(stream AppendRowsRequest)
      returns (stream AppendRowsResponse) {
    option (google.api.http) = {
      post: "/v1/{write_stream=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "write_stream";
  }

  // Gets information about a write stream.
  rpc GetWriteStream(GetWriteStreamRequest) returns (WriteStream) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "name";
  }

  // Finalizes a write stream so that no new data can be appended to it.
  // Finalize is not supported on the '_default' stream.
  rpc FinalizeWriteStream(FinalizeWriteStreamRequest)
      returns (FinalizeWriteStreamResponse) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "name";
  }

  // Atomically commits a group of `PENDING` streams that belong to the same
  // `parent` table.
  //
  // Streams must be finalized before commit and cannot be committed multiple
  // times. Once a stream is committed, the data in the stream becomes
  // available for read operations.
  rpc BatchCommitWriteStreams(BatchCommitWriteStreamsRequest)
      returns (BatchCommitWriteStreamsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/datasets/*/tables/*}"
    };
    option (google.api.method_signature) = "parent";
  }

  // Flushes rows to a BUFFERED stream.
  //
  // When rows are appended to a BUFFERED stream, a flush operation is
  // required for those rows to become available for reading. A Flush
  // operation flushes up to any previously flushed offset in a BUFFERED
  // stream, to the offset specified in the request.
  //
  // Flush is not supported on the _default stream, since it is not BUFFERED.
  rpc FlushRows(FlushRowsRequest) returns (FlushRowsResponse) {
    option (google.api.http) = {
      post: "/v1/{write_stream=projects/*/datasets/*/tables/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "write_stream";
  }
}
// Request message for `CreateReadSession`.
message CreateReadSessionRequest {
  // Required. The request project that owns the session, in the form of
  // `projects/{project_id}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "cloudresourcemanager.googleapis.com/Project"
    }
  ];

  // Required. Session to be created.
  ReadSession read_session = 2 [(google.api.field_behavior) = REQUIRED];

  // Max initial number of streams. If unset or zero, the server will pick a
  // number of streams that produces reasonable throughput. Must be
  // non-negative. The number of streams may be lower than the requested
  // number, depending on the amount of parallelism that is reasonable for
  // the table. An error is returned if the max count is greater than the
  // current system max limit of 1,000.
  //
  // Streams must be read starting from offset 0.
  int32 max_stream_count = 3;
}
// Request message for `ReadRows`.
message ReadRowsRequest {
  // Required. Stream to read rows from.
  string read_stream = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/ReadStream"
    }
  ];

  // The offset requested must be less than the last row read from
  // `ReadRows`. Requesting a larger offset is undefined. If not specified,
  // reading starts from offset zero.
  int64 offset = 2;
}
// Information on whether the current connection is being throttled.
message ThrottleState {
  // How much this connection is being throttled. Zero means no throttling,
  // 100 means fully throttled.
  int32 throttle_percent = 1;
}
// Estimated stream statistics for a given read Stream.
message StreamStats {
  // Progress through a stream, reported as fractions of the rows assigned to
  // the stream.
  message Progress {
    // The fraction of rows assigned to the stream that have been processed
    // by the server so far, not including the rows in the current response
    // message.
    //
    // Together with `at_response_end`, this value can be used to interpolate
    // the progress made as the rows in the message are being processed,
    // using the following formula: `at_response_start + (at_response_end -
    // at_response_start) * rows_processed_from_response / rows_in_response`.
    //
    // Note that if a filter is provided, the `at_response_end` value of the
    // previous response may not necessarily be equal to the
    // `at_response_start` value of the current response.
    double at_response_start = 1;

    // Similar to `at_response_start`, except that this value includes the
    // rows in the current response.
    double at_response_end = 2;
  }

  // Represents the progress of the current stream.
  Progress progress = 2;
}
// Response from calling `ReadRows` may include row data, progress and
// throttling information.
message ReadRowsResponse {
  // Row data is returned in the format specified during session creation.
  oneof rows {
    // Serialized row data in AVRO format.
    AvroRows avro_rows = 3;

    // Serialized row data in Arrow RecordBatch format.
    ArrowRecordBatch arrow_record_batch = 4;
  }

  // Number of serialized rows in the rows block.
  int64 row_count = 6;

  // Statistics for the stream.
  StreamStats stats = 2;

  // Throttling state. If unset, the latest response still describes
  // the current throttling status.
  ThrottleState throttle_state = 5;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields. This schema is equivalent to the one returned by
  // `CreateReadSession`. This field is only populated in the first
  // ReadRowsResponse RPC.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 7 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}
// Request message for `SplitReadStream`.
message SplitReadStreamRequest {
  // Required. Name of the stream to split.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/ReadStream"
    }
  ];

  // A value in the range (0.0, 1.0) that specifies the fractional point at
  // which the original stream should be split. The actual split point is
  // evaluated on pre-filtered rows, so if a filter is provided there is no
  // guarantee that the division of rows between the new child streams will
  // be proportional to this fractional value. Additionally, because the
  // server-side unit for assigning data is collections of rows, this
  // fraction will always map to a data storage boundary on the server side.
  double fraction = 2;
}
// Response message for `SplitReadStream`.
message SplitReadStreamResponse {
  // Primary stream, which contains the beginning portion of
  // |original_stream|. An empty value indicates that the original stream can
  // no longer be split.
  ReadStream primary_stream = 1;

  // Remainder stream, which contains the tail of |original_stream|. An empty
  // value indicates that the original stream can no longer be split.
  ReadStream remainder_stream = 2;
}
// Request message for `CreateWriteStream`.
message CreateWriteStreamRequest {
  // Required. Reference to the table to which the stream belongs, in the
  // format of `projects/{project}/datasets/{dataset}/tables/{table}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquery.googleapis.com/Table"
    }
  ];

  // Required. Stream to be created.
  WriteStream write_stream = 2 [(google.api.field_behavior) = REQUIRED];
}
// Request message for `AppendRows`.
//
// Because AppendRows is a bidirectional streaming RPC, certain parts of the
// AppendRowsRequest need only be specified for the first request sent each
// time the gRPC network connection is opened/reopened.
message AppendRowsRequest {
  // ProtoData contains the data rows and schema when constructing append
  // requests.
  message ProtoData {
    // Proto schema used to serialize the data. This value only needs to be
    // provided as part of the first request on a gRPC network connection,
    // and will be ignored for subsequent requests on the connection.
    ProtoSchema writer_schema = 1;

    // Serialized row data in protobuf message format.
    // Currently, the backend expects the serialized rows to adhere to
    // proto2 semantics when appending rows, particularly with respect to
    // how default values are encoded.
    ProtoRows rows = 2;
  }

  // Required. The write_stream identifies the target of the append
  // operation, and only needs to be specified as part of the first request
  // on the gRPC connection. If provided for subsequent requests, it must
  // match the value of the first request.
  //
  // For explicitly created write streams, the format is:
  //
  // * `projects/{project}/datasets/{dataset}/tables/{table}/streams/{id}`
  //
  // For the special default stream, the format is:
  //
  // * `projects/{project}/datasets/{dataset}/tables/{table}/streams/_default`.
  string write_stream = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];

  // If present, the write is only performed if the next append offset is the
  // same as the provided value. If not present, the write is performed at
  // the current end of stream. Specifying a value for this field is not
  // allowed when calling AppendRows for the '_default' stream.
  google.protobuf.Int64Value offset = 2;

  // Input rows. The `writer_schema` field must be specified in the initial
  // request; currently, it is ignored if specified in following requests.
  // Following requests must have data in the same format as the initial
  // request.
  oneof rows {
    // Rows in proto format.
    ProtoData proto_rows = 4;
  }

  // Id set by the client to annotate its identity. Only the value set on the
  // initial request is respected.
  string trace_id = 6;
}
// Response message for `AppendRows`.
message AppendRowsResponse {
  // AppendResult is returned for successful append requests.
  message AppendResult {
    // The row offset at which the last append occurred. The offset will not
    // be set if appending using default streams.
    google.protobuf.Int64Value offset = 1;
  }

  // Either the result of a successful append or the error that was
  // encountered.
  oneof response {
    // Result if the append is successful.
    AppendResult append_result = 1;

    // Error returned when problems were encountered. If present, it
    // indicates that rows were not accepted into the system. Users can retry
    // or continue with other append requests within the same connection.
    //
    // Additional information about error signalling:
    //
    // ALREADY_EXISTS: Happens when an append specified an offset, and the
    // backend already has received data at this offset. Typically
    // encountered in retry scenarios, and can be ignored.
    //
    // OUT_OF_RANGE: Returned when the specified offset in the stream is
    // beyond the current end of the stream.
    //
    // INVALID_ARGUMENT: Indicates a malformed request or data.
    //
    // ABORTED: Request processing is aborted because of prior failures. The
    // request can be retried if the previous failure is addressed.
    //
    // INTERNAL: Indicates server side error(s) that can be retried.
    google.rpc.Status error = 2;
  }

  // If the backend detects a schema update, it is passed to the user so that
  // the user can use it to input the new type of message. It will be empty
  // when no schema updates have occurred.
  TableSchema updated_schema = 3;
}
// Request message for `GetWriteStream`.
message GetWriteStreamRequest {
  // Required. Name of the stream to get, in the form of
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];
}
// Request message for `BatchCommitWriteStreams`.
message BatchCommitWriteStreamsRequest {
  // Required. Parent table that all the streams should belong to, in the
  // form of `projects/{project}/datasets/{dataset}/tables/{table}`.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquery.googleapis.com/Table"
    }
  ];

  // Required. The group of streams that will be committed atomically.
  repeated string write_streams = 2 [(google.api.field_behavior) = REQUIRED];
}
// Response message for `BatchCommitWriteStreams`.
message BatchCommitWriteStreamsResponse {
  // The time at which the streams were committed, in microseconds
  // granularity. This field will only exist when there are no stream errors.
  // **Note** if this field is not set, it means the commit was not
  // successful.
  google.protobuf.Timestamp commit_time = 1;

  // Stream level errors if the commit failed. Only streams with an error
  // will be in the list.
  // If empty, there is no error and all streams are committed successfully.
  // If non empty, certain streams have errors and ZERO streams are committed
  // due to the atomicity guarantee.
  repeated StorageError stream_errors = 2;
}
// Request message for invoking `FinalizeWriteStream`.
message FinalizeWriteStreamRequest {
  // Required. Name of the stream to finalize, in the form of
  // `projects/{project}/datasets/{dataset}/tables/{table}/streams/{stream}`.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];
}
// Response message for `FinalizeWriteStream`.
message FinalizeWriteStreamResponse {
  // Number of rows in the finalized stream.
  int64 row_count = 1;
}
// Request message for `FlushRows`.
message FlushRowsRequest {
  // Required. The stream that is the target of the flush operation.
  string write_stream = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "bigquerystorage.googleapis.com/WriteStream"
    }
  ];

  // Ending offset of the flush operation. Rows before this offset (including
  // this offset) will be flushed.
  google.protobuf.Int64Value offset = 2;
}
// Response message for `FlushRows`.
message FlushRowsResponse {
  // The rows before this offset (including this offset) are flushed.
  int64 offset = 1;
}
// Structured custom BigQuery Storage error message. The error can be attached |
// as error details in the returned rpc Status. In particular, the use of error |
// codes allows more structured error handling, and reduces the need to evaluate |
// unstructured error text strings. |
message StorageError { |
// Error code for `StorageError`. |
enum StorageErrorCode { |
// Default error. |
// Table is not found in the system. |
// Stream is already committed. |
// Stream is not found. |
// Invalid Stream type. |
// For example, you try to commit a stream that is not pending. |
// Invalid Stream state. |
// For example, you try to commit a stream that is not finalized or is |
// garbaged. |
// Stream is finalized. |
// There is a schema mismatch and it is caused by user schema has extra |
// field than bigquery schema. |
// Offset already exists. |
// Offset out of range. |
} |
// BigQuery Storage specific error code. |
StorageErrorCode code = 1; |
// Name of the failed entity. |
string entity = 2; |
// Message that describes the error. |
string error_message = 3; |