// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.bigquery.storage.v1beta1;

import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/bigquery/storage/v1beta1/arrow.proto";
import "google/cloud/bigquery/storage/v1beta1/avro.proto";
import "google/cloud/bigquery/storage/v1beta1/read_options.proto";
import "google/cloud/bigquery/storage/v1beta1/table_reference.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/timestamp.proto";

option go_package = "cloud.google.com/go/bigquery/storage/apiv1beta1/storagepb;storagepb";
option java_package = "com.google.cloud.bigquery.storage.v1beta1";

// BigQuery storage API.
//
// The BigQuery storage API can be used to read data stored in BigQuery.
//
// The v1beta1 API is not yet officially deprecated, and will go through a full
// deprecation cycle (https://cloud.google.com/products#product-launch-stages)
// before the service is turned down. However, new code should use the v1 API
// going forward.
service BigQueryStorage {
  option (google.api.default_host) = "bigquerystorage.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/bigquery,"
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a new read session. A read session divides the contents of a
  // BigQuery table into one or more streams, which can then be used to read
  // data from the table. The read session also specifies properties of the
  // data to be read, such as a list of columns or a push-down filter describing
  // the rows to be returned.
  //
  // A particular row can be read by at most one stream. When the caller has
  // reached the end of each stream in the session, then all the data in the
  // table has been read.
  //
  // Read sessions automatically expire 6 hours after they are created and do
  // not require manual clean-up by the caller.
  rpc CreateReadSession(CreateReadSessionRequest) returns (ReadSession) {
    option (google.api.http) = {
      post: "/v1beta1/{table_reference.project_id=projects/*}"
      body: "*"
      additional_bindings {
        post: "/v1beta1/{table_reference.dataset_id=projects/*/datasets/*}"
        body: "*"
      }
    };
    option (google.api.method_signature) =
        "table_reference,parent,requested_streams";
  }

  // Reads rows from the table in the format prescribed by the read session.
  // Each response contains one or more table rows, up to a maximum of 10 MiB
  // per response; read requests which attempt to read individual rows larger
  // than this will fail.
  //
  // Each request also returns a set of stream statistics reflecting the
  // estimated total number of rows in the read stream. This number is computed
  // based on the total table size and the number of active streams in the read
  // session, and may change as other streams continue to read data.
  rpc ReadRows(ReadRowsRequest) returns (stream ReadRowsResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{read_position.stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "read_position";
  }

  // Creates additional streams for a ReadSession. This API can be used to
  // dynamically adjust the parallelism of a batch processing task upwards by
  // adding additional workers.
  rpc BatchCreateReadSessionStreams(BatchCreateReadSessionStreamsRequest)
      returns (BatchCreateReadSessionStreamsResponse) {
    option (google.api.http) = {
      post: "/v1beta1/{session.name=projects/*/sessions/*}"
      body: "*"
    };
    option (google.api.method_signature) = "session,requested_streams";
  }

  // Causes a single stream in a ReadSession to gracefully stop. This
  // API can be used to dynamically adjust the parallelism of a batch processing
  // task downwards without losing data.
  //
  // This API does not delete the stream -- it remains visible in the
  // ReadSession, and any data processed by the stream is not released to other
  // streams. However, no additional data will be assigned to the stream once
  // this call completes. Callers must continue reading data on the stream until
  // the end of the stream is reached so that data which has already been
  // assigned to the stream will be processed.
  //
  // This method will return an error if there are no other live streams
  // in the Session, or if SplitReadStream() has been called on the given
  // Stream.
  rpc FinalizeStream(FinalizeStreamRequest) returns (google.protobuf.Empty) {
    option (google.api.http) = {
      post: "/v1beta1/{stream.name=projects/*/streams/*}"
      body: "*"
    };
    option (google.api.method_signature) = "stream";
  }

  // Splits a given read stream into two Streams. These streams are referred to
  // as the primary and the residual of the split. The original stream can still
  // be read from in the same manner as before. Both of the returned streams can
  // also be read from, and the total rows returned by both child streams will
  // be the same as the rows read from the original stream.
  //
  // Moreover, the two child streams will be allocated back to back in the
  // original Stream. Concretely, it is guaranteed that for streams Original,
  // Primary, and Residual, that Original[0-j] = Primary[0-j] and
  // Original[j-n] = Residual[0-m] once the streams have been read to
  // completion.
  //
  // This method is guaranteed to be idempotent.
  rpc SplitReadStream(SplitReadStreamRequest)
      returns (SplitReadStreamResponse) {
    option (google.api.http) = {
      get: "/v1beta1/{original_stream.name=projects/*/streams/*}"
    };
    option (google.api.method_signature) = "original_stream";
  }
}
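
// The following is a minimal usage sketch (not part of the API definition)
// showing the intended CreateReadSession -> ReadRows flow with the generated
// Go client from the go_package above. Names such as `projectID` and the
// dataset/table coordinates are placeholders, and error handling is
// abbreviated; treat this as an illustration of the call sequence rather than
// a definitive implementation.
//
//   import (
//       "context"
//       "io"
//
//       storage "cloud.google.com/go/bigquery/storage/apiv1beta1"
//       storagepb "cloud.google.com/go/bigquery/storage/apiv1beta1/storagepb"
//   )
//
//   func readTable(ctx context.Context, projectID string) error {
//       client, err := storage.NewBigQueryStorageClient(ctx)
//       if err != nil {
//           return err
//       }
//       defer client.Close()
//
//       // Create a session; the server chooses a reasonable stream count
//       // when RequestedStreams is left at 0.
//       session, err := client.CreateReadSession(ctx, &storagepb.CreateReadSessionRequest{
//           TableReference: &storagepb.TableReference{
//               ProjectId: projectID, DatasetId: "my_dataset", TableId: "my_table",
//           },
//           Parent: "projects/" + projectID,
//           Format: storagepb.DataFormat_AVRO,
//       })
//       if err != nil {
//           return err
//       }
//
//       // Read every stream to completion; each row is assigned to exactly
//       // one stream. row_count lets progress be tracked independently of
//       // the output format.
//       var rows int64
//       for _, s := range session.GetStreams() {
//           rr, err := client.ReadRows(ctx, &storagepb.ReadRowsRequest{
//               ReadPosition: &storagepb.StreamPosition{Stream: s},
//           })
//           if err != nil {
//               return err
//           }
//           for {
//               resp, err := rr.Recv()
//               if err == io.EOF {
//                   break
//               }
//               if err != nil {
//                   return err
//               }
//               rows += resp.GetRowCount()
//           }
//       }
//       return nil
//   }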

// Information about a single data stream within a read session.
message Stream {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/Stream"
    pattern: "projects/{project}/locations/{location}/streams/{stream}"
  };

  // Name of the stream, in the form
  // `projects/{project_id}/locations/{location}/streams/{stream_id}`.
  string name = 1;
}

// Expresses a point within a given stream using an offset position.
message StreamPosition {
  // Identifier for a given Stream.
  Stream stream = 1;

  // Position in the stream.
  int64 offset = 2;
}

// Information returned from a `CreateReadSession` request.
message ReadSession {
  option (google.api.resource) = {
    type: "bigquerystorage.googleapis.com/ReadSession"
    pattern: "projects/{project}/locations/{location}/sessions/{session}"
  };

  // Unique identifier for the session, in the form
  // `projects/{project_id}/locations/{location}/sessions/{session_id}`.
  string name = 1;

  // Time at which the session becomes invalid. After this time, subsequent
  // requests to read this Session will return errors.
  google.protobuf.Timestamp expire_time = 2;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields.
  oneof schema {
    // Avro schema.
    AvroSchema avro_schema = 5;

    // Arrow schema.
    ArrowSchema arrow_schema = 6;
  }

  // Streams associated with this session.
  repeated Stream streams = 4;

  // Table that this ReadSession is reading from.
  TableReference table_reference = 7;

  // Any modifiers which are applied when reading from the specified table.
  TableModifiers table_modifiers = 8;

  // The strategy to use for distributing data among the streams.
  ShardingStrategy sharding_strategy = 9;
}

// Data format for input or output data.
enum DataFormat {
  // Data format is unspecified.
  DATA_FORMAT_UNSPECIFIED = 0;

  // Avro is a standard open source row based file format.
  // See https://avro.apache.org/ for more details.
  AVRO = 1;

  // Arrow is a standard open source column-based message format.
  // See https://arrow.apache.org/ for more details.
  ARROW = 3;
}

// Strategy for distributing data among multiple streams in a read session.
enum ShardingStrategy {
  // Same as LIQUID.
  SHARDING_STRATEGY_UNSPECIFIED = 0;

  // Assigns data to each stream based on the client's read rate. The faster the
  // client reads from a stream, the more data is assigned to the stream. In
  // this strategy, it's possible to read all data from a single stream even if
  // there are other streams present.
  LIQUID = 1;

  // Assigns data to each stream such that roughly the same number of rows can
  // be read from each stream. Because the server-side unit for assigning data
  // is collections of rows, the API does not guarantee that each stream will
  // return the same number of rows. Additionally, the limits are enforced based
  // on the number of pre-filtering rows, so some filters can lead to lopsided
  // assignments.
  BALANCED = 2;
}

// Creates a new read session, which may include additional options such as
// requested parallelism, projection filters and constraints.
message CreateReadSessionRequest {
  // Required. Reference to the table to read.
  TableReference table_reference = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. String of the form `projects/{project_id}` indicating the
  // project this ReadSession is associated with. This is the project that will
  // be billed for usage.
  string parent = 6 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "cloudresourcemanager.googleapis.com/Project"
    }
  ];

  // Any modifiers to the Table (e.g. snapshot timestamp).
  TableModifiers table_modifiers = 2;

  // Initial number of streams. If unset or 0, we will
  // provide a number of streams so as to produce reasonable throughput. Must be
  // non-negative. The number of streams may be lower than the requested number,
  // depending on the amount of parallelism that is reasonable for the table and
  // the maximum amount of parallelism allowed by the system.
  //
  // Streams must be read starting from offset 0.
  int32 requested_streams = 3;

  // Read options for this session (e.g. column selection, filters).
  TableReadOptions read_options = 4;

  // Data output format. Currently defaults to Avro.
  // DATA_FORMAT_UNSPECIFIED is not supported.
  DataFormat format = 5;

  // The strategy to use for distributing data among multiple streams. Currently
  // defaults to liquid sharding.
  ShardingStrategy sharding_strategy = 7;
}

// A request to read rows via `ReadRows` must provide Stream position
// information.
message ReadRowsRequest {
  // Required. Identifier of the position in the stream to start reading from.
  // The offset requested must be less than the last row read from ReadRows.
  // Requesting a larger offset is undefined.
  StreamPosition read_position = 1 [(google.api.field_behavior) = REQUIRED];
}

// Progress information for a given Stream.
message StreamStatus {
  // Number of estimated rows in the current stream. May change over time as
  // different readers in the stream progress at rates which are relatively fast
  // or slow.
  int64 estimated_row_count = 1;

  // A value in the range [0.0, 1.0] that represents the fraction of rows
  // assigned to this stream that have been processed by the server. In the
  // presence of read filters, the server may process more rows than it returns,
  // so this value reflects progress through the pre-filtering rows.
  //
  // This value is only populated for sessions created through the BALANCED
  // sharding strategy.
  float fraction_consumed = 2;

  // Represents the progress of the current stream.
  Progress progress = 4;

  // Whether this stream can be split. For sessions that use the LIQUID sharding
  // strategy, this value is always false. For BALANCED sessions, this value is
  // false when enough data have been read such that no more splits are possible
  // at that point or beyond. For small tables or streams that are the result of
  // a chain of splits, this value may never be true.
  bool is_splittable = 3;
}

message Progress {
  // The fraction of rows assigned to the stream that have been processed by the
  // server so far, not including the rows in the current response message.
  //
  // This value, along with `at_response_end`, can be used to interpolate the
  // progress made as the rows in the message are being processed using the
  // following formula: `at_response_start + (at_response_end -
  // at_response_start) * rows_processed_from_response / rows_in_response`.
  //
  // Note that if a filter is provided, the `at_response_end` value of the
  // previous response may not necessarily be equal to the `at_response_start`
  // value of the current response.
  float at_response_start = 1;

  // Similar to `at_response_start`, except that this value includes the rows in
  // the current response.
  float at_response_end = 2;
}
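
// A worked example of the interpolation formula above (all numbers are
// hypothetical): if a response reports at_response_start = 0.25 and
// at_response_end = 0.35 and carries 200 rows, then after the client has
// processed 50 of those rows the interpolated progress is
// 0.25 + (0.35 - 0.25) * 50 / 200 = 0.275.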

// Information on whether the current connection is being throttled.
message ThrottleStatus {
  // How much this connection is being throttled.
  // 0 is no throttling, 100 is completely throttled.
  int32 throttle_percent = 1;
}

// A response from calling `ReadRows` may include row data, progress and
// throttling information.
message ReadRowsResponse {
  // Row data is returned in the format specified during session creation.
  oneof rows {
    // Serialized row data in AVRO format.
    AvroRows avro_rows = 3;

    // Serialized row data in Arrow RecordBatch format.
    ArrowRecordBatch arrow_record_batch = 4;
  }

  // Number of serialized rows in the rows block. This value is recorded here,
  // in addition to the row_count values in the output-specific messages in
  // `rows`, so that code which needs to record progress through the stream can
  // do so in an output format-independent way.
  int64 row_count = 6;

  // Estimated stream statistics.
  StreamStatus status = 2;

  // Throttling status. If unset, the latest response still describes
  // the current throttling status.
  ThrottleStatus throttle_status = 5;

  // The schema for the read. If read_options.selected_fields is set, the
  // schema may be different from the table schema as it will only contain
  // the selected fields. This schema is equivalent to the one returned by
  // CreateSession. This field is only populated in the first ReadRowsResponse
  // RPC.
  oneof schema {
    // Output only. Avro schema.
    AvroSchema avro_schema = 7 [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. Arrow schema.
    ArrowSchema arrow_schema = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}

// Information needed to request additional streams for an established read
// session.
message BatchCreateReadSessionStreamsRequest {
  // Required. Must be a non-expired session obtained from a call to
  // CreateReadSession. Only the name field needs to be set.
  ReadSession session = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Number of new streams requested. Must be positive.
  // Number of added streams may be less than this; see CreateReadSessionRequest
  // for more information.
  int32 requested_streams = 2 [(google.api.field_behavior) = REQUIRED];
}

// The response from `BatchCreateReadSessionStreams` returns the stream
// identifiers for the newly created streams.
message BatchCreateReadSessionStreamsResponse {
  // Newly added streams.
  repeated Stream streams = 1;
}
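
// A minimal sketch (not part of the API definition) of scaling a session up,
// assuming the same hypothetical `client` and `session` values as in the
// earlier CreateReadSession example; the new streams can then be handed to
// additional workers:
//
//   resp, err := client.BatchCreateReadSessionStreams(ctx,
//       &storagepb.BatchCreateReadSessionStreamsRequest{
//           Session:          session,
//           RequestedStreams: 2,
//       })
//   if err != nil {
//       return err
//   }
//   for _, s := range resp.GetStreams() {
//       // Dispatch s to a new worker, which reads it via ReadRows.
//   }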

// Request information for invoking `FinalizeStream`.
message FinalizeStreamRequest {
  // Required. Stream to finalize.
  Stream stream = 2 [(google.api.field_behavior) = REQUIRED];
}

// Request information for `SplitReadStream`.
message SplitReadStreamRequest {
  // Required. Stream to split.
  Stream original_stream = 1 [(google.api.field_behavior) = REQUIRED];

  // A value in the range (0.0, 1.0) that specifies the fractional point at
  // which the original stream should be split. The actual split point is
  // evaluated on pre-filtered rows, so if a filter is provided, then there is
  // no guarantee that the division of the rows between the new child streams
  // will be proportional to this fractional value. Additionally, because the
  // server-side unit for assigning data is collections of rows, this fraction
  // will always map to a data storage boundary on the server side.
  float fraction = 2;
}

// Response from `SplitReadStream`.
message SplitReadStreamResponse {
  // Primary stream, which contains the beginning portion of
  // |original_stream|. An empty value indicates that the original stream can no
  // longer be split.
  Stream primary_stream = 1;

  // Remainder stream, which contains the tail of |original_stream|. An empty
  // value indicates that the original stream can no longer be split.
  Stream remainder_stream = 2;
}
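
// A minimal sketch (not part of the API definition) of splitting a stream at
// its midpoint, using the same hypothetical `client` as above; an empty
// primary indicates the stream could not be split further:
//
//   resp, err := client.SplitReadStream(ctx, &storagepb.SplitReadStreamRequest{
//       OriginalStream: stream,
//       Fraction:       0.5,
//   })
//   if err != nil {
//       return err
//   }
//   if resp.GetPrimaryStream() == nil {
//       // The original stream can no longer be split; keep reading it as-is.
//   }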