// Copyright 2021 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.aiplatform.v1;

import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/aiplatform/v1/encryption_spec.proto";
import "google/cloud/aiplatform/v1/io.proto";
import "google/cloud/aiplatform/v1/machine_resources.proto";
import "google/cloud/aiplatform/v1/manual_batch_tuning_parameters.proto";
import "google/cloud/aiplatform/v1/model.proto";
import "google/cloud/aiplatform/v1/pipeline_state.proto";
import "google/protobuf/struct.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
import "google/api/annotations.proto";

option csharp_namespace = "Google.Cloud.AIPlatform.V1";
option go_package = "google.golang.org/genproto/googleapis/cloud/aiplatform/v1;aiplatform";
option java_multiple_files = true;
option java_outer_classname = "TrainingPipelineProto";
option java_package = "com.google.cloud.aiplatform.v1";
option php_namespace = "Google\\Cloud\\AIPlatform\\V1";
option ruby_package = "Google::Cloud::AIPlatform::V1";

// The TrainingPipeline orchestrates tasks associated with training a Model. It
// always executes the training task, and optionally may also
// export data from Vertex AI's Dataset which becomes the training input,
// [upload][google.cloud.aiplatform.v1.ModelService.UploadModel] the Model to Vertex AI, and evaluate the
// Model.
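//
// A minimal, illustrative example (not part of the official reference) of a
// TrainingPipeline in JSON form. All values are placeholders, and the shape of
// `trainingTaskInputs` is defined entirely by the chosen
// `trainingTaskDefinition` schema:
//
//   {
//     "displayName": "my-training-pipeline",
//     "trainingTaskDefinition":
//       "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml",
//     "trainingTaskInputs": { ... },
//     "inputDataConfig": { "datasetId": "1234567890" },
//     "modelToUpload": { "displayName": "my-model" }
//   }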
message TrainingPipeline {
  option (google.api.resource) = {
    type: "aiplatform.googleapis.com/TrainingPipeline"
    pattern: "projects/{project}/locations/{location}/trainingPipelines/{training_pipeline}"
  };

  // Output only. Resource name of the TrainingPipeline.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The user-defined name of this TrainingPipeline.
  string display_name = 2 [(google.api.field_behavior) = REQUIRED];

  // Specifies Vertex AI owned input data that may be used for training the
  // Model. The TrainingPipeline's [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition] should make
  // clear whether this config is used and if there are any special requirements
  // on how it should be filled. If nothing about this config is mentioned in
  // the [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition], then it should be assumed that the
  // TrainingPipeline does not depend on this configuration.
  InputDataConfig input_data_config = 3;

  // Required. A Google Cloud Storage path to the YAML file that defines the training task
  // which is responsible for producing the model artifact, and may also include
  // additional auxiliary work.
  // The definition files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/trainingjob/definition/.
  // Note: the URI given on output will be immutable and will probably differ,
  // possibly including the URI scheme, from the one given on input. The output
  // URI will point to a location where the user has read access only.
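  // For illustration only (hypothetical file names; the authoritative list
  // lives in the bucket above): a custom training task might reference
  // "gs://google-cloud-aiplatform/schema/trainingjob/definition/custom_task_1.0.0.yaml",
  // while an AutoML image classification task might reference
  // "gs://google-cloud-aiplatform/schema/trainingjob/definition/automl_image_classification_1.0.0.yaml".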
  string training_task_definition = 4 [(google.api.field_behavior) = REQUIRED];

  // Required. The training task's parameter(s), as specified in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]'s `inputs`.
  google.protobuf.Value training_task_inputs = 5 [(google.api.field_behavior) = REQUIRED];

  // Output only. The metadata information as specified in the [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition]'s
  // `metadata`. This metadata is auxiliary runtime and final information
  // about the training task. While the pipeline is running, this information is
  // populated only on a best-effort basis. Only present if the
  // pipeline's [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition] contains a `metadata` object.
  google.protobuf.Value training_task_metadata = 6 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Describes the Model that may be uploaded (via [ModelService.UploadModel][google.cloud.aiplatform.v1.ModelService.UploadModel])
  // by this TrainingPipeline. The TrainingPipeline's
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition] should make clear whether this Model
  // description should be populated, and if there are any special requirements
  // regarding how it should be filled. If nothing is mentioned in the
  // [training_task_definition][google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition], then it should be assumed that this field
  // should not be filled and the training task either uploads the Model without
  // needing this information, or the training task does not support
  // uploading a Model as part of the pipeline.
  // When the pipeline's state becomes `PIPELINE_STATE_SUCCEEDED` and
  // the trained Model has been uploaded into Vertex AI, the
  // model_to_upload's resource [name][google.cloud.aiplatform.v1.Model.name] is populated. The Model
  // is always uploaded into the same Project and Location as this pipeline.
  Model model_to_upload = 7;

  // Output only. The detailed state of the pipeline.
  PipelineState state = 9 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Only populated when the pipeline's state is `PIPELINE_STATE_FAILED` or
  // `PIPELINE_STATE_CANCELLED`.
  google.rpc.Status error = 10 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was created.
  google.protobuf.Timestamp create_time = 11 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline first entered the
  // `PIPELINE_STATE_RUNNING` state.
  google.protobuf.Timestamp start_time = 12 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline entered any of the following states:
  // `PIPELINE_STATE_SUCCEEDED`, `PIPELINE_STATE_FAILED`,
  // `PIPELINE_STATE_CANCELLED`.
  google.protobuf.Timestamp end_time = 13 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Time when the TrainingPipeline was most recently updated.
  google.protobuf.Timestamp update_time = 14 [(google.api.field_behavior) = OUTPUT_ONLY];

  // The labels with user-defined metadata to organize TrainingPipelines.
  //
  // Label keys and values can be no longer than 64 characters
  // (Unicode codepoints), and can only contain lowercase letters, numeric
  // characters, underscores and dashes. International characters are allowed.
  //
  // See https://goo.gl/xmQnxf for more information and examples of labels.
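  //
  // An illustrative (made-up) example, in JSON form:
  //   "labels": { "team": "vision-research", "environment": "dev" }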
  map<string, string> labels = 15;

  // Customer-managed encryption key spec for a TrainingPipeline. If set, this
  // TrainingPipeline will be secured by this key.
  //
  // Note: a Model trained by this TrainingPipeline is also secured by this key
  // if [model_to_upload][google.cloud.aiplatform.v1.TrainingPipeline.model_to_upload] does not set a separate encryption spec.
  EncryptionSpec encryption_spec = 18;
}

// Specifies Vertex AI owned input data to be used for training, and
// possibly evaluating, the Model.
message InputDataConfig {
  // Instructions for how the input data should be split between the
  // training, validation and test sets.
  // If no split type is provided, [fraction_split][google.cloud.aiplatform.v1.InputDataConfig.fraction_split] is used by default.
  oneof split {
    // Split based on fractions defining the size of each set.
    FractionSplit fraction_split = 2;

    // Split based on the provided filters for each set.
    FilterSplit filter_split = 3;

    // Supported only for tabular Datasets.
    //
    // Split based on a predefined key.
    PredefinedSplit predefined_split = 4;

    // Supported only for tabular Datasets.
    //
    // Split based on the timestamp of the input data pieces.
    TimestampSplit timestamp_split = 5;
  }

  // Only applicable to Custom and Hyperparameter Tuning TrainingPipelines.
  //
  // The destination to which the training data is to be written.
  //
  // Supported destination file formats:
  // * For non-tabular data: "jsonl".
  // * For tabular data: "csv" and "bigquery".
  //
  // The following Vertex AI environment variables are passed to containers
  // or Python modules of the training task when this field is set:
  //
  // * AIP_DATA_FORMAT : Exported data format.
  // * AIP_TRAINING_DATA_URI : Sharded exported training data URIs.
  // * AIP_VALIDATION_DATA_URI : Sharded exported validation data URIs.
  // * AIP_TEST_DATA_URI : Sharded exported test data URIs.
  oneof destination {
    // The Cloud Storage location where the training data is to be
    // written. In the given directory a new directory is created with
    // name:
    // `dataset-<dataset-id>-<annotation-type>-<timestamp-of-training-call>`
    // where timestamp is in YYYY-MM-DDThh:mm:ss.sssZ ISO-8601 format.
    // All training input data is written into that directory.
    //
    // The Vertex AI environment variables representing Cloud Storage
    // data URIs are represented in the Cloud Storage wildcard
    // format to support sharded data, e.g. "gs://.../training-*.jsonl":
    //
    // * AIP_DATA_FORMAT = "jsonl" for non-tabular data, "csv" for tabular data
    // * AIP_TRAINING_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/training-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_VALIDATION_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/validation-*.${AIP_DATA_FORMAT}"
    //
    // * AIP_TEST_DATA_URI =
    // "gcs_destination/dataset-<dataset-id>-<annotation-type>-<time>/test-*.${AIP_DATA_FORMAT}"
    GcsDestination gcs_destination = 8;

    // Only applicable to custom training with a tabular Dataset that has a
    // BigQuery source.
    //
    // The BigQuery project location where the training data is to be written.
    // In the given project a new dataset is created with name
    // `dataset_<dataset-id>_<annotation-type>_<timestamp-of-training-call>`
    // where timestamp is in YYYY_MM_DDThh_mm_ss_sssZ format. All training
    // input data is written into that dataset. In the dataset three
    // tables are created, `training`, `validation` and `test`.
    //
    // * AIP_DATA_FORMAT = "bigquery".
    // * AIP_TRAINING_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.training"
    //
    // * AIP_VALIDATION_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.validation"
    //
    // * AIP_TEST_DATA_URI =
    // "bigquery_destination.dataset_<dataset-id>_<annotation-type>_<time>.test"
    BigQueryDestination bigquery_destination = 10;
  }

  // Required. The ID of the Dataset in the same Project and Location whose data will be
  // used to train the Model. The Dataset must use a schema compatible with the
  // Model being trained, and what is compatible should be described in the
  // used TrainingPipeline's [training_task_definition]
  // [google.cloud.aiplatform.v1.TrainingPipeline.training_task_definition].
  // For tabular Datasets, all of their data is exported to training, to pick
  // and choose from.
  string dataset_id = 1 [(google.api.field_behavior) = REQUIRED];

  // Applicable only to Datasets that have DataItems and Annotations.
  //
  // A filter on Annotations of the Dataset. Only Annotations that both
  // match this filter and belong to DataItems not ignored by the split method
  // are used, respectively, in the training, validation or test role, depending
  // on the role of the DataItem they are on (for auto-assigned DataItems that
  // role is decided by Vertex AI). A filter with the same syntax as the one used in
  // [ListAnnotations][google.cloud.aiplatform.v1.DatasetService.ListAnnotations] may be used, but note that
  // here it filters across all Annotations of the Dataset, and not just within
  // a single DataItem.
  string annotations_filter = 6;

  // Applicable only to custom training with Datasets that have DataItems and
  // Annotations.
  //
  // Cloud Storage URI that points to a YAML file describing the annotation
  // schema. The schema is defined as an OpenAPI 3.0.2 [Schema
  // Object](https://github.com/OAI/OpenAPI-Specification/blob/main/versions/3.0.2.md#schemaObject).
  // The schema files that can be used here are found in
  // gs://google-cloud-aiplatform/schema/dataset/annotation/; note that the
  // chosen schema must be consistent with
  // [metadata][google.cloud.aiplatform.v1.Dataset.metadata_schema_uri] of the Dataset specified by
  // [dataset_id][google.cloud.aiplatform.v1.InputDataConfig.dataset_id].
  //
  // Only Annotations that both match this schema and belong to DataItems not
  // ignored by the split method are used, respectively, in the training,
  // validation or test role, depending on the role of the DataItem they are on.
  //
  // When used in conjunction with [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter], the Annotations used
  // for training are filtered by both [annotations_filter][google.cloud.aiplatform.v1.InputDataConfig.annotations_filter] and
  // [annotation_schema_uri][google.cloud.aiplatform.v1.InputDataConfig.annotation_schema_uri].
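  // For illustration only (a hypothetical file name; consult the bucket above
  // for the current set): an image classification pipeline might use
  // "gs://google-cloud-aiplatform/schema/dataset/annotation/image_classification_1.0.0.yaml".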
  string annotation_schema_uri = 9;
}

// Assigns the input data to training, validation, and test sets as per the
// given fractions. Any of `training_fraction`, `validation_fraction` and
// `test_fraction` may optionally be provided; together they must sum to at
// most 1. If the provided fractions sum to less than 1, the remainder is
// assigned to sets as decided by Vertex AI. If none of the fractions are set,
// by default roughly 80% of data is used for training, 10% for validation,
// and 10% for test.
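//
// An illustrative (non-normative) example, in JSON form, of an explicit
// 80/10/10 split:
//
//   "fractionSplit": {
//     "trainingFraction": 0.8,
//     "validationFraction": 0.1,
//     "testFraction": 0.1
//   }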
message FractionSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;
}

// Assigns input data to training, validation, and test sets based on the given
// filters; data pieces not matched by any filter are ignored. Currently only
// supported for Datasets containing DataItems.
// If any of the filters in this message is meant to match nothing, it can be
// set to '-' (the minus sign).
//
// Supported only for unstructured Datasets.
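//
// An illustrative (non-normative) sketch in JSON form; the two filter strings
// are placeholders rather than verified ListDataItems filter expressions, and
// the documented '-' value is used so that no test set is produced:
//
//   "filterSplit": {
//     "trainingFilter": "<filter matching the training DataItems>",
//     "validationFilter": "<filter matching the validation DataItems>",
//     "testFilter": "-"
//   }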
message FilterSplit {
  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to train the Model. A filter with the same syntax
  // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems] may be used. If a
  // single DataItem is matched by more than one of the FilterSplit filters,
  // then it is assigned to the first set that applies to it in the
  // training, validation, test order.
  string training_filter = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to validate the Model. A filter with the same syntax
  // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems] may be used. If a
  // single DataItem is matched by more than one of the FilterSplit filters,
  // then it is assigned to the first set that applies to it in the
  // training, validation, test order.
  string validation_filter = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. A filter on DataItems of the Dataset. DataItems that match
  // this filter are used to test the Model. A filter with the same syntax
  // as the one used in [DatasetService.ListDataItems][google.cloud.aiplatform.v1.DatasetService.ListDataItems] may be used. If a
  // single DataItem is matched by more than one of the FilterSplit filters,
  // then it is assigned to the first set that applies to it in the
  // training, validation, test order.
  string test_filter = 3 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on the
// value of a provided key.
//
// Supported only for tabular Datasets.
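//
// An illustrative (non-normative) example in JSON form, assuming a
// hypothetical column named "ml_use" whose values are "training",
// "validation", or "test":
//
//   "predefinedSplit": { "key": "ml_use" }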
message PredefinedSplit {
  // Required. The key is the name of one of the Dataset's data columns.
  // The value of the key (either the label's value or the value in the column)
  // must be one of {`training`, `validation`, `test`}, and it defines to which
  // set the given piece of data is assigned. If for a piece of data the key
  // is not present or has an invalid value, that piece is ignored by the
  // pipeline.
  string key = 1 [(google.api.field_behavior) = REQUIRED];
}

// Assigns input data to training, validation, and test sets based on
// provided timestamps. The youngest data pieces are assigned to the training
// set, the next to the validation set, and the oldest to the test set.
//
// Supported only for tabular Datasets.
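//
// An illustrative (non-normative) example in JSON form, assuming a
// hypothetical RFC 3339 timestamp column named "event_time":
//
//   "timestampSplit": {
//     "key": "event_time",
//     "trainingFraction": 0.8,
//     "validationFraction": 0.1,
//     "testFraction": 0.1
//   }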
message TimestampSplit {
  // The fraction of the input data that is to be used to train the Model.
  double training_fraction = 1;

  // The fraction of the input data that is to be used to validate the Model.
  double validation_fraction = 2;

  // The fraction of the input data that is to be used to evaluate the Model.
  double test_fraction = 3;

  // Required. The key is the name of one of the Dataset's data columns.
  // The values of the key (the values in the column) must be in RFC 3339
  // `date-time` format, where `time-offset` = `"Z"`
  // (e.g. 1985-04-12T23:20:50.52Z). If for a piece of data the key is not
  // present or has an invalid value, that piece is ignored by the pipeline.
  string key = 4 [(google.api.field_behavior) = REQUIRED];
}