feat: DataScans service

feat: added StorageFormat.iceberg
chore: formatting changes

PiperOrigin-RevId: 496586743
pull/763/head
Google APIs 2 years ago committed by Copybara-Service
parent cffce7313a
commit 58f5c43973
  1. 6
      google/cloud/dataplex/v1/BUILD.bazel
  2. 207
      google/cloud/dataplex/v1/data_profile.proto
  3. 236
      google/cloud/dataplex/v1/data_quality.proto
  4. 28
      google/cloud/dataplex/v1/dataplex_v1.yaml
  5. 535
      google/cloud/dataplex/v1/datascans.proto
  6. 122
      google/cloud/dataplex/v1/logs.proto
  7. 13
      google/cloud/dataplex/v1/metadata.proto
  8. 94
      google/cloud/dataplex/v1/processing.proto

@@ -23,8 +23,12 @@ proto_library(
srcs = [
"analyze.proto",
"content.proto",
"data_profile.proto",
"data_quality.proto",
"datascans.proto",
"logs.proto",
"metadata.proto",
"processing.proto",
"resources.proto",
"service.proto",
"tasks.proto",
@@ -103,6 +107,8 @@ java_gapic_test(
test_classes = [
"com.google.cloud.dataplex.v1.ContentServiceClientHttpJsonTest",
"com.google.cloud.dataplex.v1.ContentServiceClientTest",
"com.google.cloud.dataplex.v1.DataScanServiceClientHttpJsonTest",
"com.google.cloud.dataplex.v1.DataScanServiceClientTest",
"com.google.cloud.dataplex.v1.DataplexServiceClientHttpJsonTest",
"com.google.cloud.dataplex.v1.DataplexServiceClientTest",
"com.google.cloud.dataplex.v1.MetadataServiceClientHttpJsonTest",

@@ -0,0 +1,207 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.dataplex.v1;
import "google/cloud/dataplex/v1/processing.proto";
option go_package = "google.golang.org/genproto/googleapis/cloud/dataplex/v1;dataplex";
option java_multiple_files = true;
option java_outer_classname = "DataProfileProto";
option java_package = "com.google.cloud.dataplex.v1";
// DataProfileScan related setting.
// Currently carries no configuration; it exists as a message (rather than a
// bool) so that profile-specific options can be added later in a
// backward-compatible way.
message DataProfileSpec {}
// DataProfileResult defines the output of DataProfileScan.
// Each field of the table will have a field-type-specific profile result.
message DataProfileResult {
  // Profile information describing the structure and layout of the data
  // and containing the per-field profile info.
  message Profile {
    // Represents a column field within a table schema.
    message Field {
      // ProfileInfo defines the profile information for each schema field
      // type.
      message ProfileInfo {
        // StringFieldInfo defines output info for any string type field.
        message StringFieldInfo {
          // The minimum length of the string field in the sampled data.
          // Optional if zero non-null rows.
          int64 min_length = 1;

          // The maximum length of a string field in the sampled data.
          // Optional if zero non-null rows.
          int64 max_length = 2;

          // The average length of a string field in the sampled data.
          // Optional if zero non-null rows.
          double average_length = 3;
        }

        // IntegerFieldInfo defines output for any integer type field.
        message IntegerFieldInfo {
          // The average of non-null values of the integer field in the
          // sampled data. Returns NaN if the field has a NaN. Optional if
          // zero non-null rows.
          double average = 1;

          // The standard deviation of non-null values of the integer field
          // in the sampled data. Returns NaN if the field has a NaN.
          // Optional if zero non-null rows.
          double standard_deviation = 3;

          // The minimum value of an integer field in the sampled data.
          // Optional if zero non-null rows.
          // NOTE(review): the original comment said "Return NaN, if the
          // field has a NaN", but an int64 cannot represent NaN — confirm
          // what sentinel value is actually produced for NaN inputs.
          int64 min = 4;

          // Quartiles divide the data points into four parts, or quarters,
          // of more-or-less equal size. The three main quartiles are: the
          // first quartile (Q1), which splits off the lowest 25% of the data
          // from the highest 75% (also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point); the second
          // quartile (Q2), the median of the data set, so 50% of the data
          // lies below it; and the third quartile (Q3), which splits off the
          // highest 25% from the lowest 75% (the upper or 75th empirical
          // quartile). The quartiles are provided as an ordered list,
          // occurring in order Q1, median, Q3.
          repeated int64 quartiles = 6;

          // The maximum value of an integer field in the sampled data.
          // Optional if zero non-null rows.
          // NOTE(review): see `min` — NaN is not representable in int64.
          int64 max = 5;
        }

        // DoubleFieldInfo defines output for any double type field.
        message DoubleFieldInfo {
          // The average of non-null values of the double field in the
          // sampled data. Returns NaN if the field has a NaN. Optional if
          // zero non-null rows.
          double average = 1;

          // The standard deviation of non-null values of the double field in
          // the sampled data. Returns NaN if the field has a NaN. Optional
          // if zero non-null rows.
          double standard_deviation = 3;

          // The minimum value of a double field in the sampled data.
          // Returns NaN if the field has a NaN. Optional if zero non-null
          // rows.
          double min = 4;

          // Quartiles divide the data points into four parts, or quarters,
          // of more-or-less equal size. The three main quartiles are: the
          // first quartile (Q1), which splits off the lowest 25% of the data
          // from the highest 75% (also known as the lower or 25th empirical
          // quartile, as 25% of the data is below this point); the second
          // quartile (Q2), the median of the data set, so 50% of the data
          // lies below it; and the third quartile (Q3), which splits off the
          // highest 25% from the lowest 75% (the upper or 75th empirical
          // quartile). The quartiles are provided as an ordered list,
          // occurring in order Q1, median, Q3.
          repeated double quartiles = 6;

          // The maximum value of a double field in the sampled data.
          // Returns NaN if the field has a NaN. Optional if zero non-null
          // rows.
          double max = 5;
        }

        // TopNValue defines the structure of the output of the top N values
        // of a field.
        message TopNValue {
          // The string representation of the actual value from the field.
          string value = 1;

          // The frequency count of the corresponding value in the field.
          int64 count = 2;
        }

        // The ratio of null rows against the rows in the sampled data.
        double null_ratio = 2;

        // The ratio of rows that are distinct against the rows in the
        // sampled data.
        double distinct_ratio = 3;

        // The array of top N values of the field in the sampled data.
        // Currently N is set to 10, or to the number of distinct values in
        // the field, whichever is smaller. This will be optional for complex
        // non-groupable data-types such as JSON, ARRAY, STRUCT.
        repeated TopNValue top_n_values = 4;

        // The corresponding profile for the specific field type.
        // Each field will have only one field-type-specific profile output.
        oneof field_info {
          // The corresponding string field profile.
          StringFieldInfo string_profile = 101;

          // The corresponding integer field profile.
          IntegerFieldInfo integer_profile = 102;

          // The corresponding double field profile.
          DoubleFieldInfo double_profile = 103;
        }
      }

      // The name of the field.
      string name = 1;

      // The field data type. Possible values include:
      //
      // * STRING
      // * BYTE
      // * INT64
      // * INT32
      // * INT16
      // * DOUBLE
      // * FLOAT
      // * DECIMAL
      // * BOOLEAN
      // * BINARY
      // * TIMESTAMP
      // * DATE
      // * TIME
      // * NULL
      // * RECORD
      string type = 2;

      // The mode of the field. Its value will be:
      // REQUIRED, if it is a required field.
      // NULLABLE, if it is an optional field.
      // REPEATED, if it is a repeated field.
      string mode = 3;

      // The profile information for the corresponding field.
      ProfileInfo profile = 4;
    }

    // The sequence of fields describing data in table entities.
    repeated Field fields = 2;
  }

  // The count of all rows in the sampled data.
  // 0 if there are zero rows.
  int64 row_count = 3;

  // The profile information per field.
  Profile profile = 4;

  // The data scanned for this profile.
  ScannedData scanned_data = 5;
}

@@ -0,0 +1,236 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.dataplex.v1;
import "google/api/field_behavior.proto";
import "google/cloud/dataplex/v1/processing.proto";
option go_package = "google.golang.org/genproto/googleapis/cloud/dataplex/v1;dataplex";
option java_multiple_files = true;
option java_outer_classname = "DataQualityProto";
option java_package = "com.google.cloud.dataplex.v1";
// DataQualityScan related setting.
message DataQualitySpec {
  // The list of rules to evaluate against a data source. At least one rule
  // is required.
  repeated DataQualityRule rules = 1;
}
// The output of a DataQualityScan.
message DataQualityResult {
  // Overall data quality result -- `true` if all rules passed.
  bool passed = 5;

  // A list of results at the dimension level.
  repeated DataQualityDimensionResult dimensions = 2;

  // A list of all the rules in a job, and their results.
  repeated DataQualityRuleResult rules = 3;

  // The count of rows processed.
  int64 row_count = 4;

  // The data scanned for this result.
  ScannedData scanned_data = 7;
}
// DataQualityRuleResult provides a more detailed, per-rule level view of the
// results.
message DataQualityRuleResult {
  // The rule specified in the DataQualitySpec, as is.
  DataQualityRule rule = 1;

  // Whether the rule passed or failed.
  bool passed = 7;

  // The number of rows a rule was evaluated against.
  // This field is only valid for ColumnMap type rules.
  // The evaluated count can be configured to either
  // (1) include all rows (default) - with null rows automatically failing
  // rule evaluation, OR (2) exclude null rows from the evaluated_count, by
  // setting ignore_nulls = true.
  int64 evaluated_count = 9;

  // The number of rows which passed a rule evaluation.
  // This field is only valid for ColumnMap type rules.
  int64 passed_count = 8;

  // The number of rows with null values in the specified column.
  int64 null_count = 5;

  // The ratio of passed_count / evaluated_count.
  // This field is only valid for ColumnMap type rules.
  double pass_ratio = 6;

  // The query to find rows that did not pass this rule.
  // Only applies to ColumnMap and RowCondition rules.
  string failing_rows_query = 10;
}
// DataQualityDimensionResult provides a more detailed, per-dimension level
// view of the results.
// NOTE(review): this message carries only a pass/fail flag and does not name
// the dimension it refers to; presumably callers correlate entries with the
// dimensions of the rules by position in DataQualityResult.dimensions —
// confirm against the service implementation.
message DataQualityDimensionResult {
  // Whether the dimension passed or failed.
  bool passed = 3;
}
// A rule captures data quality intent about a data source.
message DataQualityRule {
  // Evaluates whether each column value lies within a specified range.
  message RangeExpectation {
    // Optional. The minimum column value allowed for a row to pass this
    // validation. At least one of min_value and max_value need to be
    // provided.
    string min_value = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The maximum column value allowed for a row to pass this
    // validation. At least one of min_value and max_value need to be
    // provided.
    string max_value = 2 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly greater than ('>')
    // the minimum, or if equality is allowed. Only relevant if a min_value
    // has been defined. Default = false.
    bool strict_min_enabled = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether each value needs to be strictly lesser than ('<')
    // the maximum, or if equality is allowed. Only relevant if a max_value
    // has been defined. Default = false.
    bool strict_max_enabled = 4 [(google.api.field_behavior) = OPTIONAL];
  }

  // Evaluates whether each column value is not null, as the message name
  // indicates (the original comment said "is null", which contradicts
  // `NonNullExpectation`).
  message NonNullExpectation {}

  // Evaluates whether each column value is contained by a specified set.
  message SetExpectation {
    // The set of allowed values; a row passes when the column value is one
    // of these.
    repeated string values = 1;
  }

  // Evaluates whether each column value matches a specified regex.
  message RegexExpectation {
    // The regular expression each column value is checked against.
    string regex = 1;
  }

  // Evaluates whether the column values are unique, i.e. the column has no
  // duplicates.
  message UniquenessExpectation {}

  // Evaluates whether the column aggregate statistic lies within a specified
  // range.
  message StatisticRangeExpectation {
    // The aggregate statistic of the column to evaluate.
    enum ColumnStatistic {
      // Unspecified statistic type.
      // NOTE(review): the usual convention is a *_UNSPECIFIED zero value
      // (e.g. COLUMN_STATISTIC_UNSPECIFIED); renaming now would break
      // generated code for a published API, so it is only flagged here.
      STATISTIC_UNDEFINED = 0;

      // Evaluate the column mean.
      MEAN = 1;

      // Evaluate the column min.
      MIN = 2;

      // Evaluate the column max.
      MAX = 3;
    }

    // The statistic of the column to compare against the range.
    ColumnStatistic statistic = 1;

    // The minimum column statistic value allowed for a row to pass this
    // validation.
    // At least one of min_value and max_value need to be provided.
    string min_value = 2;

    // The maximum column statistic value allowed for a row to pass this
    // validation.
    // At least one of min_value and max_value need to be provided.
    string max_value = 3;

    // Whether the column statistic needs to be strictly greater than ('>')
    // the minimum, or if equality is allowed. Only relevant if a min_value
    // has been defined. Default = false.
    bool strict_min_enabled = 4;

    // Whether the column statistic needs to be strictly lesser than ('<')
    // the maximum, or if equality is allowed. Only relevant if a max_value
    // has been defined. Default = false.
    bool strict_max_enabled = 5;
  }

  // Evaluates whether each row passes the specified condition.
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a boolean per row as the result.
  // Example: col1 >= 0 AND col2 < 10
  message RowConditionExpectation {
    // The per-row boolean SQL expression.
    string sql_expression = 1;
  }

  // Evaluates whether the provided expression is true.
  // The SQL expression needs to use BigQuery standard SQL syntax and should
  // produce a scalar boolean result.
  // Example: MIN(col1) >= 0
  message TableConditionExpectation {
    // The scalar boolean SQL expression.
    string sql_expression = 1;
  }

  // The type-specific expectation this rule evaluates. Exactly one must be
  // set.
  oneof rule_type {
    // ColumnMap rule which evaluates whether each column value lies within a
    // specified range.
    RangeExpectation range_expectation = 1;

    // ColumnMap rule which evaluates whether each column value is non-null.
    NonNullExpectation non_null_expectation = 2;

    // ColumnMap rule which evaluates whether each column value is contained
    // by a specified set.
    SetExpectation set_expectation = 3;

    // ColumnMap rule which evaluates whether each column value matches a
    // specified regex.
    RegexExpectation regex_expectation = 4;

    // ColumnAggregate rule which evaluates whether the column values are
    // unique (free of duplicates).
    UniquenessExpectation uniqueness_expectation = 100;

    // ColumnAggregate rule which evaluates whether the column aggregate
    // statistic lies within a specified range.
    StatisticRangeExpectation statistic_range_expectation = 101;

    // Table rule which evaluates whether each row passes the specified
    // condition.
    RowConditionExpectation row_condition_expectation = 200;

    // Table rule which evaluates whether the provided expression is true.
    TableConditionExpectation table_condition_expectation = 201;
  }

  // Optional. The unnested column which this rule is evaluated against.
  string column = 500 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Rows with null values will automatically fail a rule, unless
  // ignore_null is true. In that case, such null rows are trivially
  // considered passing. Only applicable to ColumnMap rules.
  bool ignore_null = 501 [(google.api.field_behavior) = OPTIONAL];

  // Required. The dimension a rule belongs to. Results are also aggregated
  // at the dimension level. Supported dimensions are ["COMPLETENESS",
  // "ACCURACY", "CONSISTENCY", "VALIDITY", "UNIQUENESS", "INTEGRITY"].
  string dimension = 502 [(google.api.field_behavior) = REQUIRED];

  // Optional. The minimum ratio of passing_rows / total_rows required to
  // pass this rule, with a range of [0.0, 1.0].
  //
  // 0 indicates the default value (i.e. 1.0).
  double threshold = 503 [(google.api.field_behavior) = OPTIONAL];
}

@@ -5,6 +5,7 @@ title: Cloud Dataplex API
apis:
- name: google.cloud.dataplex.v1.ContentService
- name: google.cloud.dataplex.v1.DataScanService
- name: google.cloud.dataplex.v1.DataplexService
- name: google.cloud.dataplex.v1.MetadataService
- name: google.cloud.location.Locations
@@ -12,6 +13,7 @@ apis:
- name: google.longrunning.Operations
types:
- name: google.cloud.dataplex.v1.DataScanEvent
- name: google.cloud.dataplex.v1.DiscoveryEvent
- name: google.cloud.dataplex.v1.JobEvent
- name: google.cloud.dataplex.v1.OperationMetadata
@@ -53,6 +55,8 @@ backend:
rules:
- selector: 'google.cloud.dataplex.v1.ContentService.*'
deadline: 60.0
- selector: 'google.cloud.dataplex.v1.DataScanService.*'
deadline: 60.0
- selector: 'google.cloud.dataplex.v1.DataplexService.*'
deadline: 60.0
- selector: 'google.cloud.dataplex.v1.MetadataService.*'
@@ -79,6 +83,10 @@ http:
- get: '/v1/{resource=projects/*/locations/*/lakes/*/zones/*/assets/*}:getIamPolicy'
- get: '/v1/{resource=projects/*/locations/*/lakes/*/tasks/*}:getIamPolicy'
- get: '/v1/{resource=projects/*/locations/*/lakes/*/environments/*}:getIamPolicy'
- get: '/v1/{resource=projects/*/locations/*/dataScans/*}:getIamPolicy'
- get: '/v1/{resource=projects/*/locations/*/dataTaxonomies/*}:getIamPolicy'
- get: '/v1/{resource=projects/*/locations/*/dataTaxonomies/*/attributes/*}:getIamPolicy'
- get: '/v1/{resource=projects/*/locations/*/dataAttributeBindings/*}:getIamPolicy'
- selector: google.iam.v1.IAMPolicy.SetIamPolicy
post: '/v1/{resource=projects/*/locations/*/lakes/*}:setIamPolicy'
body: '*'
@@ -91,6 +99,14 @@ http:
body: '*'
- post: '/v1/{resource=projects/*/locations/*/lakes/*/environments/*}:setIamPolicy'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataScans/*}:setIamPolicy'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataTaxonomies/*}:setIamPolicy'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataTaxonomies/*/attributes/*}:setIamPolicy'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataAttributeBindings/*}:setIamPolicy'
body: '*'
- selector: google.iam.v1.IAMPolicy.TestIamPermissions
post: '/v1/{resource=projects/*/locations/*/lakes/*}:testIamPermissions'
body: '*'
@@ -103,6 +119,14 @@ http:
body: '*'
- post: '/v1/{resource=projects/*/locations/*/lakes/*/environments/*}:testIamPermissions'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataScans/*}:testIamPermissions'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataTaxonomies/*}:testIamPermissions'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataTaxonomies/*/attributes/*}:testIamPermissions'
body: '*'
- post: '/v1/{resource=projects/*/locations/*/dataAttributeBindings/*}:testIamPermissions'
body: '*'
- selector: google.longrunning.Operations.CancelOperation
post: '/v1/{name=projects/*/locations/*/operations/*}:cancel'
body: '*'
@@ -119,6 +143,10 @@ authentication:
oauth:
canonical_scopes: |-
https://www.googleapis.com/auth/cloud-platform
- selector: 'google.cloud.dataplex.v1.DataScanService.*'
oauth:
canonical_scopes: |-
https://www.googleapis.com/auth/cloud-platform
- selector: 'google.cloud.dataplex.v1.DataplexService.*'
oauth:
canonical_scopes: |-

@@ -0,0 +1,535 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.dataplex.v1;
import "google/api/annotations.proto";
import "google/api/client.proto";
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
import "google/cloud/dataplex/v1/data_profile.proto";
import "google/cloud/dataplex/v1/data_quality.proto";
import "google/cloud/dataplex/v1/processing.proto";
import "google/cloud/dataplex/v1/resources.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/field_mask.proto";
import "google/protobuf/timestamp.proto";
option go_package = "google.golang.org/genproto/googleapis/cloud/dataplex/v1;dataplex";
option java_multiple_files = true;
option java_outer_classname = "DataScansProto";
option java_package = "com.google.cloud.dataplex.v1";
// DataScanService manages DataScan resources, which run scanning workloads
// (data profile and data quality — see the DataScan spec oneof) against a
// data source and expose the resulting DataScanJobs.
service DataScanService {
  option (google.api.default_host) = "dataplex.googleapis.com";
  option (google.api.oauth_scopes) =
      "https://www.googleapis.com/auth/cloud-platform";

  // Creates a dataScan resource.
  // Long-running operation: the returned Operation resolves to a DataScan,
  // with OperationMetadata as metadata.
  rpc CreateDataScan(CreateDataScanRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/{parent=projects/*/locations/*}/dataScans"
      body: "data_scan"
    };
    option (google.api.method_signature) = "parent,data_scan,data_scan_id";
    option (google.longrunning.operation_info) = {
      response_type: "DataScan"
      metadata_type: "OperationMetadata"
    };
  }

  // Updates the dataScan resource.
  // Long-running operation: the returned Operation resolves to the updated
  // DataScan. Only fields listed in the request's update_mask are changed.
  rpc UpdateDataScan(UpdateDataScanRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      patch: "/v1/{data_scan.name=projects/*/locations/*/dataScans/*}"
      body: "data_scan"
    };
    option (google.api.method_signature) = "data_scan,update_mask";
    option (google.longrunning.operation_info) = {
      response_type: "DataScan"
      metadata_type: "OperationMetadata"
    };
  }

  // Deletes the dataScan resource.
  // Long-running operation: the returned Operation resolves to
  // google.protobuf.Empty on success.
  rpc DeleteDataScan(DeleteDataScanRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      delete: "/v1/{name=projects/*/locations/*/dataScans/*}"
    };
    option (google.api.method_signature) = "name";
    option (google.longrunning.operation_info) = {
      response_type: "google.protobuf.Empty"
      metadata_type: "OperationMetadata"
    };
  }

  // Gets a dataScan resource.
  rpc GetDataScan(GetDataScanRequest) returns (DataScan) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/dataScans/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists dataScans.
  rpc ListDataScans(ListDataScansRequest) returns (ListDataScansResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*}/dataScans"
    };
    option (google.api.method_signature) = "parent";
  }

  // Runs an on-demand execution of a DataScan.
  rpc RunDataScan(RunDataScanRequest) returns (RunDataScanResponse) {
    option (google.api.http) = {
      post: "/v1/{name=projects/*/locations/*/dataScans/*}:run"
      body: "*"
    };
    option (google.api.method_signature) = "name";
  }

  // Gets a DataScanJob resource.
  rpc GetDataScanJob(GetDataScanJobRequest) returns (DataScanJob) {
    option (google.api.http) = {
      get: "/v1/{name=projects/*/locations/*/dataScans/*/jobs/*}"
    };
    option (google.api.method_signature) = "name";
  }

  // Lists DataScanJobs under the given dataScan.
  rpc ListDataScanJobs(ListDataScanJobsRequest)
      returns (ListDataScanJobsResponse) {
    option (google.api.http) = {
      get: "/v1/{parent=projects/*/locations/*/dataScans/*}/jobs"
    };
    option (google.api.method_signature) = "parent";
  }
}
// Create dataScan request.
message CreateDataScanRequest {
  // Required. The resource name of the parent location:
  // projects/{project}/locations/{location_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];

  // Required. The DataScan resource to create.
  DataScan data_scan = 2 [(google.api.field_behavior) = REQUIRED];

  // Required. DataScan identifier.
  // * Must contain only lowercase letters, numbers and hyphens.
  // * Must start with a letter.
  // * Must end with a number or a letter.
  // * Must be between 1-63 characters.
  // * Must be unique within the customer project / location.
  string data_scan_id = 3 [(google.api.field_behavior) = REQUIRED];
}
// Update dataScan request.
message UpdateDataScanRequest {
  // Required. The update description.
  // Only fields specified in `update_mask` are updated.
  DataScan data_scan = 1 [(google.api.field_behavior) = REQUIRED];

  // Required. Mask of fields to update (paths are snake_case field names of
  // DataScan).
  google.protobuf.FieldMask update_mask = 2
      [(google.api.field_behavior) = REQUIRED];
}
// Delete dataScan request.
message DeleteDataScanRequest {
  // Required. The resource name of the dataScan:
  // projects/{project}/locations/{location_id}/dataScans/{data_scan_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "dataplex.googleapis.com/DataScan"
    }
  ];
}
// Get dataScan request.
message GetDataScanRequest {
  // DataScan views for getting a partial dataScan.
  enum DataScanView {
    // The API will default to the `BASIC` view.
    DATA_SCAN_VIEW_UNSPECIFIED = 0;

    // Basic view that does not include spec and result.
    BASIC = 1;

    // Include everything.
    // (Numbered 10 rather than 2, leaving room for intermediate views.)
    FULL = 10;
  }

  // Required. The resource name of the dataScan:
  // projects/{project}/locations/{location_id}/dataScans/{data_scan_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "dataplex.googleapis.com/DataScan"
    }
  ];

  // Optional. Used to select the subset of DataScan information to return.
  // Defaults to `BASIC`.
  DataScanView view = 2 [(google.api.field_behavior) = OPTIONAL];
}
// List dataScans request.
message ListDataScansRequest {
  // Required. The resource name of the parent location:
  // projects/{project}/locations/{location_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    }
  ];

  // Optional. Maximum number of dataScans to return. The service may return
  // fewer than this value. If unspecified, at most 10 scans will be
  // returned. The maximum value is 1000; values above 1000 will be coerced
  // to 1000.
  int32 page_size = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Page token received from a previous `ListDataScans` call.
  // Provide this to retrieve the subsequent page. When paginating, all other
  // parameters provided to `ListDataScans` must match the call that provided
  // the page token.
  string page_token = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Filter request.
  string filter = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Order by fields (name or create_time) for the result.
  // If not specified, the ordering is undefined.
  string order_by = 5 [(google.api.field_behavior) = OPTIONAL];
}
// List dataScans response.
message ListDataScansResponse {
  // DataScans (metadata only) under the given parent location.
  repeated DataScan data_scans = 1;

  // Token to retrieve the next page of results, or empty if there are no
  // more results in the list.
  string next_page_token = 2;

  // Locations that could not be reached.
  repeated string unreachable = 3;
}
// Run DataScan request.
message RunDataScanRequest {
  // Required. The resource name of the DataScan:
  // projects/{project}/locations/{location_id}/dataScans/{data_scan_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  //
  // Only on-demand DataScans are allowed.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "dataplex.googleapis.com/DataScan"
    }
  ];
}
// Run DataScan response.
message RunDataScanResponse {
  // The DataScanJob created by the RunDataScan API.
  DataScanJob job = 1;
}
// Get DataScanJob request.
message GetDataScanJobRequest {
  // DataScanJob views for getting a partial dataScanJob.
  enum DataScanJobView {
    // The API will default to the `BASIC` view.
    DATA_SCAN_JOB_VIEW_UNSPECIFIED = 0;

    // Basic view that does not include spec and result.
    BASIC = 1;

    // Include everything.
    // (Numbered 10 rather than 2, leaving room for intermediate views.)
    FULL = 10;
  }

  // Required. The resource name of the DataScanJob:
  // projects/{project}/locations/{location_id}/dataScans/{data_scan_id}/dataScanJobs/{data_scan_job_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string name = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "dataplex.googleapis.com/DataScanJob"
    }
  ];

  // Optional. Used to select the subset of DataScan information to return.
  // Defaults to `BASIC`.
  DataScanJobView view = 2 [(google.api.field_behavior) = OPTIONAL];
}
// List DataScanJobs request.
message ListDataScanJobsRequest {
  // Required. The resource name of the parent DataScan (the original comment
  // said "parent environment", but the reference type below is DataScan):
  // projects/{project}/locations/{location_id}/dataScans/{data_scan_id}
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string parent = 1 [
    (google.api.field_behavior) = REQUIRED,
    (google.api.resource_reference) = {
      type: "dataplex.googleapis.com/DataScan"
    }
  ];

  // Optional. Maximum number of DataScanJobs to return. The service may
  // return fewer than this value. If unspecified, at most 10 DataScanJobs
  // will be returned. The maximum value is 1000; values above 1000 will be
  // coerced to 1000.
  int32 page_size = 2 [(google.api.field_behavior) = OPTIONAL];

  // Optional. Page token received from a previous `ListDataScanJobs` call.
  // Provide this to retrieve the subsequent page. When paginating, all other
  // parameters provided to `ListDataScanJobs` must match the call that
  // provided the page token.
  string page_token = 3 [(google.api.field_behavior) = OPTIONAL];
}
// List DataScanJobs response.
message ListDataScanJobsResponse {
  // DataScanJobs (metadata only) under a given dataScan.
  repeated DataScanJob data_scan_jobs = 1;

  // Token to retrieve the next page of results, or empty if there are no
  // more results in the list.
  string next_page_token = 2;
}
// Represents a user-visible job which provides the insights for the related
// data source.
//
// For example:
//
// * Data Quality: generates queries based on the rules and runs them against
//   the data to get data quality check results.
// * Data Profile: analyzes the data in table(s) and generates insights about
//   the structure, content and relationships (such as null percent,
//   cardinality, min/max/mean, etc).
message DataScan {
  option (google.api.resource) = {
    type: "dataplex.googleapis.com/DataScan"
    pattern: "projects/{project}/locations/{location}/dataScans/{dataScan}"
  };

  // DataScan execution settings.
  message ExecutionSpec {
    // Optional. Spec related to how often and when a scan should be triggered.
    //
    // If not specified, the default is `OnDemand`, which means the scan will
    // not run until the user calls the `RunDataScan` API.
    Trigger trigger = 1 [(google.api.field_behavior) = OPTIONAL];

    // Spec related to incremental scans of the data.
    //
    // If not specified, a scan runs on all data in the table.
    // The incremental choice is immutable: once an incremental field is set,
    // it cannot be unset, and vice versa.
    oneof incremental {
      // Immutable. The unnested field (Date or Timestamp) that contains values
      // that monotonically increase over time.
      string field = 100 [(google.api.field_behavior) = IMMUTABLE];
    }
  }

  // Status of the data scan execution.
  message ExecutionStatus {
    // The time when the latest DataScanJob started.
    google.protobuf.Timestamp latest_job_start_time = 4;

    // The time when the latest DataScanJob ended.
    google.protobuf.Timestamp latest_job_end_time = 5;
  }

  // Output only. The relative resource name of the scan, of the form:
  // `projects/{project}/locations/{location_id}/dataScans/{datascan_id}`,
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. System generated globally unique ID for the scan. This ID
  // will be different if the scan is deleted and re-created with the same
  // name.
  string uid = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Optional. Description of the scan.
  // * Must be between 1-1024 characters.
  string description = 3 [(google.api.field_behavior) = OPTIONAL];

  // Optional. User friendly display name.
  // * Must be between 1-256 characters.
  string display_name = 4 [(google.api.field_behavior) = OPTIONAL];

  // Optional. User-defined labels for the scan.
  map<string, string> labels = 5 [(google.api.field_behavior) = OPTIONAL];

  // Output only. Current state of the DataScan.
  // NOTE(review): `State` is declared elsewhere in this package — confirm it
  // is the shared resource lifecycle state enum.
  State state = 6 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the scan was created.
  google.protobuf.Timestamp create_time = 7
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the scan was last updated.
  google.protobuf.Timestamp update_time = 8
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Required. The data source for DataScan.
  DataSource data = 9 [(google.api.field_behavior) = REQUIRED];

  // Optional. DataScan execution settings.
  //
  // If not specified, the fields under it will use their default values.
  ExecutionSpec execution_spec = 10 [(google.api.field_behavior) = OPTIONAL];

  // Output only. Status of the data scan execution.
  ExecutionStatus execution_status = 11
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The type of DataScan. Mirrors which member of `spec` is set.
  DataScanType type = 12 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Data Scan related setting.
  // The spec is required and immutable: once `data_quality_spec` is set, it
  // cannot be changed to `data_profile_spec`, and vice versa.
  oneof spec {
    // DataQualityScan related setting.
    DataQualitySpec data_quality_spec = 100;

    // DataProfileScan related setting.
    DataProfileSpec data_profile_spec = 101;
  }

  // The result of the data scan. Populated to match the configured `spec`.
  oneof result {
    // Output only. The result of the data quality scan.
    DataQualityResult data_quality_result = 200
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The result of the data profile scan.
    DataProfileResult data_profile_result = 201
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}
// A DataScanJob represents an instance of a data scan.
message DataScanJob {
  option (google.api.resource) = {
    type: "dataplex.googleapis.com/DataScanJob"
    pattern: "projects/{project}/locations/{location}/dataScans/{dataScan}/jobs/{job}"
  };

  // Execution state for the DataScanJob.
  // NOTE(review): values are not prefixed with the enum name, contrary to
  // the usual proto style; renaming them now would break generated code.
  enum State {
    // The DataScanJob state is unspecified.
    STATE_UNSPECIFIED = 0;

    // The DataScanJob is running.
    RUNNING = 1;

    // The DataScanJob is canceling.
    CANCELING = 2;

    // The DataScanJob cancellation was successful.
    CANCELLED = 3;

    // The DataScanJob completed successfully.
    SUCCEEDED = 4;

    // The DataScanJob is no longer running due to an error.
    FAILED = 5;

    // The DataScanJob has been created but not started to run yet.
    // NOTE(review): value 6 is skipped — presumably reserved for an
    // unpublished state; confirm before assigning it a new meaning.
    PENDING = 7;
  }

  // Output only. The relative resource name of the DataScanJob, of the form:
  // `projects/{project}/locations/{location_id}/dataScans/{datascan_id}/jobs/{job_id}`,
  // where `{project}` refers to a project_id or project_number and
  // `location_id` refers to a GCP region.
  string name = 1 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. System generated globally unique ID for the DataScanJob.
  string uid = 2 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the DataScanJob was started.
  google.protobuf.Timestamp start_time = 3
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The time when the DataScanJob ended.
  google.protobuf.Timestamp end_time = 4
      [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Execution state for the DataScanJob.
  State state = 5 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. Additional information about the current state.
  string message = 6 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Output only. The type of the parent DataScan.
  DataScanType type = 7 [(google.api.field_behavior) = OUTPUT_ONLY];

  // Data Scan related setting. A snapshot of the parent DataScan's spec at
  // the time this job was created.
  oneof spec {
    // Output only. DataQualityScan related setting.
    DataQualitySpec data_quality_spec = 100
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. DataProfileScan related setting.
    DataProfileSpec data_profile_spec = 101
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }

  // The result of the data scan. Populated to match the job's `spec`.
  oneof result {
    // Output only. The result of the data quality scan.
    DataQualityResult data_quality_result = 200
        [(google.api.field_behavior) = OUTPUT_ONLY];

    // Output only. The result of the data profile scan.
    DataProfileResult data_profile_result = 201
        [(google.api.field_behavior) = OUTPUT_ONLY];
  }
}
// The type of DataScan.
// NOTE(review): values are not prefixed with `DATA_SCAN_TYPE_` (except the
// zero value), and their numbering differs from DataScanEvent.ScanType in
// logs.proto (there, DATA_PROFILE = 1 and DATA_QUALITY = 2). These enums are
// independent — never assume their wire values are interchangeable.
enum DataScanType {
  // The DataScan Type is unspecified.
  DATA_SCAN_TYPE_UNSPECIFIED = 0;

  // Data Quality Scan.
  DATA_QUALITY = 1;

  // Data Profile Scan.
  DATA_PROFILE = 2;
}

@ -291,10 +291,128 @@ message SessionEvent {
// The status of the event.
bool event_succeeded = 6;
// If the session is associated with an environment with fast startup enabled,
// and was created before being assigned to a user.
bool fast_startup_enabled = 7;
// The idle duration of a warm pooled session before it is assigned to user.
google.protobuf.Duration unassigned_duration = 8;
}
// These messages contain information about the execution of a datascan.
// The monitored resource is 'DataScan'.
message DataScanEvent {
  // The type of the data scan.
  // NOTE(review): numbering differs from the top-level DataScanType enum in
  // datascans.proto (there, DATA_QUALITY = 1); the two enums are independent.
  enum ScanType {
    // An unspecified data scan type.
    SCAN_TYPE_UNSPECIFIED = 0;

    // Data scan for data profile.
    DATA_PROFILE = 1;

    // Data scan for data quality.
    DATA_QUALITY = 2;
  }

  // The job state of the data scan.
  enum State {
    // Unspecified job state.
    STATE_UNSPECIFIED = 0;

    // Data scan started.
    STARTED = 1;

    // Data scan successfully completed.
    SUCCEEDED = 2;

    // Data scan was unsuccessful.
    FAILED = 3;

    // Data scan was cancelled.
    CANCELLED = 4;
  }

  // The trigger type for the data scan.
  enum Trigger {
    // An unspecified trigger type.
    TRIGGER_UNSPECIFIED = 0;

    // Data scan triggers on demand.
    ON_DEMAND = 1;

    // Data scan triggers as per schedule.
    SCHEDULE = 2;
  }

  // The scope of job for the data scan.
  enum Scope {
    // An unspecified scope type.
    SCOPE_UNSPECIFIED = 0;

    // Data scan runs on all of the data.
    FULL = 1;

    // Data scan runs on incremental data.
    INCREMENTAL = 2;
  }

  // Data profile result for data scan job.
  message DataProfileResult {
    // The count of rows processed in the data scan job.
    int64 row_count = 1;
  }

  // Data quality result for data scan job.
  message DataQualityResult {
    // The count of rows processed in the data scan job.
    int64 row_count = 1;

    // Whether the data quality result was `pass` or not.
    bool passed = 2;

    // The result of each dimension for data quality result.
    // The key of the map is the name of the dimension.
    // The value is the bool value depicting whether the dimension result was
    // `pass` or not.
    map<string, bool> dimension_passed = 3;
  }

  // The data source of the data scan.
  string data_source = 1;

  // The identifier of the specific data scan job this log entry is for.
  string job_id = 2;

  // The time when the data scan job started to run.
  google.protobuf.Timestamp start_time = 3;

  // The time when the data scan job finished.
  google.protobuf.Timestamp end_time = 4;

  // The type of the data scan.
  ScanType type = 5;

  // The status of the data scan job.
  State state = 6;

  // The message describing the data scan job event.
  string message = 7;

  // A version identifier of the spec which was used to execute this job.
  string spec_version = 8;

  // The trigger type of the data scan job.
  Trigger trigger = 9;

  // The scope of the data scan (e.g. full, incremental).
  Scope scope = 10;

  // The result of the data scan job; which member is set corresponds to
  // `type`.
  oneof result {
    // Data profile result for data profile type data scan.
    DataProfileResult data_profile = 101;

    // Data quality result for data quality type data scan.
    DataQualityResult data_quality = 102;
  }
}

@ -682,6 +682,13 @@ message StorageFormat {
string encoding = 1 [(google.api.field_behavior) = OPTIONAL];
}
// Describes Iceberg data format.
message IcebergOptions {
  // Optional. The location of where the iceberg metadata is present; it must
  // be within the table path.
  string metadata_location = 1 [(google.api.field_behavior) = OPTIONAL];
}
// The specific file format of the data.
enum Format {
// Format unspecified.
@ -752,6 +759,9 @@ message StorageFormat {
// - application/x-avro
// - application/x-orc
// - application/x-tfrecord
// - application/x-parquet+iceberg
// - application/x-avro+iceberg
// - application/x-orc+iceberg
// - application/json
// - application/{subtypes}
// - text/csv
@ -768,6 +778,9 @@ message StorageFormat {
// Optional. Additional information about JSON formatted data.
JsonOptions json = 11 [(google.api.field_behavior) = OPTIONAL];
// Optional. Additional information about iceberg tables.
IcebergOptions iceberg = 12 [(google.api.field_behavior) = OPTIONAL];
}
}

@ -0,0 +1,94 @@
// Copyright 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.dataplex.v1;
import "google/api/field_behavior.proto";
import "google/api/resource.proto";
option go_package = "google.golang.org/genproto/googleapis/cloud/dataplex/v1;dataplex";
option java_multiple_files = true;
option java_outer_classname = "ProcessingProto";
option java_package = "com.google.cloud.dataplex.v1";
// DataScan scheduling and trigger settings.
message Trigger {
  // The scan runs one-time via the RunDataScan API.
  message OnDemand {}

  // The scan is scheduled to run periodically.
  message Schedule {
    // Required. Cron schedule (https://en.wikipedia.org/wiki/Cron) for running
    // scans periodically.
    // To explicitly set a timezone to the cron tab, apply a prefix in the
    // cron tab: "CRON_TZ=${IANA_TIME_ZONE}" or "TZ=${IANA_TIME_ZONE}".
    // The ${IANA_TIME_ZONE} may only be a valid string from IANA time zone
    // database. For example, "CRON_TZ=America/New_York 1 * * * *", or
    // "TZ=America/New_York 1 * * * *".
    // This field is required for Schedule scans.
    string cron = 1 [(google.api.field_behavior) = REQUIRED];
  }

  // DataScan scheduling and trigger settings.
  // If not specified, the default is OnDemand, which means the scan will not
  // run until the user calls the RunDataScan API.
  oneof mode {
    // The scan runs one-time shortly after DataScan creation.
    // NOTE(review): this contradicts the OnDemand message comment above
    // ("runs via RunDataScan API") — confirm which behavior is accurate.
    OnDemand on_demand = 100;

    // The scan is scheduled to run periodically.
    Schedule schedule = 101;
  }
}
// The data source for DataScan.
message DataSource {
  // The source is required and immutable: once `entity` is set, it cannot be
  // changed to a different source, and vice versa.
  oneof source {
    // Immutable. The dataplex entity that contains the data for DataScan, of
    // the form:
    // `projects/{project_number}/locations/{location_id}/lakes/{lake_id}/zones/{zone_id}/entities/{entity_id}`.
    string entity = 100 [
      (google.api.field_behavior) = IMMUTABLE,
      (google.api.resource_reference) = {
        type: "dataplex.googleapis.com/Entity"
      }
    ];
  }
}
// The data scanned during processing (e.g. in incremental DataScan).
message ScannedData {
  // A data range denoted by a pair of start/end values of a field.
  message IncrementalField {
    // The field that contains values which monotonically increase over time
    // (e.g. a timestamp).
    string field = 1;

    // Value that marks the start of the range.
    string start = 2;

    // Value that marks the end of the range.
    string end = 3;
  }

  // The range of scanned data.
  oneof data_range {
    // The range denoted by values of an incremental field.
    IncrementalField incremental_field = 1;
  }
}
Loading…
Cancel
Save