@@ -17,23 +17,34 @@ syntax = "proto3";
 package google.cloud.dataproc.v1;
 import "google/api/field_behavior.proto";
 import "google/api/resource.proto";
 import "google/protobuf/duration.proto";
+import "google/protobuf/timestamp.proto";
 option go_package = "cloud.google.com/go/dataproc/apiv1/dataprocpb;dataprocpb";
 option java_multiple_files = true;
 option java_outer_classname = "SharedProto";
 option java_package = "com.google.cloud.dataproc.v1";
+option (google.api.resource_definition) = {
+  type: "container.googleapis.com/Cluster"
+  pattern: "projects/{project}/locations/{location}/clusters/{cluster}"
+};
+option (google.api.resource_definition) = {
+  type: "metastore.googleapis.com/Service"
+  pattern: "projects/{project}/locations/{location}/services/{service}"
+};
 // Runtime configuration for a workload.
 message RuntimeConfig {
   // Optional. Version of the batch runtime.
   string version = 1 [(google.api.field_behavior) = OPTIONAL];
-  // Optional. Optional custom container image for the job runtime environment. If
-  // not specified, a default container image will be used.
+  // Optional. Custom container image for the job runtime environment.
+  // If not specified, a default container image will be used.
   string container_image = 2 [(google.api.field_behavior) = OPTIONAL];
-  // Optional. A mapping of property names to values, which are used to configure workload
-  // execution.
+  // Optional. A mapping of property names to values, which are used to
+  // configure workload execution.
   map<string, string> properties = 3 [(google.api.field_behavior) = OPTIONAL];
 }
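These fields map one-to-one onto the generated client libraries. A minimal Python sketch, assuming the published google-cloud-dataproc package (generated from these protos, exposed as google.cloud.dataproc_v1 types); the version string and property are illustrative values, not defaults:

    from google.cloud import dataproc_v1

    # Runtime configuration for a serverless batch workload.
    runtime = dataproc_v1.RuntimeConfig(
        version="2.1",  # assumed runtime version; omit to take the service default
        # container_image omitted: a default container image will be used.
        properties={"spark.executor.cores": "4"},  # illustrative Spark property
    )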
@@ -43,7 +54,8 @@ message EnvironmentConfig {
   ExecutionConfig execution_config = 1 [(google.api.field_behavior) = OPTIONAL];
   // Optional. Peripherals configuration that the workload has access to.
-  PeripheralsConfig peripherals_config = 2 [(google.api.field_behavior) = OPTIONAL];
+  PeripheralsConfig peripherals_config = 2
+      [(google.api.field_behavior) = OPTIONAL];
 }
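EnvironmentConfig is just the envelope for the two optional sub-configs. A hedged sketch under the same google-cloud-dataproc assumption; the service account value is hypothetical, and service_account is an ExecutionConfig field that falls outside this hunk:

    from google.cloud import dataproc_v1

    env = dataproc_v1.EnvironmentConfig(
        execution_config=dataproc_v1.ExecutionConfig(
            # Hypothetical service account; field not shown in this hunk.
            service_account="workload-sa@my-project.iam.gserviceaccount.com",
        ),
        peripherals_config=dataproc_v1.PeripheralsConfig(),
    )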
 // Execution configuration for a workload.
@@ -65,19 +77,39 @@ message ExecutionConfig {
   // Optional. The Cloud KMS key to use for encryption.
   string kms_key = 7 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. The duration after which the workload will be terminated.
+  // When the workload passes this ttl, it will be unconditionally killed
+  // without waiting for ongoing work to finish.
+  // Minimum value is 10 minutes; maximum value is 14 days (see JSON
+  // representation of
+  // [Duration](https://developers.google.com/protocol-buffers/docs/proto3#json)).
+  // If both ttl and idle_ttl are specified, the conditions are treated as
+  // an OR: the workload will be terminated when it has been idle for idle_ttl
+  // or when the ttl has passed, whichever comes first.
+  // If ttl is not specified for a session, it defaults to 24h.
+  google.protobuf.Duration ttl = 9 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. A Cloud Storage bucket used to stage workload dependencies,
+  // config files, and store workload output and other ephemeral data, such as
+  // Spark history files. If you do not specify a staging bucket, Cloud Dataproc
+  // will determine a Cloud Storage location according to the region where your
+  // workload is running, and then create and manage project-level, per-location
+  // staging and temporary buckets.
+  // **This field requires a Cloud Storage bucket name, not a `gs://...` URI to
+  // a Cloud Storage bucket.**
+  string staging_bucket = 10 [(google.api.field_behavior) = OPTIONAL];
 }
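Two details in the new fields are easy to miss: ttl and idle_ttl combine as an OR (first one to trip terminates the workload), and staging_bucket takes a bare bucket name rather than a gs:// URI. A sketch, assuming a client version generated from this revision and proto-plus's usual Duration handling (it converts datetime.timedelta for you); the bucket name is hypothetical:

    import datetime
    from google.cloud import dataproc_v1

    exec_cfg = dataproc_v1.ExecutionConfig(
        ttl=datetime.timedelta(hours=8),     # allowed range: 10 minutes to 14 days
        staging_bucket="my-staging-bucket",  # bucket name, NOT "gs://my-staging-bucket"
    )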
 // Spark History Server configuration for the workload.
 message SparkHistoryServerConfig {
-  // Optional. Resource name of an existing Dataproc Cluster to act as a Spark History
-  // Server for the workload.
+  // Optional. Resource name of an existing Dataproc Cluster to act as a Spark
+  // History Server for the workload.
   //
   // Example:
   //
   // * `projects/[project_id]/regions/[region]/clusters/[cluster_name]`
-  string dataproc_cluster = 1 [
-    (google.api.field_behavior) = OPTIONAL
-  ];
+  string dataproc_cluster = 1 [(google.api.field_behavior) = OPTIONAL];
 }
 // Auxiliary services configuration for a workload.
@@ -88,58 +120,111 @@ message PeripheralsConfig {
   //
   // * `projects/[project_id]/locations/[region]/services/[service_id]`
   string metastore_service = 1 [
-    (google.api.field_behavior) = OPTIONAL
+    (google.api.field_behavior) = OPTIONAL,
+    (google.api.resource_reference) = {
+      type: "metastore.googleapis.com/Service"
+    }
   ];
   // Optional. The Spark History Server configuration for the workload.
-  SparkHistoryServerConfig spark_history_server_config = 2 [(google.api.field_behavior) = OPTIONAL];
+  SparkHistoryServerConfig spark_history_server_config = 2
+      [(google.api.field_behavior) = OPTIONAL];
 }
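With the new resource_reference annotation, metastore_service is expected to match the metastore.googleapis.com/Service pattern declared at the top of the file. A sketch with hypothetical project, region, and service names:

    from google.cloud import dataproc_v1

    peripherals = dataproc_v1.PeripheralsConfig(
        # Must match projects/{project}/locations/{location}/services/{service}.
        metastore_service=(
            "projects/my-project/locations/us-central1/services/my-metastore"
        ),
        spark_history_server_config=dataproc_v1.SparkHistoryServerConfig(
            dataproc_cluster=(
                "projects/my-project/regions/us-central1/clusters/my-phs-cluster"
            ),
        ),
    )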
 // Runtime information about workload execution.
 message RuntimeInfo {
-  // Output only. Map of remote access endpoints (such as web interfaces and APIs) to their
-  // URIs.
+  // Output only. Map of remote access endpoints (such as web interfaces and
+  // APIs) to their URIs.
   map<string, string> endpoints = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
-  // Output only. A URI pointing to the location of the stdout and stderr of the workload.
+  // Output only. A URI pointing to the location of the stdout and stderr of
+  // the workload.
   string output_uri = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
   // Output only. A URI pointing to the location of the diagnostics tarball.
   string diagnostic_output_uri = 3 [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Output only. Approximate workload resource usage calculated after workload
+  // finishes (see [Dataproc Serverless
+  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+  UsageMetrics approximate_usage = 6
+      [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Output only. Snapshot of current workload resource usage.
+  UsageSnapshot current_usage = 7 [(google.api.field_behavior) = OUTPUT_ONLY];
 }
+// Usage metrics represent approximate total resources consumed by a workload.
+message UsageMetrics {
+  // Optional. DCU (Dataproc Compute Units) usage in (`milliDCU` x `seconds`)
+  // (see [Dataproc Serverless
+  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+  int64 milli_dcu_seconds = 1 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. Shuffle storage usage in (`GB` x `seconds`) (see
+  // [Dataproc Serverless
+  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+  int64 shuffle_storage_gb_seconds = 2 [(google.api.field_behavior) = OPTIONAL];
+}
+// The usage snapshot represents the resources consumed by a workload at a
+// specified time.
+message UsageSnapshot {
+  // Optional. Milli (one-thousandth) Dataproc Compute Units (DCUs) (see
+  // [Dataproc Serverless
+  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+  int64 milli_dcu = 1 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. Shuffle storage in gigabytes (GB) (see [Dataproc Serverless
+  // pricing](https://cloud.google.com/dataproc-serverless/pricing)).
+  int64 shuffle_storage_gb = 2 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. The timestamp of the usage snapshot.
+  google.protobuf.Timestamp snapshot_time = 3
+      [(google.api.field_behavior) = OPTIONAL];
+}
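UsageMetrics reports milliDCU-seconds and GB-seconds, while the pricing page quotes per-DCU-hour and per-GB-month rates, so a unit conversion is needed. A worked sketch; the rate parameters are placeholders (take real rates from the pricing page), and the 730-hours-per-month factor is an assumption of this example:

    def approximate_cost(metrics, usd_per_dcu_hour, usd_per_gb_month):
        """Rough cost estimate from a UsageMetrics message (placeholder rates)."""
        dcu_hours = metrics.milli_dcu_seconds / 1000 / 3600           # milliDCU-s -> DCU-h
        gb_months = metrics.shuffle_storage_gb_seconds / 3600 / 730   # GB-s -> GB-month
        return dcu_hours * usd_per_dcu_hour + gb_months * usd_per_gb_month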
 // The cluster's GKE config.
 message GkeClusterConfig {
-  // Optional. A target GKE cluster to deploy to. It must be in the same project and
-  // region as the Dataproc cluster (the GKE cluster can be zonal or regional).
-  // Format: 'projects/{project}/locations/{location}/clusters/{cluster_id}'
+  // Optional. A target GKE cluster to deploy to. It must be in the same project
+  // and region as the Dataproc cluster (the GKE cluster can be zonal or
+  // regional). Format:
+  // 'projects/{project}/locations/{location}/clusters/{cluster_id}'
   string gke_cluster_target = 2 [
-    (google.api.field_behavior) = OPTIONAL
+    (google.api.field_behavior) = OPTIONAL,
+    (google.api.resource_reference) = {
+      type: "container.googleapis.com/Cluster"
+    }
   ];
-  // Optional. GKE NodePools where workloads will be scheduled. At least one node pool
-  // must be assigned the 'default' role. Each role can be given to only a
-  // single NodePoolTarget. All NodePools must have the same location settings.
-  // If a nodePoolTarget is not specified, Dataproc constructs a default
-  // nodePoolTarget.
-  repeated GkeNodePoolTarget node_pool_target = 3 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. GKE node pools where workloads will be scheduled. At least one
+  // node pool must be assigned the `DEFAULT`
+  // [GkeNodePoolTarget.Role][google.cloud.dataproc.v1.GkeNodePoolTarget.Role].
+  // If a `GkeNodePoolTarget` is not specified, Dataproc constructs a `DEFAULT`
+  // `GkeNodePoolTarget`. Each role can be given to only one
+  // `GkeNodePoolTarget`. All node pools must have the same location settings.
+  repeated GkeNodePoolTarget node_pool_target = 3
+      [(google.api.field_behavior) = OPTIONAL];
 }
 // The configuration for running the Dataproc cluster on Kubernetes.
 message KubernetesClusterConfig {
-  // Optional. A namespace within the Kubernetes cluster to deploy into. If this namespace
-  // does not exist, it is created. If it exists, Dataproc
-  // verifies that another Dataproc VirtualCluster is not installed
-  // into it. If not specified, the name of the Dataproc Cluster is used.
+  // Optional. A namespace within the Kubernetes cluster to deploy into. If this
+  // namespace does not exist, it is created. If it exists, Dataproc verifies
+  // that another Dataproc VirtualCluster is not installed into it. If not
+  // specified, the name of the Dataproc Cluster is used.
   string kubernetes_namespace = 1 [(google.api.field_behavior) = OPTIONAL];
   oneof config {
     // Required. The configuration for running the Dataproc cluster on GKE.
-    GkeClusterConfig gke_cluster_config = 2 [(google.api.field_behavior) = REQUIRED];
+    GkeClusterConfig gke_cluster_config = 2
+        [(google.api.field_behavior) = REQUIRED];
   }
-  // Optional. The software configuration for this Dataproc cluster running on Kubernetes.
-  KubernetesSoftwareConfig kubernetes_software_config = 3 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. The software configuration for this Dataproc cluster running on
+  // Kubernetes.
+  KubernetesSoftwareConfig kubernetes_software_config = 3
+      [(google.api.field_behavior) = OPTIONAL];
 }
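How the Kubernetes pieces nest: KubernetesClusterConfig selects a GkeClusterConfig through its config oneof, and the namespace falls back to the Dataproc cluster name when unset. A sketch with hypothetical resource names:

    from google.cloud import dataproc_v1

    k8s_config = dataproc_v1.KubernetesClusterConfig(
        kubernetes_namespace="dataproc",  # optional; defaults to the cluster name
        gke_cluster_config=dataproc_v1.GkeClusterConfig(
            gke_cluster_target=(
                "projects/my-project/locations/us-central1/clusters/my-gke-cluster"
            ),
        ),
    )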
 // The software configuration for this Dataproc cluster running on Kubernetes.
@@ -163,54 +248,60 @@ message KubernetesSoftwareConfig {
   map<string, string> properties = 2;
 }
-// GKE NodePools that Dataproc workloads run on.
+// GKE node pools that Dataproc workloads run on.
 message GkeNodePoolTarget {
-  // `Role` specifies whose tasks will run on the NodePool. The roles can be
-  // specific to workloads. Exactly one GkeNodePoolTarget within the
-  // VirtualCluster must have 'default' role, which is used to run all workloads
-  // that are not associated with a NodePool.
+  // `Role` specifies the tasks that will run on the node pool. Roles can be
+  // specific to workloads. Exactly one
+  // [GkeNodePoolTarget][google.cloud.dataproc.v1.GkeNodePoolTarget] within the
+  // virtual cluster must have the `DEFAULT` role, which is used to run all
+  // workloads that are not associated with a node pool.
   enum Role {
     // Role is unspecified.
     ROLE_UNSPECIFIED = 0;
-    // Any roles that are not directly assigned to a NodePool run on the
-    // `default` role's NodePool.
+    // At least one node pool must have the `DEFAULT` role.
+    // Work assigned to a role that is not associated with a node pool
+    // is assigned to the node pool with the `DEFAULT` role. For example,
+    // work assigned to the `CONTROLLER` role will be assigned to the node pool
+    // with the `DEFAULT` role if no node pool has the `CONTROLLER` role.
    DEFAULT = 1;
-    // Run controllers and webhooks.
+    // Run work associated with the Dataproc control plane (for example,
+    // controllers and webhooks). Very low resource requirements.
     CONTROLLER = 2;
-    // Run spark driver.
+    // Run work associated with a Spark driver of a job.
     SPARK_DRIVER = 3;
-    // Run spark executors.
+    // Run work associated with a Spark executor of a job.
     SPARK_EXECUTOR = 4;
   }
-  // Required. The target GKE NodePool.
+  // Required. The target GKE node pool.
   // Format:
   // 'projects/{project}/locations/{location}/clusters/{cluster}/nodePools/{node_pool}'
-  string node_pool = 1 [
-    (google.api.field_behavior) = REQUIRED
-  ];
+  string node_pool = 1 [(google.api.field_behavior) = REQUIRED];
-  // Required. The types of role for a GKE NodePool
+  // Required. The roles associated with the GKE node pool.
   repeated Role roles = 2 [(google.api.field_behavior) = REQUIRED];
-  // Optional. The configuration for the GKE NodePool.
+  // Input only. The configuration for the GKE node pool.
   //
-  // If specified, Dataproc attempts to create a NodePool with the
+  // If specified, Dataproc attempts to create a node pool with the
   // specified shape. If one with the same name already exists, it is
   // verified against all specified fields. If a field differs, the
   // virtual cluster creation will fail.
   //
-  // If omitted, any NodePool with the specified name is used. If a
-  // NodePool with the specified name does not exist, Dataproc create a NodePool
-  // with default values.
-  GkeNodePoolConfig node_pool_config = 3 [(google.api.field_behavior) = OPTIONAL];
+  // If omitted, any node pool with the specified name is used. If a
+  // node pool with the specified name does not exist, Dataproc creates a
+  // node pool with default values.
+  //
+  // This is an input only field. It will not be returned by the API.
+  GkeNodePoolConfig node_pool_config = 3
+      [(google.api.field_behavior) = INPUT_ONLY];
 }
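The role rules above in code form: exactly one target in the virtual cluster carries DEFAULT, and node_pool_config only shapes creation (input only, never returned). A sketch with hypothetical names:

    from google.cloud import dataproc_v1

    target = dataproc_v1.GkeNodePoolTarget(
        node_pool=(
            "projects/my-project/locations/us-central1/"
            "clusters/my-gke-cluster/nodePools/dataproc-default"
        ),
        # Exactly one GkeNodePoolTarget in the virtual cluster must have DEFAULT.
        roles=[dataproc_v1.GkeNodePoolTarget.Role.DEFAULT],
        # node_pool_config (INPUT_ONLY) could be attached here; it is consumed
        # at creation time and not returned by the API.
    )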
-// The configuration of a GKE NodePool used by a [Dataproc-on-GKE
+// The configuration of a GKE node pool used by a [Dataproc-on-GKE
 // cluster](https://cloud.google.com/dataproc/docs/concepts/jobs/dataproc-gke#create-a-dataproc-on-gke-cluster).
 message GkeNodePoolConfig {
   // Parameters that describe cluster nodes.
@@ -219,19 +310,28 @@ message GkeNodePoolConfig {
   // type](https://cloud.google.com/compute/docs/machine-types).
   string machine_type = 1 [(google.api.field_behavior) = OPTIONAL];
-  // Optional. Whether the nodes are created as [preemptible VM
-  // instances](https://cloud.google.com/compute/docs/instances/preemptible).
-  bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];
-  // Optional. The number of local SSD disks to attach to the node, which is limited by
-  // the maximum number of disks allowable per zone (see [Adding Local
-  // SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
+  // Optional. The number of local SSD disks to attach to the node, which is
+  // limited by the maximum number of disks allowable per zone (see [Adding
+  // Local SSDs](https://cloud.google.com/compute/docs/disks/local-ssd)).
   int32 local_ssd_count = 7 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. Whether the nodes are created as legacy [preemptible VM
+  // instances](https://cloud.google.com/compute/docs/instances/preemptible).
+  // Also see
+  // [Spot][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.spot]
+  // VMs, preemptible VM instances without a maximum lifetime. Legacy and Spot
+  // preemptible nodes cannot be used in a node pool with the `CONTROLLER`
+  // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
+  // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
+  // DEFAULT node pool will assume the CONTROLLER role).
+  bool preemptible = 10 [(google.api.field_behavior) = OPTIONAL];
   // Optional. A list of [hardware
   // accelerators](https://cloud.google.com/compute/docs/gpus) to attach to
   // each node.
-  repeated GkeNodePoolAcceleratorConfig accelerators = 11 [(google.api.field_behavior) = OPTIONAL];
+  repeated GkeNodePoolAcceleratorConfig accelerators = 11
+      [(google.api.field_behavior) = OPTIONAL];
   // Optional. [Minimum CPU
   // platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform)
@@ -239,26 +339,51 @@ message GkeNodePoolConfig {
   // specified or a newer CPU platform. Specify the friendly names of CPU
   // platforms, such as "Intel Haswell" or "Intel Sandy Bridge".
   string min_cpu_platform = 13 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. The [Customer Managed Encryption Key (CMEK)]
+  // (https://cloud.google.com/kubernetes-engine/docs/how-to/using-cmek)
+  // used to encrypt the boot disk attached to each node in the node pool.
+  // Specify the key using the following format:
+  // <code>projects/<var>KEY_PROJECT_ID</var>/locations/<var>LOCATION</var>/keyRings/<var>RING_NAME</var>/cryptoKeys/<var>KEY_NAME</var></code>.
+  string boot_disk_kms_key = 23 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. Whether the nodes are created as [Spot VM instances]
+  // (https://cloud.google.com/compute/docs/instances/spot).
+  // Spot VMs are the latest update to legacy
+  // [preemptible
+  // VMs][google.cloud.dataproc.v1.GkeNodePoolConfig.GkeNodeConfig.preemptible].
+  // Spot VMs do not have a maximum lifetime. Legacy and Spot preemptible
+  // nodes cannot be used in a node pool with the `CONTROLLER`
+  // [role](/dataproc/docs/reference/rest/v1/projects.regions.clusters#role)
+  // or in the DEFAULT node pool if the CONTROLLER role is not assigned (the
+  // DEFAULT node pool will assume the CONTROLLER role).
+  bool spot = 32 [(google.api.field_behavior) = OPTIONAL];
 }
 // A GkeNodeConfigAcceleratorConfig represents a Hardware Accelerator request
-// for a NodePool.
+// for a node pool.
 message GkeNodePoolAcceleratorConfig {
   // The number of accelerator cards exposed to an instance.
   int64 accelerator_count = 1;
   // The accelerator type resource name (see GPUs on Compute Engine).
   string accelerator_type = 2;
+  // Size of partitions to create on the GPU. Valid values are described in
+  // the NVIDIA [MIG user
+  // guide](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#partitioning).
+  string gpu_partition_size = 3;
 }
 // GkeNodePoolAutoscaling contains information the cluster autoscaler needs to
 // adjust the size of the node pool to the current cluster usage.
 message GkeNodePoolAutoscalingConfig {
-  // The minimum number of nodes in the NodePool. Must be >= 0 and <=
+  // The minimum number of nodes in the node pool. Must be >= 0 and <=
   // max_node_count.
   int32 min_node_count = 2;
-  // The maximum number of nodes in the NodePool. Must be >= min_node_count.
+  // The maximum number of nodes in the node pool. Must be >= min_node_count,
+  // and must be > 0.
+  // **Note:** Quota must be sufficient to scale up the cluster.
   int32 max_node_count = 3;
 }
@@ -268,17 +393,21 @@ message GkeNodePoolConfig {
   // Optional. The list of Compute Engine
   // [zones](https://cloud.google.com/compute/docs/zones#available) where
-  // NodePool's nodes will be located.
+  // node pool nodes associated with a Dataproc on GKE virtual cluster
+  // will be located.
   //
-  // **Note:** Currently, only one zone may be specified.
+  // **Note:** All node pools associated with a virtual cluster
+  // must be located in the same region as the virtual cluster, and they must
+  // be located in the same zone within that region.
   //
-  // If a location is not specified during NodePool creation, Dataproc will
-  // choose a location.
+  // If a location is not specified during node pool creation, Dataproc on GKE
+  // will choose the zone.
   repeated string locations = 13 [(google.api.field_behavior) = OPTIONAL];
-  // Optional. The autoscaler configuration for this NodePool. The autoscaler is enabled
-  // only when a valid configuration is present.
-  GkeNodePoolAutoscalingConfig autoscaling = 4 [(google.api.field_behavior) = OPTIONAL];
+  // Optional. The autoscaler configuration for this node pool. The autoscaler
+  // is enabled only when a valid configuration is present.
+  GkeNodePoolAutoscalingConfig autoscaling = 4
+      [(google.api.field_behavior) = OPTIONAL];
 }
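Pulling the node-pool shape together. This sketch assumes the generated Python client nests GkeNodeConfig and GkeNodePoolAutoscalingConfig under GkeNodePoolConfig, as the cross-references above suggest; the machine type, zone, and node counts are illustrative:

    from google.cloud import dataproc_v1

    pool = dataproc_v1.GkeNodePoolConfig(
        config=dataproc_v1.GkeNodePoolConfig.GkeNodeConfig(
            machine_type="n1-standard-4",
            spot=True,  # disallowed wherever the CONTROLLER role would land
        ),
        locations=["us-central1-a"],  # keep all pools in one zone, per the note
        autoscaling=dataproc_v1.GkeNodePoolConfig.GkeNodePoolAutoscalingConfig(
            min_node_count=0,
            max_node_count=10,  # quota must be sufficient for the scale-up
        ),
    )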
 // Cluster components that can be activated.
@@ -308,12 +437,18 @@ enum Component {
   // The Hive Web HCatalog (the REST service for accessing HCatalog).
   HIVE_WEBHCAT = 3;
+  // Hudi.
+  HUDI = 18;
   // The Jupyter Notebook.
   JUPYTER = 1;
   // The Presto query engine.
   PRESTO = 6;
+  // The Trino query engine.
+  TRINO = 17;
   // The Ranger service.
   RANGER = 12;
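These Component values are consumed by SoftwareConfig.optional_components (defined in clusters.proto, not this file) when creating clusters. A hedged sketch; the image version is illustrative:

    from google.cloud import dataproc_v1

    software = dataproc_v1.SoftwareConfig(
        image_version="2.1",  # illustrative image version
        optional_components=[
            dataproc_v1.Component.JUPYTER,
            dataproc_v1.Component.TRINO,  # added in this change
        ],
    )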