diff --git a/google/genomics/README.md b/google/genomics/README.md new file mode 100644 index 000000000..8bc621804 --- /dev/null +++ b/google/genomics/README.md @@ -0,0 +1,14 @@ +Stores, processes, explores and shares genomic data. This API implements +the Global Alliance for Genomics and Health (GA4GH) v0.5.1 API as well as +several extensions. + +The Google Genomics API supports access via both +[JSON/REST](https://cloud.google.com/genomics/reference/rest) and +[gRPC](https://cloud.google.com/genomics/reference/rpc). JSON/REST is more +broadly available and is easier for getting started with Google Genomics; it +works well for small metadata resources (datasets, variant sets, read group +sets) and for browsing small genomic regions for datasets of any size. For +performant bulk data access (reads and variants), use gRPC. + +See also an [overview of genomic resources](https://cloud.google.com/genomics/v1/users-guide) +and an overview of [Genomics on Google Cloud](https://cloud.google.com/genomics/overview). \ No newline at end of file diff --git a/google/genomics/v1/annotations.proto b/google/genomics/v1/annotations.proto new file mode 100644 index 000000000..a06bc7284 --- /dev/null +++ b/google/genomics/v1/annotations.proto @@ -0,0 +1,662 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/protobuf/empty.proto"; +import "google/protobuf/field_mask.proto"; +import "google/protobuf/struct.proto"; +import "google/protobuf/wrappers.proto"; +import "google/rpc/status.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "AnnotationsProto"; +option java_package = "com.google.genomics.v1"; + + +// This service provides storage and positional retrieval of genomic +// reference annotations, including variant annotations. +service AnnotationServiceV1 { + // Creates a new annotation set. Caller must have WRITE permission for the + // associated dataset. + // + // The following fields are required: + // + // * [datasetId][google.genomics.v1.AnnotationSet.dataset_id] + // * [referenceSetId][google.genomics.v1.AnnotationSet.reference_set_id] + // + // All other fields may be optionally specified, unless documented as being + // server-generated (for example, the `id` field). + rpc CreateAnnotationSet(CreateAnnotationSetRequest) returns (AnnotationSet) { + option (google.api.http) = { post: "/v1/annotationsets" body: "annotation_set" }; + } + + // Gets an annotation set. Caller must have READ permission for + // the associated dataset. + rpc GetAnnotationSet(GetAnnotationSetRequest) returns (AnnotationSet) { + option (google.api.http) = { get: "/v1/annotationsets/{annotation_set_id}" }; + } + + // Updates an annotation set. The update must respect all mutability + // restrictions and other invariants described on the annotation set resource. 
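+  // For example (an illustrative note, not an exhaustive list of invariants),
+  // per [UpdateAnnotationSetRequest][google.genomics.v1.UpdateAnnotationSetRequest]
+  // only the `name`, `source_uri`, and `info` fields may be modified; the
+  // server-generated `id` and the `dataset_id` cannot be changed.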
+ // Caller must have WRITE permission for the associated dataset. + rpc UpdateAnnotationSet(UpdateAnnotationSetRequest) returns (AnnotationSet) { + option (google.api.http) = { put: "/v1/annotationsets/{annotation_set_id}" body: "annotation_set" }; + } + + // Deletes an annotation set. Caller must have WRITE permission + // for the associated annotation set. + rpc DeleteAnnotationSet(DeleteAnnotationSetRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/annotationsets/{annotation_set_id}" }; + } + + // Searches for annotation sets that match the given criteria. Annotation sets + // are returned in an unspecified order. This order is consistent, such that + // two queries for the same content (regardless of page size) yield annotation + // sets in the same order across their respective streams of paginated + // responses. Caller must have READ permission for the queried datasets. + rpc SearchAnnotationSets(SearchAnnotationSetsRequest) returns (SearchAnnotationSetsResponse) { + option (google.api.http) = { post: "/v1/annotationsets/search" body: "*" }; + } + + // Creates a new annotation. Caller must have WRITE permission + // for the associated annotation set. + // + // The following fields are required: + // + // * [annotationSetId][google.genomics.v1.Annotation.annotation_set_id] + // * [referenceName][google.genomics.v1.Annotation.reference_name] or + // [referenceId][google.genomics.v1.Annotation.reference_id] + // + // ### Transcripts + // + // For annotations of type TRANSCRIPT, the following fields of + // [transcript][google.genomics.v1.Annotation.transcript] must be provided: + // + // * [exons.start][google.genomics.v1.Transcript.Exon.start] + // * [exons.end][google.genomics.v1.Transcript.Exon.end] + // + // All other fields may be optionally specified, unless documented as being + // server-generated (for example, the `id` field). The annotated + // range must be no longer than 100Mbp (mega base pairs). See the + // [Annotation resource][google.genomics.v1.Annotation] + // for additional restrictions on each field. + rpc CreateAnnotation(CreateAnnotationRequest) returns (Annotation) { + option (google.api.http) = { post: "/v1/annotations" body: "annotation" }; + } + + // Creates one or more new annotations atomically. All annotations must + // belong to the same annotation set. Caller must have WRITE + // permission for this annotation set. For optimal performance, batch + // positionally adjacent annotations together. + // + // If the request has a systemic issue, such as an attempt to write to + // an inaccessible annotation set, the entire RPC will fail accordingly. For + // lesser data issues, when possible an error will be isolated to the + // corresponding batch entry in the response; the remaining well formed + // annotations will be created normally. + // + // For details on the requirements for each individual annotation resource, + // see + // [CreateAnnotation][google.genomics.v1.AnnotationServiceV1.CreateAnnotation]. + rpc BatchCreateAnnotations(BatchCreateAnnotationsRequest) returns (BatchCreateAnnotationsResponse) { + option (google.api.http) = { post: "/v1/annotations:batchCreate" body: "*" }; + } + + // Gets an annotation. Caller must have READ permission + // for the associated annotation set. + rpc GetAnnotation(GetAnnotationRequest) returns (Annotation) { + option (google.api.http) = { get: "/v1/annotations/{annotation_id}" }; + } + + // Updates an annotation. 
Caller must have + // WRITE permission for the associated dataset. + rpc UpdateAnnotation(UpdateAnnotationRequest) returns (Annotation) { + option (google.api.http) = { put: "/v1/annotations/{annotation_id}" body: "annotation" }; + } + + // Deletes an annotation. Caller must have WRITE permission for + // the associated annotation set. + rpc DeleteAnnotation(DeleteAnnotationRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/annotations/{annotation_id}" }; + } + + // Searches for annotations that match the given criteria. Results are + // ordered by genomic coordinate (by reference sequence, then position). + // Annotations with equivalent genomic coordinates are returned in an + // unspecified order. This order is consistent, such that two queries for the + // same content (regardless of page size) yield annotations in the same order + // across their respective streams of paginated responses. Caller must have + // READ permission for the queried annotation sets. + rpc SearchAnnotations(SearchAnnotationsRequest) returns (SearchAnnotationsResponse) { + option (google.api.http) = { post: "/v1/annotations/search" body: "*" }; + } +} + +// An annotation set is a logical grouping of annotations that share consistent +// type information and provenance. Examples of annotation sets include 'all +// genes from refseq', and 'all variant annotations from ClinVar'. +message AnnotationSet { + // The server-generated annotation set ID, unique across all annotation sets. + string id = 1; + + // The dataset to which this annotation set belongs. + string dataset_id = 2; + + // The ID of the reference set that defines the coordinate space for this + // set's annotations. + string reference_set_id = 3; + + // The display name for this annotation set. + string name = 4; + + // The source URI describing the file from which this annotation set was + // generated, if any. + string source_uri = 5; + + // The type of annotations contained within this set. + AnnotationType type = 6; + + // A map of additional read alignment information. This must be of the form + // map (string key mapping to a list of string values). + map info = 17; +} + +// An annotation describes a region of reference genome. The value of an +// annotation may be one of several canonical types, supplemented by arbitrary +// info tags. An annotation is not inherently associated with a specific +// sample or individual (though a client could choose to use annotations in +// this way). Example canonical annotation types are `GENE` and +// `VARIANT`. +message Annotation { + // The server-generated annotation ID, unique across all annotations. + string id = 1; + + // The annotation set to which this annotation belongs. + string annotation_set_id = 2; + + // The display name of this annotation. + string name = 3; + + // The ID of the Google Genomics reference associated with this range. + string reference_id = 4; + + // The display name corresponding to the reference specified by + // `referenceId`, for example `chr1`, `1`, or `chrX`. + string reference_name = 5; + + // The start position of the range on the reference, 0-based inclusive. + int64 start = 6; + + // The end position of the range on the reference, 0-based exclusive. + int64 end = 7; + + // Whether this range refers to the reverse strand, as opposed to the forward + // strand. Note that regardless of this field, the start/end position of the + // range always refer to the forward strand. 
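+  // For example (hypothetical coordinates), a reverse-strand annotation
+  // covering the last 10 bases of a 1,000-base reference is still expressed
+  // as `start = 990`, `end = 1000`.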
+ bool reverse_strand = 8; + + // The data type for this annotation. Must match the containing annotation + // set's type. + AnnotationType type = 9; + + oneof value { + // A variant annotation, which describes the effect of a variant on the + // genome, the coding sequence, and/or higher level consequences at the + // organism level e.g. pathogenicity. This field is only set for annotations + // of type `VARIANT`. + VariantAnnotation variant = 10; + + // A transcript value represents the assertion that a particular region of + // the reference genome may be transcribed as RNA. An alternative splicing + // pattern would be represented as a separate transcript object. This field + // is only set for annotations of type `TRANSCRIPT`. + Transcript transcript = 11; + } + + // A map of additional read alignment information. This must be of the form + // map (string key mapping to a list of string values). + map info = 12; +} + +message VariantAnnotation { + message ClinicalCondition { + // A set of names for the condition. + repeated string names = 1; + + // The set of external IDs for this condition. + repeated ExternalId external_ids = 2; + + // The MedGen concept id associated with this gene. + // Search for these IDs at http://www.ncbi.nlm.nih.gov/medgen/ + string concept_id = 3; + + // The OMIM id for this condition. + // Search for these IDs at http://omim.org/ + string omim_id = 4; + } + + enum Type { + TYPE_UNSPECIFIED = 0; + + // `TYPE_OTHER` should be used when no other Type will suffice. + // Further explanation of the variant type may be included in the + // [info][google.genomics.v1.Annotation.info] field. + TYPE_OTHER = 1; + + // `INSERTION` indicates an insertion. + INSERTION = 2; + + // `DELETION` indicates a deletion. + DELETION = 3; + + // `SUBSTITUTION` indicates a block substitution of + // two or more nucleotides. + SUBSTITUTION = 4; + + // `SNP` indicates a single nucleotide polymorphism. + SNP = 5; + + // `STRUCTURAL` indicates a large structural variant, + // including chromosomal fusions, inversions, etc. + STRUCTURAL = 6; + + // `CNV` indicates a variation in copy number. + CNV = 7; + } + + enum Effect { + EFFECT_UNSPECIFIED = 0; + + // `EFFECT_OTHER` should be used when no other Effect + // will suffice. + EFFECT_OTHER = 1; + + // `FRAMESHIFT` indicates a mutation in which the insertion or + // deletion of nucleotides resulted in a frameshift change. + FRAMESHIFT = 2; + + // `FRAME_PRESERVING_INDEL` indicates a mutation in which a + // multiple of three nucleotides has been inserted or deleted, resulting + // in no change to the reading frame of the coding sequence. + FRAME_PRESERVING_INDEL = 3; + + // `SYNONYMOUS_SNP` indicates a single nucleotide polymorphism + // mutation that results in no amino acid change. + SYNONYMOUS_SNP = 4; + + // `NONSYNONYMOUS_SNP` indicates a single nucleotide + // polymorphism mutation that results in an amino acid change. + NONSYNONYMOUS_SNP = 5; + + // `STOP_GAIN` indicates a mutation that leads to the creation + // of a stop codon at the variant site. Frameshift mutations creating + // downstream stop codons do not count as `STOP_GAIN`. + STOP_GAIN = 6; + + // `STOP_LOSS` indicates a mutation that eliminates a + // stop codon at the variant site. + STOP_LOSS = 7; + + // `SPLICE_SITE_DISRUPTION` indicates that this variant is + // found in a splice site for the associated transcript, and alters the + // normal splicing pattern. 
+ SPLICE_SITE_DISRUPTION = 8; + } + + enum ClinicalSignificance { + CLINICAL_SIGNIFICANCE_UNSPECIFIED = 0; + + // `OTHER` should be used when no other clinical significance + // value will suffice. + CLINICAL_SIGNIFICANCE_OTHER = 1; + + UNCERTAIN = 2; + + BENIGN = 3; + + LIKELY_BENIGN = 4; + + LIKELY_PATHOGENIC = 5; + + PATHOGENIC = 6; + + DRUG_RESPONSE = 7; + + HISTOCOMPATIBILITY = 8; + + CONFERS_SENSITIVITY = 9; + + RISK_FACTOR = 10; + + ASSOCIATION = 11; + + PROTECTIVE = 12; + + // `MULTIPLE_REPORTED` should be used when multiple clinical + // signficances are reported for a variant. The original clinical + // significance values may be provided in the `info` field. + MULTIPLE_REPORTED = 13; + } + + // Type has been adapted from ClinVar's list of variant types. + Type type = 1; + + // Effect of the variant on the coding sequence. + Effect effect = 2; + + // The alternate allele for this variant. If multiple alternate alleles + // exist at this location, create a separate variant for each one, as they + // may represent distinct conditions. + string alternate_bases = 3; + + // Google annotation ID of the gene affected by this variant. This should + // be provided when the variant is created. + string gene_id = 4; + + // Google annotation IDs of the transcripts affected by this variant. These + // should be provided when the variant is created. + repeated string transcript_ids = 5; + + // The set of conditions associated with this variant. + // A condition describes the way a variant influences human health. + repeated ClinicalCondition conditions = 6; + + // Describes the clinical significance of a variant. + // It is adapted from the ClinVar controlled vocabulary for clinical + // significance described at: + // http://www.ncbi.nlm.nih.gov/clinvar/docs/clinsig/ + ClinicalSignificance clinical_significance = 7; +} + +// A transcript represents the assertion that a particular region of the +// reference genome may be transcribed as RNA. +message Transcript { + message Exon { + // The start position of the exon on this annotation's reference sequence, + // 0-based inclusive. Note that this is relative to the reference start, and + // **not** the containing annotation start. + int64 start = 1; + + // The end position of the exon on this annotation's reference sequence, + // 0-based exclusive. Note that this is relative to the reference start, and + // *not* the containing annotation start. + int64 end = 2; + + // The frame of this exon. Contains a value of 0, 1, or 2, which indicates + // the offset of the first coding base of the exon within the reading frame + // of the coding DNA sequence, if any. This field is dependent on the + // strandedness of this annotation (see + // [Annotation.reverse_strand][google.genomics.v1.Annotation.reverse_strand]). + // For forward stranded annotations, this offset is relative to the + // [exon.start][google.genomics.v1.Transcript.Exon.start]. For reverse + // strand annotations, this offset is relative to the + // [exon.end][google.genomics.v1.Transcript.Exon.end] `- 1`. + // + // Unset if this exon does not intersect the coding sequence. Upon creation + // of a transcript, the frame must be populated for all or none of the + // coding exons. + google.protobuf.Int32Value frame = 3; + } + + message CodingSequence { + // The start of the coding sequence on this annotation's reference sequence, + // 0-based inclusive. Note that this position is relative to the reference + // start, and *not* the containing annotation start. 
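+    // For example (hypothetical coordinates), if a transcript annotation
+    // spans reference positions `[1000, 5000)` and its coding region begins
+    // 200 bases into that span, then `start` is `1200`, not `200`.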
+ int64 start = 1; + + // The end of the coding sequence on this annotation's reference sequence, + // 0-based exclusive. Note that this position is relative to the reference + // start, and *not* the containing annotation start. + int64 end = 2; + } + + // The annotation ID of the gene from which this transcript is transcribed. + string gene_id = 1; + + // The exons that compose + // this transcript. This field should be unset for genomes where transcript + // splicing does not occur, for example prokaryotes. + // + // Introns are regions of the transcript that are not included in the + // spliced RNA product. Though not explicitly modeled here, intron ranges can + // be deduced; all regions of this transcript that are not exons are introns. + // + // Exonic sequences do not necessarily code for a translational product + // (amino acids). Only the regions of exons bounded by the + // [codingSequence][google.genomics.v1.Transcript.coding_sequence] correspond + // to coding DNA sequence. + // + // Exons are ordered by start position and may not overlap. + repeated Exon exons = 2; + + // The range of the coding sequence for this transcript, if any. To determine + // the exact ranges of coding sequence, intersect this range with those of the + // [exons][google.genomics.v1.Transcript.exons], if any. If there are any + // [exons][google.genomics.v1.Transcript.exons], the + // [codingSequence][google.genomics.v1.Transcript.coding_sequence] must start + // and end within them. + // + // Note that in some cases, the reference genome will not exactly match the + // observed mRNA transcript e.g. due to variance in the source genome from + // reference. In these cases, + // [exon.frame][google.genomics.v1.Transcript.Exon.frame] will not necessarily + // match the expected reference reading frame and coding exon reference bases + // cannot necessarily be concatenated to produce the original transcript mRNA. + CodingSequence coding_sequence = 3; +} + +message ExternalId { + // The name of the source of this data. + string source_name = 1; + + // The id used by the source of this data. + string id = 2; +} + +message CreateAnnotationSetRequest { + // The annotation set to create. + AnnotationSet annotation_set = 1; +} + +message GetAnnotationSetRequest { + // The ID of the annotation set to be retrieved. + string annotation_set_id = 1; +} + +message UpdateAnnotationSetRequest { + // The ID of the annotation set to be updated. + string annotation_set_id = 1; + + // The new annotation set. + AnnotationSet annotation_set = 2; + + // An optional mask specifying which fields to update. Mutable fields are + // [name][google.genomics.v1.AnnotationSet.name], + // [source_uri][google.genomics.v1.AnnotationSet.source_uri], and + // [info][google.genomics.v1.AnnotationSet.info]. If unspecified, all + // mutable fields will be updated. + google.protobuf.FieldMask update_mask = 3; +} + +message DeleteAnnotationSetRequest { + // The ID of the annotation set to be deleted. + string annotation_set_id = 1; +} + +message SearchAnnotationSetsRequest { + // Required. The dataset IDs to search within. Caller must have `READ` access + // to these datasets. + repeated string dataset_ids = 1; + + // If specified, only annotation sets associated with the given reference set + // are returned. + string reference_set_id = 2; + + // Only return annotations sets for which a substring of the name matches this + // string (case insensitive). 
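+  // For example, a hypothetical JSON/REST request body such as
+  //
+  //     { "datasetIds": ["my-dataset-id"], "name": "clinvar" }
+  //
+  // would match annotation sets in the given dataset whose display names
+  // contain "clinvar", ignoring case.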
+ string name = 3; + + // If specified, only annotation sets that have any of these types are + // returned. + repeated AnnotationType types = 4; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 5; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 128. The maximum value is 1024. + int32 page_size = 6; +} + +message SearchAnnotationSetsResponse { + // The matching annotation sets. + repeated AnnotationSet annotation_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +message CreateAnnotationRequest { + // The annotation to be created. + Annotation annotation = 1; +} + +message BatchCreateAnnotationsRequest { + // The annotations to be created. At most 4096 can be specified in a single + // request. + repeated Annotation annotations = 1; +} + +message BatchCreateAnnotationsResponse { + message Entry { + // The creation status. + google.rpc.Status status = 1; + + // The created annotation, if creation was successful. + Annotation annotation = 2; + } + + // The resulting per-annotation entries, ordered consistently with the + // original request. + repeated Entry entries = 1; +} + +message GetAnnotationRequest { + // The ID of the annotation to be retrieved. + string annotation_id = 1; +} + +message UpdateAnnotationRequest { + // The ID of the annotation to be updated. + string annotation_id = 1; + + // The new annotation. + Annotation annotation = 2; + + // An optional mask specifying which fields to update. Mutable fields are + // [name][google.genomics.v1.Annotation.name], + // [variant][google.genomics.v1.Annotation.variant], + // [transcript][google.genomics.v1.Annotation.transcript], and + // [info][google.genomics.v1.Annotation.info]. If unspecified, all mutable + // fields will be updated. + google.protobuf.FieldMask update_mask = 3; +} + +message DeleteAnnotationRequest { + // The ID of the annotation to be deleted. + string annotation_id = 1; +} + +message SearchAnnotationsRequest { + // Required. The annotation sets to search within. The caller must have + // `READ` access to these annotation sets. + // All queried annotation sets must have the same type. + repeated string annotation_set_ids = 1; + + // Required. `reference_id` or `reference_name` must be set. + oneof reference { + // The ID of the reference to query. + string reference_id = 2; + + // The name of the reference to query, within the reference set associated + // with this query. + string reference_name = 3; + } + + // The start position of the range on the reference, 0-based inclusive. If + // specified, + // [referenceId][google.genomics.v1.SearchAnnotationsRequest.reference_id] or + // [referenceName][google.genomics.v1.SearchAnnotationsRequest.reference_name] + // must be specified. Defaults to 0. + int64 start = 4; + + // The end position of the range on the reference, 0-based exclusive. If + // [referenceId][google.genomics.v1.SearchAnnotationsRequest.reference_id] or + // [referenceName][google.genomics.v1.SearchAnnotationsRequest.reference_name] + // must be specified, Defaults to the length of the reference. 
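+  // For example, a hypothetical JSON/REST request body such as
+  //
+  //     { "annotationSetIds": ["my-annotation-set-id"],
+  //       "referenceName": "chr1",
+  //       "start": 100000,
+  //       "end": 200000 }
+  //
+  // searches a 100 kb window on `chr1` within the queried annotation sets.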
+ int64 end = 5; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 6; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 256. The maximum value is 2048. + int32 page_size = 7; +} + +message SearchAnnotationsResponse { + // The matching annotations. + repeated Annotation annotations = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// When an [Annotation][google.genomics.v1.Annotation] or +// [AnnotationSet][google.genomics.v1.AnnotationSet] is created, if `type` is +// not specified it will be set to `GENERIC`. +enum AnnotationType { + ANNOTATION_TYPE_UNSPECIFIED = 0; + + // A `GENERIC` annotation type should be used when no other annotation + // type will suffice. This represents an untyped annotation of the reference + // genome. + GENERIC = 1; + + // A `VARIANT` annotation type. + VARIANT = 2; + + // A `GENE` annotation type represents the existence of a gene at the + // associated reference coordinates. The start coordinate is typically the + // gene's transcription start site and the end is typically the end of the + // gene's last exon. + GENE = 3; + + // A `TRANSCRIPT` annotation type represents the assertion that a + // particular region of the reference genome may be transcribed as RNA. + TRANSCRIPT = 4; +} diff --git a/google/genomics/v1/cigar.proto b/google/genomics/v1/cigar.proto new file mode 100644 index 000000000..8d41f5577 --- /dev/null +++ b/google/genomics/v1/cigar.proto @@ -0,0 +1,98 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "CigarProto"; +option java_package = "com.google.genomics.v1"; + + +// A single CIGAR operation. +message CigarUnit { + // Describes the different types of CIGAR alignment operations that exist. + // Used wherever CIGAR alignments are used. + enum Operation { + OPERATION_UNSPECIFIED = 0; + + // An alignment match indicates that a sequence can be aligned to the + // reference without evidence of an INDEL. Unlike the + // `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, + // the `ALIGNMENT_MATCH` operator does not indicate whether the + // reference and read sequences are an exact match. This operator is + // equivalent to SAM's `M`. + ALIGNMENT_MATCH = 1; + + // The insert operator indicates that the read contains evidence of bases + // being inserted into the reference. This operator is equivalent to SAM's + // `I`. 
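+    // For example, in the (hypothetical) SAM CIGAR string `3M2I4M`, the `2I`
+    // operation denotes two read bases with no corresponding reference bases.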
+ INSERT = 2; + + // The delete operator indicates that the read contains evidence of bases + // being deleted from the reference. This operator is equivalent to SAM's + // `D`. + DELETE = 3; + + // The skip operator indicates that this read skips a long segment of the + // reference, but the bases have not been deleted. This operator is commonly + // used when working with RNA-seq data, where reads may skip long segments + // of the reference between exons. This operator is equivalent to SAM's + // `N`. + SKIP = 4; + + // The soft clip operator indicates that bases at the start/end of a read + // have not been considered during alignment. This may occur if the majority + // of a read maps, except for low quality bases at the start/end of a read. + // This operator is equivalent to SAM's `S`. Bases that are soft + // clipped will still be stored in the read. + CLIP_SOFT = 5; + + // The hard clip operator indicates that bases at the start/end of a read + // have been omitted from this alignment. This may occur if this linear + // alignment is part of a chimeric alignment, or if the read has been + // trimmed (for example, during error correction or to trim poly-A tails for + // RNA-seq). This operator is equivalent to SAM's `H`. + CLIP_HARD = 6; + + // The pad operator indicates that there is padding in an alignment. This + // operator is equivalent to SAM's `P`. + PAD = 7; + + // This operator indicates that this portion of the aligned sequence exactly + // matches the reference. This operator is equivalent to SAM's `=`. + SEQUENCE_MATCH = 8; + + // This operator indicates that this portion of the aligned sequence is an + // alignment match to the reference, but a sequence mismatch. This can + // indicate a SNP or a read error. This operator is equivalent to SAM's + // `X`. + SEQUENCE_MISMATCH = 9; + } + + Operation operation = 1; + + // The number of genomic bases that the operation runs for. Required. + int64 operation_length = 2; + + // `referenceSequence` is only used at mismatches + // (`SEQUENCE_MISMATCH`) and deletions (`DELETE`). + // Filling this field replaces SAM's MD tag. If the relevant information is + // not available, this field is unset. + string reference_sequence = 3; +} diff --git a/google/genomics/v1/datasets.proto b/google/genomics/v1/datasets.proto new file mode 100644 index 000000000..ad38332a4 --- /dev/null +++ b/google/genomics/v1/datasets.proto @@ -0,0 +1,211 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/iam/v1/iam_policy.proto"; +import "google/iam/v1/policy.proto"; +import "google/protobuf/empty.proto"; +import "google/protobuf/field_mask.proto"; +import "google/protobuf/timestamp.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "DatasetsProto"; +option java_package = "com.google.genomics.v1"; + + +// This service manages datasets, which are collections of genomic data. +service DatasetServiceV1 { + // Lists datasets within a project. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc ListDatasets(ListDatasetsRequest) returns (ListDatasetsResponse) { + option (google.api.http) = { get: "/v1/datasets" }; + } + + // Creates a new dataset. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc CreateDataset(CreateDatasetRequest) returns (Dataset) { + option (google.api.http) = { post: "/v1/datasets" body: "dataset" }; + } + + // Gets a dataset by ID. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc GetDataset(GetDatasetRequest) returns (Dataset) { + option (google.api.http) = { get: "/v1/datasets/{dataset_id}" }; + } + + // Updates a dataset. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // This method supports patch semantics. + rpc UpdateDataset(UpdateDatasetRequest) returns (Dataset) { + option (google.api.http) = { patch: "/v1/datasets/{dataset_id}" body: "dataset" }; + } + + // Deletes a dataset and all of its contents (all read group sets, + // reference sets, variant sets, call sets, annotation sets, etc.) + // This is reversible (up to one week after the deletion) via + // the + // [datasets.undelete][google.genomics.v1.DatasetServiceV1.UndeleteDataset] + // operation. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc DeleteDataset(DeleteDatasetRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/datasets/{dataset_id}" }; + } + + // Undeletes a dataset by restoring a dataset which was deleted via this API. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // This operation is only possible for a week after the deletion occurred. + rpc UndeleteDataset(UndeleteDatasetRequest) returns (Dataset) { + option (google.api.http) = { post: "/v1/datasets/{dataset_id}:undelete" body: "*" }; + } + + // Sets the access control policy on the specified dataset. Replaces any + // existing policy. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // See Setting a + // Policy for more information. 
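+  // For example (a hypothetical JSON/REST request body; the role and member
+  // shown are placeholders):
+  //
+  //     { "policy": { "bindings": [ {
+  //         "role": "roles/genomics.reader",
+  //         "members": ["user:alice@example.com"] } ] } }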
+ rpc SetIamPolicy(google.iam.v1.SetIamPolicyRequest) returns (google.iam.v1.Policy) { + option (google.api.http) = { post: "/v1/{resource=datasets/*}:setIamPolicy" body: "*" }; + } + + // Gets the access control policy for the dataset. This is empty if the + // policy or resource does not exist. + // + // See Getting a + // Policy for more information. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc GetIamPolicy(google.iam.v1.GetIamPolicyRequest) returns (google.iam.v1.Policy) { + option (google.api.http) = { post: "/v1/{resource=datasets/*}:getIamPolicy" body: "*" }; + } + + // Returns permissions that a caller has on the specified resource. + // See Testing + // Permissions for more information. + // + // For the definitions of datasets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc TestIamPermissions(google.iam.v1.TestIamPermissionsRequest) returns (google.iam.v1.TestIamPermissionsResponse) { + option (google.api.http) = { post: "/v1/{resource=datasets/*}:testIamPermissions" body: "*" }; + } +} + +// A Dataset is a collection of genomic data. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +message Dataset { + // The server-generated dataset ID, unique across all datasets. + string id = 1; + + // The Google Developers Console project ID that this dataset belongs to. + string project_id = 2; + + // The dataset name. + string name = 3; + + // The time this dataset was created, in seconds from the epoch. + google.protobuf.Timestamp create_time = 4; +} + +// The dataset list request. +message ListDatasetsRequest { + // Required. The project to list datasets for. + string project_id = 1; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 50. The maximum value is 1024. + int32 page_size = 2; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 3; +} + +// The dataset list response. +message ListDatasetsResponse { + // The list of matching Datasets. + repeated Dataset datasets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +message CreateDatasetRequest { + // The dataset to be created. Must contain projectId and name. + Dataset dataset = 1; +} + +message UpdateDatasetRequest { + // The ID of the dataset to be updated. + string dataset_id = 1; + + // The new dataset data. + Dataset dataset = 2; + + // An optional mask specifying which fields to update. At this time, the only + // mutable field is [name][google.genomics.v1.Dataset.name]. The only + // acceptable value is "name". If unspecified, all mutable fields will be + // updated. + google.protobuf.FieldMask update_mask = 3; +} + +message DeleteDatasetRequest { + // The ID of the dataset to be deleted. + string dataset_id = 1; +} + +message UndeleteDatasetRequest { + // The ID of the dataset to be undeleted. 
+ string dataset_id = 1; +} + +message GetDatasetRequest { + // The ID of the dataset. + string dataset_id = 1; +} diff --git a/google/genomics/v1/operations.proto b/google/genomics/v1/operations.proto new file mode 100644 index 000000000..465bdd268 --- /dev/null +++ b/google/genomics/v1/operations.proto @@ -0,0 +1,58 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/protobuf/any.proto"; +import "google/protobuf/timestamp.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "OperationsProto"; +option java_package = "com.google.genomics.v1"; + + +// Metadata describing an [Operation][google.longrunning.Operation]. +message OperationMetadata { + // The Google Cloud Project in which the job is scoped. + string project_id = 1; + + // The time at which the job was submitted to the Genomics service. + google.protobuf.Timestamp create_time = 2; + + // The time at which the job stopped running. + google.protobuf.Timestamp end_time = 4; + + // The original request that started the operation. Note that this will be in + // current version of the API. If the operation was started with v1beta2 API + // and a GetOperation is performed on v1 API, a v1 request will be returned. + google.protobuf.Any request = 5; + + // Optional event messages that were generated during the job's execution. + // This also contains any warnings that were generated during import + // or export. + repeated OperationEvent events = 6; + + // Runtime metadata on this Operation. + google.protobuf.Any runtime_metadata = 8; +} + +// An event that occurred during an [Operation][google.longrunning.Operation]. +message OperationEvent { + // Required description of event. + string description = 3; +} diff --git a/google/genomics/v1/position.proto b/google/genomics/v1/position.proto new file mode 100644 index 000000000..e860ff009 --- /dev/null +++ b/google/genomics/v1/position.proto @@ -0,0 +1,41 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "PositionProto"; +option java_package = "com.google.genomics.v1"; + + +// An abstraction for referring to a genomic position, in relation to some +// already known reference. For now, represents a genomic position as a +// reference name, a base number on that reference (0-based), and a +// determination of forward or reverse strand. +message Position { + // The name of the reference in whatever reference set is being used. + string reference_name = 1; + + // The 0-based offset from the start of the forward strand for that reference. + int64 position = 2; + + // Whether this position is on the reverse strand, as opposed to the forward + // strand. + bool reverse_strand = 3; +} diff --git a/google/genomics/v1/range.proto b/google/genomics/v1/range.proto new file mode 100644 index 000000000..6b300aa68 --- /dev/null +++ b/google/genomics/v1/range.proto @@ -0,0 +1,38 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "RangeProto"; +option java_package = "com.google.genomics.v1"; + + +// A 0-based half-open genomic coordinate range for search requests. +message Range { + // The reference sequence name, for example `chr1`, + // `1`, or `chrX`. + string reference_name = 1; + + // The start position of the range on the reference, 0-based inclusive. + int64 start = 2; + + // The end position of the range on the reference, 0-based exclusive. + int64 end = 3; +} diff --git a/google/genomics/v1/readalignment.proto b/google/genomics/v1/readalignment.proto new file mode 100644 index 000000000..71655863e --- /dev/null +++ b/google/genomics/v1/readalignment.proto @@ -0,0 +1,220 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/genomics/v1/cigar.proto"; +import "google/genomics/v1/position.proto"; +import "google/protobuf/struct.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "ReadAlignmentProto"; +option java_package = "com.google.genomics.v1"; + + +// A linear alignment can be represented by one CIGAR string. Describes the +// mapped position and local alignment of the read to the reference. +message LinearAlignment { + // The position of this alignment. + Position position = 1; + + // The mapping quality of this alignment. Represents how likely + // the read maps to this position as opposed to other locations. + // + // Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to + // the nearest integer. + int32 mapping_quality = 2; + + // Represents the local alignment of this sequence (alignment matches, indels, + // etc) against the reference. + repeated CigarUnit cigar = 3; +} + +// A read alignment describes a linear alignment of a string of DNA to a +// [reference sequence][google.genomics.v1.Reference], in addition to metadata +// about the fragment (the molecule of DNA sequenced) and the read (the bases +// which were read by the sequencer). A read is equivalent to a line in a SAM +// file. A read belongs to exactly one read group and exactly one +// [read group set][google.genomics.v1.ReadGroupSet]. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +// +// ### Reverse-stranded reads +// +// Mapped reads (reads having a non-null `alignment`) can be aligned to either +// the forward or the reverse strand of their associated reference. Strandedness +// of a mapped read is encoded by `alignment.position.reverseStrand`. +// +// If we consider the reference to be a forward-stranded coordinate space of +// `[0, reference.length)` with `0` as the left-most position and +// `reference.length` as the right-most position, reads are always aligned left +// to right. That is, `alignment.position.position` always refers to the +// left-most reference coordinate and `alignment.cigar` describes the alignment +// of this read to the reference from left to right. All per-base fields such as +// `alignedSequence` and `alignedQuality` share this same left-to-right +// orientation; this is true of reads which are aligned to either strand. For +// reverse-stranded reads, this means that `alignedSequence` is the reverse +// complement of the bases that were originally reported by the sequencing +// machine. +// +// ### Generating a reference-aligned sequence string +// +// When interacting with mapped reads, it's often useful to produce a string +// representing the local alignment of the read to reference. 
The following +// pseudocode demonstrates one way of doing this: +// +// out = "" +// offset = 0 +// for c in read.alignment.cigar { +// switch c.operation { +// case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH": +// out += read.alignedSequence[offset:offset+c.operationLength] +// offset += c.operationLength +// break +// case "CLIP_SOFT", "INSERT": +// offset += c.operationLength +// break +// case "PAD": +// out += repeat("*", c.operationLength) +// break +// case "DELETE": +// out += repeat("-", c.operationLength) +// break +// case "SKIP": +// out += repeat(" ", c.operationLength) +// break +// case "CLIP_HARD": +// break +// } +// } +// return out +// +// ### Converting to SAM's CIGAR string +// +// The following pseudocode generates a SAM CIGAR string from the +// `cigar` field. Note that this is a lossy conversion +// (`cigar.referenceSequence` is lost). +// +// cigarMap = { +// "ALIGNMENT_MATCH": "M", +// "INSERT": "I", +// "DELETE": "D", +// "SKIP": "N", +// "CLIP_SOFT": "S", +// "CLIP_HARD": "H", +// "PAD": "P", +// "SEQUENCE_MATCH": "=", +// "SEQUENCE_MISMATCH": "X", +// } +// cigarStr = "" +// for c in read.alignment.cigar { +// cigarStr += c.operationLength + cigarMap[c.operation] +// } +// return cigarStr +message Read { + // The server-generated read ID, unique across all reads. This is different + // from the `fragmentName`. + string id = 1; + + // The ID of the read group this read belongs to. A read belongs to exactly + // one read group. This is a server-generated ID which is distinct from SAM's + // RG tag (for that value, see + // [ReadGroup.name][google.genomics.v1.ReadGroup.name]). + string read_group_id = 2; + + // The ID of the read group set this read belongs to. A read belongs to + // exactly one read group set. + string read_group_set_id = 3; + + // The fragment name. Equivalent to QNAME (query template name) in SAM. + string fragment_name = 4; + + // The orientation and the distance between reads from the fragment are + // consistent with the sequencing protocol (SAM flag 0x2). + bool proper_placement = 5; + + // The fragment is a PCR or optical duplicate (SAM flag 0x400). + bool duplicate_fragment = 6; + + // The observed length of the fragment, equivalent to TLEN in SAM. + int32 fragment_length = 7; + + // The read number in sequencing. 0-based and less than numberReads. This + // field replaces SAM flag 0x40 and 0x80. + int32 read_number = 8; + + // The number of reads in the fragment (extension to SAM flag 0x1). + int32 number_reads = 9; + + // Whether this read did not pass filters, such as platform or vendor quality + // controls (SAM flag 0x200). + bool failed_vendor_quality_checks = 10; + + // The linear alignment for this alignment record. This field is null for + // unmapped reads. + LinearAlignment alignment = 11; + + // Whether this alignment is secondary. Equivalent to SAM flag 0x100. + // A secondary alignment represents an alternative to the primary alignment + // for this read. Aligners may return secondary alignments if a read can map + // ambiguously to multiple coordinates in the genome. By convention, each read + // has one and only one alignment where both `secondaryAlignment` + // and `supplementaryAlignment` are false. + bool secondary_alignment = 12; + + // Whether this alignment is supplementary. Equivalent to SAM flag 0x800. + // Supplementary alignments are used in the representation of a chimeric + // alignment. 
In a chimeric alignment, a read is split into multiple + // linear alignments that map to different reference contigs. The first + // linear alignment in the read will be designated as the representative + // alignment; the remaining linear alignments will be designated as + // supplementary alignments. These alignments may have different mapping + // quality scores. In each linear alignment in a chimeric alignment, the read + // will be hard clipped. The `alignedSequence` and + // `alignedQuality` fields in the alignment record will only + // represent the bases for its respective linear alignment. + bool supplementary_alignment = 13; + + // The bases of the read sequence contained in this alignment record, + // **without CIGAR operations applied** (equivalent to SEQ in SAM). + // `alignedSequence` and `alignedQuality` may be + // shorter than the full read sequence and quality. This will occur if the + // alignment is part of a chimeric alignment, or if the read was trimmed. When + // this occurs, the CIGAR for this read will begin/end with a hard clip + // operator that will indicate the length of the excised sequence. + string aligned_sequence = 14; + + // The quality of the read sequence contained in this alignment record + // (equivalent to QUAL in SAM). + // `alignedSequence` and `alignedQuality` may be shorter than the full read + // sequence and quality. This will occur if the alignment is part of a + // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR + // for this read will begin/end with a hard clip operator that will indicate + // the length of the excised sequence. + repeated int32 aligned_quality = 15; + + // The mapping of the primary alignment of the + // `(readNumber+1)%numberReads` read in the fragment. It replaces + // mate position and mate strand in SAM. + Position next_mate_position = 16; + + // A map of additional read alignment information. This must be of the form + // map (string key mapping to a list of string values). + map info = 17; +} diff --git a/google/genomics/v1/readgroup.proto b/google/genomics/v1/readgroup.proto new file mode 100644 index 000000000..93c1e11cc --- /dev/null +++ b/google/genomics/v1/readgroup.proto @@ -0,0 +1,105 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/protobuf/struct.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "ReadGroupProto"; +option java_package = "com.google.genomics.v1"; + + +// A read group is all the data that's processed the same way by the sequencer. +message ReadGroup { + message Experiment { + // A client-supplied library identifier; a library is a collection of DNA + // fragments which have been prepared for sequencing from a sample. This + // field is important for quality control as error or bias can be introduced + // during sample preparation. 
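+    // For example, a hypothetical identifier might be `library-NA12878-1`.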
+ string library_id = 1; + + // The platform unit used as part of this experiment, for example + // flowcell-barcode.lane for Illumina or slide for SOLiD. Corresponds to the + // @RG PU field in the SAM spec. + string platform_unit = 2; + + // The sequencing center used as part of this experiment. + string sequencing_center = 3; + + // The instrument model used as part of this experiment. This maps to + // sequencing technology in the SAM spec. + string instrument_model = 4; + } + + message Program { + // The command line used to run this program. + string command_line = 1; + + // The user specified locally unique ID of the program. Used along with + // `prevProgramId` to define an ordering between programs. + string id = 2; + + // The display name of the program. This is typically the colloquial name of + // the tool used, for example 'bwa' or 'picard'. + string name = 3; + + // The ID of the program run before this one. + string prev_program_id = 4; + + // The version of the program run. + string version = 5; + } + + // The server-generated read group ID, unique for all read groups. + // Note: This is different than the @RG ID field in the SAM spec. For that + // value, see [name][google.genomics.v1.ReadGroup.name]. + string id = 1; + + // The dataset to which this read group belongs. + string dataset_id = 2; + + // The read group name. This corresponds to the @RG ID field in the SAM spec. + string name = 3; + + // A free-form text description of this read group. + string description = 4; + + // A client-supplied sample identifier for the reads in this read group. + string sample_id = 5; + + // The experiment used to generate this read group. + Experiment experiment = 6; + + // The predicted insert size of this read group. The insert size is the length + // the sequenced DNA fragment from end-to-end, not including the adapters. + int32 predicted_insert_size = 7; + + // The programs used to generate this read group. Programs are always + // identical for all read groups within a read group set. For this reason, + // only the first read group in a returned set will have this field + // populated. + repeated Program programs = 10; + + // The reference set the reads in this read group are aligned to. + string reference_set_id = 11; + + // A map of additional read group information. This must be of the form + // map (string key mapping to a list of string values). + map info = 12; +} diff --git a/google/genomics/v1/readgroupset.proto b/google/genomics/v1/readgroupset.proto new file mode 100644 index 000000000..fdf9585d0 --- /dev/null +++ b/google/genomics/v1/readgroupset.proto @@ -0,0 +1,63 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/genomics/v1/readgroup.proto"; +import "google/protobuf/struct.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "ReadGroupSetProto"; +option java_package = "com.google.genomics.v1"; + + +// A read group set is a logical collection of read groups, which are +// collections of reads produced by a sequencer. A read group set typically +// models reads corresponding to one sample, sequenced one way, and aligned one +// way. +// +// * A read group set belongs to one dataset. +// * A read group belongs to one read group set. +// * A read belongs to one read group. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +message ReadGroupSet { + // The server-generated read group set ID, unique for all read group sets. + string id = 1; + + // The dataset to which this read group set belongs. + string dataset_id = 2; + + // The reference set to which the reads in this read group set are aligned. + string reference_set_id = 3; + + // The read group set name. By default this will be initialized to the sample + // name of the sequenced data contained in this set. + string name = 4; + + // The filename of the original source file for this read group set, if any. + string filename = 5; + + // The read groups in this set. There are typically 1-10 read groups in a read + // group set. + repeated ReadGroup read_groups = 6; + + // A map of additional read group set information. + map info = 7; +} diff --git a/google/genomics/v1/reads.proto b/google/genomics/v1/reads.proto new file mode 100644 index 000000000..e5e3b7817 --- /dev/null +++ b/google/genomics/v1/reads.proto @@ -0,0 +1,461 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/genomics/v1/range.proto"; +import "google/genomics/v1/readalignment.proto"; +import "google/genomics/v1/readgroupset.proto"; +import "google/longrunning/operations.proto"; +import "google/protobuf/empty.proto"; +import "google/protobuf/field_mask.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "ReadsProto"; +option java_package = "com.google.genomics.v1"; + + +service StreamingReadService { + // Returns a stream of all the reads matching the search request, ordered + // by reference name, position, and ID. + rpc StreamReads(StreamReadsRequest) returns (stream StreamReadsResponse) { + option (google.api.http) = { post: "/v1/reads:stream" body: "*" }; + } +} + +// The Readstore. A data store for DNA sequencing Reads. +// +service ReadServiceV1 { + // Creates read group sets by asynchronously importing the provided + // information. 
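+  //
+  // As an illustration only (the dataset ID and Cloud Storage bucket below are
+  // placeholders, not real resources), a minimal JSON/REST import request
+  // might look like:
+  //
+  //     {
+  //       "datasetId": "example-dataset-id",
+  //       "sourceUris": ["gs://example-bucket/sample1.bam"],
+  //       "partitionStrategy": "PER_FILE_PER_SAMPLE"
+  //     }
+  //
+  // The call returns a long-running operation; once it completes, the IDs of
+  // the created read group sets are reported via ImportReadGroupSetsResponse.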
+ // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // The caller must have WRITE permissions to the dataset. + // + // ## Notes on [BAM](https://samtools.github.io/hts-specs/SAMv1.pdf) import + // + // - Tags will be converted to strings - tag types are not preserved + // - Comments (`@CO`) in the input file header will not be preserved + // - Original header order of references (`@SQ`) will not be preserved + // - Any reverse stranded unmapped reads will be reverse complemented, and + // their qualities (also the "BQ" and "OQ" tags, if any) will be reversed + // - Unmapped reads will be stripped of positional information (reference name + // and position) + rpc ImportReadGroupSets(ImportReadGroupSetsRequest) returns (google.longrunning.Operation) { + option (google.api.http) = { post: "/v1/readgroupsets:import" body: "*" }; + } + + // Exports a read group set to a BAM file in Google Cloud Storage. + // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Note that currently there may be some differences between exported BAM + // files and the original BAM file at the time of import. See + // [ImportReadGroupSets](google.genomics.v1.ReadServiceV1.ImportReadGroupSets) + // for caveats. + rpc ExportReadGroupSet(ExportReadGroupSetRequest) returns (google.longrunning.Operation) { + option (google.api.http) = { post: "/v1/readgroupsets/{read_group_set_id}:export" body: "*" }; + } + + // Searches for read group sets matching the criteria. + // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.searchReadGroupSets](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/readmethods.avdl#L135). + rpc SearchReadGroupSets(SearchReadGroupSetsRequest) returns (SearchReadGroupSetsResponse) { + option (google.api.http) = { post: "/v1/readgroupsets/search" body: "*" }; + } + + // Updates a read group set. + // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // This method supports patch semantics. + rpc UpdateReadGroupSet(UpdateReadGroupSetRequest) returns (ReadGroupSet) { + option (google.api.http) = { patch: "/v1/readgroupsets/{read_group_set_id}" body: "read_group_set" }; + } + + // Deletes a read group set. + // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc DeleteReadGroupSet(DeleteReadGroupSetRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/readgroupsets/{read_group_set_id}" }; + } + + // Gets a read group set by ID. 
+ // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc GetReadGroupSet(GetReadGroupSetRequest) returns (ReadGroupSet) { + option (google.api.http) = { get: "/v1/readgroupsets/{read_group_set_id}" }; + } + + // Lists fixed-width coverage buckets for a read group set, each of which + // corresponds to a range of a reference sequence. Each bucket summarizes + // coverage information across its corresponding genomic range. + // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Coverage is defined as the number of reads which are aligned to a given + // base in the reference sequence. Coverage buckets are available at several + // precomputed bucket widths, enabling retrieval of various coverage 'zoom + // levels'. The caller must have READ permissions for the target read group + // set. + rpc ListCoverageBuckets(ListCoverageBucketsRequest) returns (ListCoverageBucketsResponse) { + option (google.api.http) = { get: "/v1/readgroupsets/{read_group_set_id}/coveragebuckets" }; + } + + // Gets a list of reads for one or more read group sets. + // + // For the definitions of read group sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Reads search operates over a genomic coordinate space of reference sequence + // & position defined over the reference sequences to which the requested + // read group sets are aligned. + // + // If a target positional range is specified, search returns all reads whose + // alignment to the reference genome overlaps the range. A query which + // specifies only read group set IDs yields all reads in those read group + // sets, including unmapped reads. + // + // All reads returned (including reads on subsequent pages) are ordered by + // genomic coordinate (by reference sequence, then position). Reads with + // equivalent genomic coordinates are returned in an unspecified order. This + // order is consistent, such that two queries for the same content (regardless + // of page size) yield reads in the same order across their respective streams + // of paginated responses. + // + // Implements + // [GlobalAllianceApi.searchReads](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/readmethods.avdl#L85). + rpc SearchReads(SearchReadsRequest) returns (SearchReadsResponse) { + option (google.api.http) = { post: "/v1/reads/search" body: "*" }; + } +} + +// The read group set search request. +message SearchReadGroupSetsRequest { + // Restricts this query to read group sets within the given datasets. At least + // one ID must be provided. + repeated string dataset_ids = 1; + + // Only return read group sets for which a substring of the name matches this + // string. + string name = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 2; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 256. The maximum value is 1024. + int32 page_size = 4; +} + +// The read group set search response.
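+//
+// For example (illustrative values only; the IDs, name, and token are
+// placeholders), a response of the form
+//
+//     {
+//       "readGroupSets": [ { "id": "example-read-group-set-id", "name": "HG00096" } ],
+//       "nextPageToken": "example-continuation-token"
+//     }
+//
+// indicates that more results remain; passing the token back as `pageToken`
+// in a follow-up SearchReadGroupSets request returns the next page.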
+message SearchReadGroupSetsResponse { + // The list of matching read group sets. + repeated ReadGroupSet read_group_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// The read group set import request. +message ImportReadGroupSetsRequest { + enum PartitionStrategy { + PARTITION_STRATEGY_UNSPECIFIED = 0; + + // In most cases, this strategy yields one read group set per file. This is + // the default behavior. + // + // Allocate one read group set per file per sample. For BAM files, read + // groups are considered to share a sample if they have identical sample + // names. Furthermore, all reads for each file which do not belong to a read + // group, if any, will be grouped into a single read group set per-file. + PER_FILE_PER_SAMPLE = 1; + + // Includes all read groups in all imported files into a single read group + // set. Requires that the headers for all imported files are equivalent. All + // reads which do not belong to a read group, if any, will be grouped into a + // separate read group set. + MERGE_ALL = 2; + } + + // Required. The ID of the dataset these read group sets will belong to. The + // caller must have WRITE permissions to this dataset. + string dataset_id = 1; + + // The reference set to which the imported read group sets are aligned to, if + // any. The reference names of this reference set must be a superset of those + // found in the imported file headers. If no reference set id is provided, a + // best effort is made to associate with a matching reference set. + string reference_set_id = 4; + + // A list of URIs pointing at [BAM + // files](https://samtools.github.io/hts-specs/SAMv1.pdf) + // in Google Cloud Storage. + repeated string source_uris = 2; + + // The partition strategy describes how read groups are partitioned into read + // group sets. + PartitionStrategy partition_strategy = 5; +} + +// The read group set import response. +message ImportReadGroupSetsResponse { + // IDs of the read group sets that were created. + repeated string read_group_set_ids = 1; +} + +// The read group set export request. +message ExportReadGroupSetRequest { + // Required. The Google Developers Console project ID that owns this + // export. The caller must have WRITE access to this project. + string project_id = 1; + + // Required. A Google Cloud Storage URI for the exported BAM file. + // The currently authenticated user must have write access to the new file. + // An error will be returned if the URI already contains data. + string export_uri = 2; + + // Required. The ID of the read group set to export. The caller must have + // READ access to this read group set. + string read_group_set_id = 3; + + // The reference names to export. If this is not specified, all reference + // sequences, including unmapped reads, are exported. + // Use `*` to export only unmapped reads. + repeated string reference_names = 4; +} + +message UpdateReadGroupSetRequest { + // The ID of the read group set to be updated. The caller must have WRITE + // permissions to the dataset associated with this read group set. + string read_group_set_id = 1; + + // The new read group set data. See `updateMask` for details on mutability of + // fields. + ReadGroupSet read_group_set = 2; + + // An optional mask specifying which fields to update. 
Supported fields: + // + // * [name][google.genomics.v1.ReadGroupSet.name]. + // * [referenceSetId][google.genomics.v1.ReadGroupSet.reference_set_id]. + // + // Leaving `updateMask` unset is equivalent to specifying all mutable + // fields. + google.protobuf.FieldMask update_mask = 3; +} + +message DeleteReadGroupSetRequest { + // The ID of the read group set to be deleted. The caller must have WRITE + // permissions to the dataset associated with this read group set. + string read_group_set_id = 1; +} + +message GetReadGroupSetRequest { + // The ID of the read group set. + string read_group_set_id = 1; +} + +message ListCoverageBucketsRequest { + // Required. The ID of the read group set over which coverage is requested. + string read_group_set_id = 1; + + // The name of the reference to query, within the reference set associated + // with this query. Optional. + string reference_name = 3; + + // The start position of the range on the reference, 0-based inclusive. If + // specified, `referenceName` must also be specified. Defaults to 0. + int64 start = 4; + + // The end position of the range on the reference, 0-based exclusive. If + // specified, `referenceName` must also be specified. If unset or 0, defaults + // to the length of the reference. + int64 end = 5; + + // The desired width of each reported coverage bucket in base pairs. This + // will be rounded down to the nearest precomputed bucket width; the value + // of which is returned as `bucketWidth` in the response. Defaults + // to infinity (each bucket spans an entire reference sequence) or the length + // of the target range, if specified. The smallest precomputed + // `bucketWidth` is currently 2048 base pairs; this is subject to + // change. + int64 target_bucket_width = 6; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 7; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 1024. The maximum value is 2048. + int32 page_size = 8; +} + +// A bucket over which read coverage has been precomputed. A bucket corresponds +// to a specific range of the reference sequence. +message CoverageBucket { + // The genomic coordinate range spanned by this bucket. + Range range = 1; + + // The average number of reads which are aligned to each individual + // reference base in this bucket. + float mean_coverage = 2; +} + +message ListCoverageBucketsResponse { + // The length of each coverage bucket in base pairs. Note that buckets at the + // end of a reference sequence may be shorter. This value is omitted if the + // bucket width is infinity (the default behaviour, with no range or + // `targetBucketWidth`). + int64 bucket_width = 1; + + // The coverage buckets. The list of buckets is sparse; a bucket with 0 + // overlapping reads is not returned. A bucket never crosses more than one + // reference sequence. Each bucket has width `bucketWidth`, unless + // its end is the end of the reference sequence. + repeated CoverageBucket coverage_buckets = 2; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 3; +} + +// The read search request. 
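+//
+// As a sketch only (the read group set ID is a placeholder, not a real
+// resource), a JSON/REST query for reads overlapping a small window of
+// reference `chr20` might look like:
+//
+//     {
+//       "readGroupSetIds": ["example-read-group-set-id"],
+//       "referenceName": "chr20",
+//       "start": "100000",
+//       "end": "100100",
+//       "pageSize": 256
+//     }
+//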
+message SearchReadsRequest { + // The IDs of the read groups sets within which to search for reads. All + // specified read group sets must be aligned against a common set of reference + // sequences; this defines the genomic coordinates for the query. Must specify + // one of `readGroupSetIds` or `readGroupIds`. + repeated string read_group_set_ids = 1; + + // The IDs of the read groups within which to search for reads. All specified + // read groups must belong to the same read group sets. Must specify one of + // `readGroupSetIds` or `readGroupIds`. + repeated string read_group_ids = 5; + + // The reference sequence name, for example `chr1`, `1`, or `chrX`. If set to + // `*`, only unmapped reads are returned. If unspecified, all reads (mapped + // and unmapped) are returned. + string reference_name = 7; + + // The start position of the range on the reference, 0-based inclusive. If + // specified, `referenceName` must also be specified. + int64 start = 8; + + // The end position of the range on the reference, 0-based exclusive. If + // specified, `referenceName` must also be specified. + int64 end = 9; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 3; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 256. The maximum value is 2048. + int32 page_size = 4; +} + +// The read search response. +message SearchReadsResponse { + // The list of matching alignments sorted by mapped genomic coordinate, + // if any, ascending in position within the same reference. Unmapped reads, + // which have no position, are returned contiguously and are sorted in + // ascending lexicographic order by fragment name. + repeated Read alignments = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// The stream reads request. +message StreamReadsRequest { + // The Google Developers Console project ID or number which will be billed + // for this access. The caller must have WRITE access to this project. + // Required. + string project_id = 1; + + // The ID of the read group set from which to stream reads. + string read_group_set_id = 2; + + // The reference sequence name, for example `chr1`, + // `1`, or `chrX`. If set to *, only unmapped reads are + // returned. + string reference_name = 3; + + // The start position of the range on the reference, 0-based inclusive. If + // specified, `referenceName` must also be specified. + int64 start = 4; + + // The end position of the range on the reference, 0-based exclusive. If + // specified, `referenceName` must also be specified. + int64 end = 5; + + // Restricts results to a shard containing approximately `1/totalShards` + // of the normal response payload for this query. Results from a sharded + // request are disjoint from those returned by all queries which differ only + // in their shard parameter. A shard may yield 0 results; this is especially + // likely for large values of `totalShards`. + // + // Valid values are `[0, totalShards)`. + int32 shard = 6; + + // Specifying `totalShards` causes a disjoint subset of the normal response + // payload to be returned for each query with a unique `shard` parameter + // specified. 
A best effort is made to yield equally sized shards. Sharding + // can be used to distribute processing amongst workers, where each worker is + // assigned a unique `shard` number and all workers specify the same + // `totalShards` number. The union of reads returned for all sharded queries + // `[0, totalShards)` is equal to those returned by a single unsharded query. + // + // Queries for different values of `totalShards` with common divisors will + // share shard boundaries. For example, streaming `shard` 2 of 5 + // `totalShards` yields the same results as streaming `shard`s 4 and 5 of 10 + // `totalShards`. This property can be leveraged for adaptive retries. + int32 total_shards = 7; +} + +message StreamReadsResponse { + repeated Read alignments = 1; +} diff --git a/google/genomics/v1/references.proto b/google/genomics/v1/references.proto new file mode 100644 index 000000000..e245e7744 --- /dev/null +++ b/google/genomics/v1/references.proto @@ -0,0 +1,281 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "ReferencesProto"; +option java_package = "com.google.genomics.v1"; + + +service ReferenceServiceV1 { + // Searches for reference sets which match the given criteria. + // + // For the definitions of references and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.searchReferenceSets](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/referencemethods.avdl#L71) + rpc SearchReferenceSets(SearchReferenceSetsRequest) returns (SearchReferenceSetsResponse) { + option (google.api.http) = { post: "/v1/referencesets/search" body: "*" }; + } + + // Gets a reference set. + // + // For the definitions of references and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.getReferenceSet](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/referencemethods.avdl#L83). + rpc GetReferenceSet(GetReferenceSetRequest) returns (ReferenceSet) { + option (google.api.http) = { get: "/v1/referencesets/{reference_set_id}" }; + } + + // Searches for references which match the given criteria. + // + // For the definitions of references and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.searchReferences](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/referencemethods.avdl#L146). 
+ rpc SearchReferences(SearchReferencesRequest) returns (SearchReferencesResponse) { + option (google.api.http) = { post: "/v1/references/search" body: "*" }; + } + + // Gets a reference. + // + // For the definitions of references and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.getReference](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/referencemethods.avdl#L158). + rpc GetReference(GetReferenceRequest) returns (Reference) { + option (google.api.http) = { get: "/v1/references/{reference_id}" }; + } + + // Lists the bases in a reference, optionally restricted to a range. + // + // For the definitions of references and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.getReferenceBases](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/referencemethods.avdl#L221). + rpc ListBases(ListBasesRequest) returns (ListBasesResponse) { + option (google.api.http) = { get: "/v1/references/{reference_id}/bases" }; + } +} + +// A reference is a canonical assembled DNA sequence, intended to act as a +// reference coordinate space for other genomic annotations. A single reference +// might represent the human chromosome 1 or mitochondrial DNA, for instance. A +// reference belongs to one or more reference sets. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +message Reference { + // The server-generated reference ID, unique across all references. + string id = 1; + + // The length of this reference's sequence. + int64 length = 2; + + // MD5 of the upper-case sequence excluding all whitespace characters (this + // is equivalent to SQ:M5 in SAM). This value is represented in lower case + // hexadecimal format. + string md5checksum = 3; + + // The name of this reference, for example `22`. + string name = 4; + + // The URI from which the sequence was obtained. Typically specifies a FASTA + // format file. + string source_uri = 5; + + // All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally + // with a version number, for example `GCF_000001405.26`. + repeated string source_accessions = 6; + + // ID from http://www.ncbi.nlm.nih.gov/taxonomy. For example, 9606 for human. + int32 ncbi_taxon_id = 7; +} + +// A reference set is a set of references which typically comprise a reference +// assembly for a species, such as `GRCh38`, which is representative +// of the human genome. A reference set defines a common coordinate space for +// comparing reference-aligned experimental data. A reference set contains 1 or +// more references. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +message ReferenceSet { + // The server-generated reference set ID, unique across all reference sets. + string id = 1; + + // The IDs of the reference objects that are part of this set. + // `Reference.md5checksum` must be unique within this set. + repeated string reference_ids = 2; + + // Order-independent MD5 checksum which identifies this reference set.
The + // checksum is computed by sorting all lower case hexadecimal strings + // `reference.md5checksum` (for all references in this set) in + // ascending lexicographic order, concatenating, and taking the MD5 of that + // value. The resulting value is represented in lower case hexadecimal format. + string md5checksum = 3; + + // ID from http://www.ncbi.nlm.nih.gov/taxonomy (for example, 9606 for human) + // indicating the species which this reference set is intended to model. Note + // that contained references may specify a different `ncbiTaxonId`, as + // assemblies may contain reference sequences which do not belong to the + // modeled species, for example EBV in a human reference genome. + int32 ncbi_taxon_id = 4; + + // Free text description of this reference set. + string description = 5; + + // Public ID of this reference set, such as `GRCh37`. + string assembly_id = 6; + + // The URI from which the references were obtained. + string source_uri = 7; + + // All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally + // with a version number, for example `NC_000001.11`. + repeated string source_accessions = 8; +} + +message SearchReferenceSetsRequest { + // If present, return reference sets for which the + // [md5checksum][google.genomics.v1.ReferenceSet.md5checksum] matches exactly. + repeated string md5checksums = 1; + + // If present, return reference sets for which a prefix of any of + // [sourceAccessions][google.genomics.v1.ReferenceSet.source_accessions] + // match any of these strings. Accession numbers typically have a main number + // and a version, for example `NC_000001.11`. + repeated string accessions = 2; + + // If present, return reference sets for which a substring of their + // `assemblyId` matches this string (case insensitive). + string assembly_id = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 4; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 1024. The maximum value is 4096. + int32 page_size = 5; +} + +message SearchReferenceSetsResponse { + // The matching reference sets. + repeated ReferenceSet reference_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +message GetReferenceSetRequest { + // The ID of the reference set. + string reference_set_id = 1; +} + +message SearchReferencesRequest { + // If present, return references for which the + // [md5checksum][google.genomics.v1.Reference.md5checksum] matches exactly. + repeated string md5checksums = 1; + + // If present, return references for which a prefix of any of + // [sourceAccessions][google.genomics.v1.Reference.source_accessions] match + // any of these strings. Accession numbers typically have a main number and a + // version, for example `GCF_000001405.26`. + repeated string accessions = 2; + + // If present, return only references which belong to this reference set. + string reference_set_id = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response.
+ string page_token = 4; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 1024. The maximum value is 4096. + int32 page_size = 5; +} + +message SearchReferencesResponse { + // The matching references. + repeated Reference references = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +message GetReferenceRequest { + // The ID of the reference. + string reference_id = 1; +} + +message ListBasesRequest { + // The ID of the reference. + string reference_id = 1; + + // The start position (0-based) of this query. Defaults to 0. + int64 start = 2; + + // The end position (0-based, exclusive) of this query. Defaults to the length + // of this reference. + int64 end = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 4; + + // The maximum number of bases to return in a single page. If unspecified, + // defaults to 200Kbp (kilo base pairs). The maximum value is 10Mbp (mega base + // pairs). + int32 page_size = 5; +} + +message ListBasesResponse { + // The offset position (0-based) of the given `sequence` from the + // start of this `Reference`. This value will differ for each page + // in a paginated request. + int64 offset = 1; + + // A substring of the bases that make up this reference. + string sequence = 2; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 3; +} diff --git a/google/genomics/v1/variants.proto b/google/genomics/v1/variants.proto new file mode 100644 index 000000000..1c132770f --- /dev/null +++ b/google/genomics/v1/variants.proto @@ -0,0 +1,903 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1; + +import "google/api/annotations.proto"; +import "google/longrunning/operations.proto"; +import "google/protobuf/empty.proto"; +import "google/protobuf/field_mask.proto"; +import "google/protobuf/struct.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "VariantsProto"; +option java_package = "com.google.genomics.v1"; + + +service StreamingVariantService { + // Returns a stream of all the variants matching the search request, ordered + // by reference name, position, and ID. 
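+  //
+  // As a sketch only (the project and variant set IDs are placeholders, not
+  // real resources; int64 coordinates are serialized as strings in JSON), a
+  // streaming request for a region of reference `1` might look like:
+  //
+  //     {
+  //       "projectId": "example-project",
+  //       "variantSetId": "example-variant-set-id",
+  //       "referenceName": "1",
+  //       "start": "10000",
+  //       "end": "20000"
+  //     }
+  //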
+ rpc StreamVariants(StreamVariantsRequest) returns (stream StreamVariantsResponse) { + option (google.api.http) = { post: "/v1/variants:stream" body: "*" }; + } +} + +service VariantServiceV1 { + // Creates variant data by asynchronously importing the provided information. + // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // The variants for import will be merged with any existing variant that + // matches its reference sequence, start, end, reference bases, and + // alternative bases. If no such variant exists, a new one will be created. + // + // When variants are merged, the call information from the new variant + // is added to the existing variant, and Variant info fields are merged + // as specified in + // [infoMergeConfig][google.genomics.v1.ImportVariantsRequest.info_merge_config]. + // As a special case, for single-sample VCF files, QUAL and FILTER fields will + // be moved to the call level; these are sometimes interpreted in a + // call-specific context. + // Imported VCF headers are appended to the metadata already in a variant set. + rpc ImportVariants(ImportVariantsRequest) returns (google.longrunning.Operation) { + option (google.api.http) = { post: "/v1/variants:import" body: "*" }; + } + + // Creates a new variant set. + // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // The provided variant set must have a valid `datasetId` set - all other + // fields are optional. Note that the `id` field will be ignored, as this is + // assigned by the server. + rpc CreateVariantSet(CreateVariantSetRequest) returns (VariantSet) { + option (google.api.http) = { post: "/v1/variantsets" body: "variant_set" }; + } + + // Exports variant set data to an external destination. + // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc ExportVariantSet(ExportVariantSetRequest) returns (google.longrunning.Operation) { + option (google.api.http) = { post: "/v1/variantsets/{variant_set_id}:export" body: "*" }; + } + + // Gets a variant set by ID. + // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc GetVariantSet(GetVariantSetRequest) returns (VariantSet) { + option (google.api.http) = { get: "/v1/variantsets/{variant_set_id}" }; + } + + // Returns a list of all variant sets matching search criteria. + // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.searchVariantSets](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/variantmethods.avdl#L49). + rpc SearchVariantSets(SearchVariantSetsRequest) returns (SearchVariantSetsResponse) { + option (google.api.http) = { post: "/v1/variantsets/search" body: "*" }; + } + + // Deletes a variant set including all variants, call sets, and calls within. + // This is not reversible. 
+ // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc DeleteVariantSet(DeleteVariantSetRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/variantsets/{variant_set_id}" }; + } + + // Updates a variant set using patch semantics. + // + // For the definitions of variant sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc UpdateVariantSet(UpdateVariantSetRequest) returns (VariantSet) { + option (google.api.http) = { patch: "/v1/variantsets/{variant_set_id}" body: "variant_set" }; + } + + // Gets a list of variants matching the criteria. + // + // For the definitions of variants and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.searchVariants](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/variantmethods.avdl#L126). + rpc SearchVariants(SearchVariantsRequest) returns (SearchVariantsResponse) { + option (google.api.http) = { post: "/v1/variants/search" body: "*" }; + } + + // Creates a new variant. + // + // For the definitions of variants and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc CreateVariant(CreateVariantRequest) returns (Variant) { + option (google.api.http) = { post: "/v1/variants" body: "variant" }; + } + + // Updates a variant. + // + // For the definitions of variants and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // This method supports patch semantics. Returns the modified variant without + // its calls. + rpc UpdateVariant(UpdateVariantRequest) returns (Variant) { + option (google.api.http) = { patch: "/v1/variants/{variant_id}" body: "variant" }; + } + + // Deletes a variant. + // + // For the definitions of variants and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc DeleteVariant(DeleteVariantRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/variants/{variant_id}" }; + } + + // Gets a variant by ID. + // + // For the definitions of variants and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc GetVariant(GetVariantRequest) returns (Variant) { + option (google.api.http) = { get: "/v1/variants/{variant_id}" }; + } + + // Merges the given variants with existing variants. + // + // For the definitions of variants and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Each variant will be + // merged with an existing variant that matches its reference sequence, + // start, end, reference bases, and alternative bases. If no such variant + // exists, a new one will be created. + // + // When variants are merged, the call information from the new variant + // is added to the existing variant. 
Variant info fields are merged as + // specified in the + // [infoMergeConfig][google.genomics.v1.MergeVariantsRequest.info_merge_config] + // field of the MergeVariantsRequest. + // + // Please exercise caution when using this method! It is easy to introduce + // mistakes in existing variants and difficult to back out of them. For + // example, + // suppose you were trying to merge a new variant with an existing one and + // both + // variants contain calls that belong to callsets with the same callset ID. + // + // // Existing variant - irrelevant fields trimmed for clarity + // { + // "variantSetId": "10473108253681171589", + // "referenceName": "1", + // "start": "10582", + // "referenceBases": "G", + // "alternateBases": [ + // "A" + // ], + // "calls": [ + // { + // "callSetId": "10473108253681171589-0", + // "callSetName": "CALLSET0", + // "genotype": [ + // 0, + // 1 + // ], + // } + // ] + // } + // + // // New variant with conflicting call information + // { + // "variantSetId": "10473108253681171589", + // "referenceName": "1", + // "start": "10582", + // "referenceBases": "G", + // "alternateBases": [ + // "A" + // ], + // "calls": [ + // { + // "callSetId": "10473108253681171589-0", + // "callSetName": "CALLSET0", + // "genotype": [ + // 1, + // 1 + // ], + // } + // ] + // } + // + // The resulting merged variant would overwrite the existing calls with those + // from the new variant: + // + // { + // "variantSetId": "10473108253681171589", + // "referenceName": "1", + // "start": "10582", + // "referenceBases": "G", + // "alternateBases": [ + // "A" + // ], + // "calls": [ + // { + // "callSetId": "10473108253681171589-0", + // "callSetName": "CALLSET0", + // "genotype": [ + // 1, + // 1 + // ], + // } + // ] + // } + // + // This may be the desired outcome, but it is up to the user to determine if + // if that is indeed the case. + rpc MergeVariants(MergeVariantsRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { post: "/v1/variants:merge" body: "*" }; + } + + // Gets a list of call sets matching the criteria. + // + // For the definitions of call sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // Implements + // [GlobalAllianceApi.searchCallSets](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/variantmethods.avdl#L178). + rpc SearchCallSets(SearchCallSetsRequest) returns (SearchCallSetsResponse) { + option (google.api.http) = { post: "/v1/callsets/search" body: "*" }; + } + + // Creates a new call set. + // + // For the definitions of call sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc CreateCallSet(CreateCallSetRequest) returns (CallSet) { + option (google.api.http) = { post: "/v1/callsets" body: "call_set" }; + } + + // Updates a call set. + // + // For the definitions of call sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + // + // This method supports patch semantics. + rpc UpdateCallSet(UpdateCallSetRequest) returns (CallSet) { + option (google.api.http) = { patch: "/v1/callsets/{call_set_id}" body: "call_set" }; + } + + // Deletes a call set. 
+ // + // For the definitions of call sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc DeleteCallSet(DeleteCallSetRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1/callsets/{call_set_id}" }; + } + + // Gets a call set by ID. + // + // For the definitions of call sets and other genomics resources, see + // [Fundamentals of Google + // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) + rpc GetCallSet(GetCallSetRequest) returns (CallSet) { + option (google.api.http) = { get: "/v1/callsets/{call_set_id}" }; + } +} + +// Metadata describes a single piece of variant call metadata. +// These data include a top level key and either a single value string (value) +// or a list of key-value pairs (info.) +// Value and info are mutually exclusive. +message VariantSetMetadata { + enum Type { + TYPE_UNSPECIFIED = 0; + + INTEGER = 1; + + FLOAT = 2; + + FLAG = 3; + + CHARACTER = 4; + + STRING = 5; + } + + // The top-level key. + string key = 1; + + // The value field for simple metadata + string value = 2; + + // User-provided ID field, not enforced by this API. + // Two or more pieces of structured metadata with identical + // id and key fields are considered equivalent. + string id = 4; + + // The type of data. Possible types include: Integer, Float, + // Flag, Character, and String. + Type type = 5; + + // The number of values that can be included in a field described by this + // metadata. + string number = 8; + + // A textual description of this metadata. + string description = 7; + + // Remaining structured metadata key-value pairs. This must be of the form + // map (string key mapping to a list of string values). + map info = 3; +} + +// A variant set is a collection of call sets and variants. It contains summary +// statistics of those contents. A variant set belongs to a dataset. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +message VariantSet { + // The dataset to which this variant set belongs. + string dataset_id = 1; + + // The server-generated variant set ID, unique across all variant sets. + string id = 2; + + // The reference set to which the variant set is mapped. The reference set + // describes the alignment provenance of the variant set, while the + // `referenceBounds` describe the shape of the actual variant data. The + // reference set's reference names are a superset of those found in the + // `referenceBounds`. + // + // For example, given a variant set that is mapped to the GRCh38 reference set + // and contains a single variant on reference 'X', `referenceBounds` would + // contain only an entry for 'X', while the associated reference set + // enumerates all possible references: '1', '2', 'X', 'Y', 'MT', etc. + string reference_set_id = 6; + + // A list of all references used by the variants in a variant set + // with associated coordinate upper bounds for each one. + repeated ReferenceBound reference_bounds = 5; + + // The metadata associated with this variant set. + repeated VariantSetMetadata metadata = 4; + + // User-specified, mutable name. + string name = 7; + + // A textual description of this variant set. + string description = 8; +} + +// A variant represents a change in DNA sequence relative to a reference +// sequence. For example, a variant could represent a SNP or an insertion. 
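+//
+// For example (illustrative values only, in the JSON/REST representation,
+// where int64 coordinates are serialized as strings), a SNP replacing `T`
+// with `C` could be represented as:
+//
+//     {
+//       "variantSetId": "example-variant-set-id",
+//       "referenceName": "1",
+//       "start": "99999",
+//       "end": "100000",
+//       "referenceBases": "T",
+//       "alternateBases": ["C"]
+//     }
+//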
+// Variants belong to a variant set. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +// +// Each of the calls on a variant represent a determination of genotype with +// respect to that variant. For example, a call might assign probability of 0.32 +// to the occurrence of a SNP named rs1234 in a sample named NA12345. A call +// belongs to a call set, which contains related calls typically from one +// sample. +message Variant { + // The ID of the variant set this variant belongs to. + string variant_set_id = 15; + + // The server-generated variant ID, unique across all variants. + string id = 2; + + // Names for the variant, for example a RefSNP ID. + repeated string names = 3; + + // The date this variant was created, in milliseconds from the epoch. + int64 created = 12; + + // The reference on which this variant occurs. + // (such as `chr20` or `X`) + string reference_name = 14; + + // The position at which this variant occurs (0-based). + // This corresponds to the first base of the string of reference bases. + int64 start = 16; + + // The end position (0-based) of this variant. This corresponds to the first + // base after the last base in the reference allele. So, the length of + // the reference allele is (end - start). This is useful for variants + // that don't explicitly give alternate bases, for example large deletions. + int64 end = 13; + + // The reference bases for this variant. They start at the given + // position. + string reference_bases = 6; + + // The bases that appear instead of the reference bases. + repeated string alternate_bases = 7; + + // A measure of how likely this variant is to be real. + // A higher value is better. + double quality = 8; + + // A list of filters (normally quality filters) this variant has failed. + // `PASS` indicates this variant has passed all filters. + repeated string filter = 9; + + // A map of additional variant information. This must be of the form + // map (string key mapping to a list of string values). + map info = 10; + + // The variant calls for this particular variant. Each one represents the + // determination of genotype with respect to this variant. + repeated VariantCall calls = 11; +} + +// A call represents the determination of genotype with respect to a particular +// variant. It may include associated information such as quality and phasing. +// For example, a call might assign a probability of 0.32 to the occurrence of +// a SNP named rs1234 in a call set with the name NA12345. +message VariantCall { + // The ID of the call set this variant call belongs to. + string call_set_id = 8; + + // The name of the call set this variant call belongs to. + string call_set_name = 9; + + // The genotype of this variant call. Each value represents either the value + // of the `referenceBases` field or a 1-based index into + // `alternateBases`. If a variant had a `referenceBases` + // value of `T` and an `alternateBases` + // value of `["A", "C"]`, and the `genotype` was + // `[2, 1]`, that would mean the call + // represented the heterozygous value `CA` for this variant. + // If the `genotype` was instead `[0, 1]`, the + // represented value would be `TA`. Ordering of the + // genotype values is important if the `phaseset` is present. + // If a genotype is not called (that is, a `.` is present in the + // GT string) -1 is returned. 
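+  //
+  // As a worked example of the encoding above: with `referenceBases` of `T`
+  // and `alternateBases` of `["A", "C"]`, a VCF GT of `0/2` is stored as
+  // `genotype: [0, 2]` (the unphased heterozygous value `TC`), and a VCF GT
+  // of `./1` is stored as `genotype: [-1, 1]`.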
+ repeated int32 genotype = 7; + + // If this field is present, this variant call's genotype ordering implies + // the phase of the bases and is consistent with any other variant calls in + // the same reference sequence which have the same phaseset value. + // When importing data from VCF, if the genotype data was phased but no + // phase set was specified, this field will be set to `*`. + string phaseset = 5; + + // The genotype likelihoods for this variant call. Each array entry + // represents how likely a specific genotype is for this call. The value + // ordering is defined by the GL tag in the VCF spec. + // If Phred-scaled genotype likelihood scores (PL) are available and + // log10(P) genotype likelihood scores (GL) are not, PL scores are converted + // to GL scores. If both are available, PL scores are stored in `info`. + repeated double genotype_likelihood = 6; + + // A map of additional variant call information. This must be of the form + // map<string, string[]> (string key mapping to a list of string values). + map<string, google.protobuf.ListValue> info = 2; +} + +// A call set is a collection of variant calls, typically for one sample. It +// belongs to a variant set. +// +// For more genomics resource definitions, see [Fundamentals of Google +// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics) +message CallSet { + // The server-generated call set ID, unique across all call sets. + string id = 1; + + // The call set name. + string name = 2; + + // The sample ID this call set corresponds to. + string sample_id = 7; + + // The IDs of the variant sets this call set belongs to. This field must + // have exactly length one, as a call set belongs to a single variant set. + // This field is repeated for compatibility with the + // [GA4GH 0.5.1 + // API](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/variants.avdl#L76). + repeated string variant_set_ids = 6; + + // The date this call set was created in milliseconds from the epoch. + int64 created = 5; + + // A map of additional call set information. This must be of the form + // map<string, string[]> (string key mapping to a list of string values). + map<string, google.protobuf.ListValue> info = 4; +} + +// ReferenceBound records an upper bound for the starting coordinate of +// variants in a particular reference. +message ReferenceBound { + // The name of the reference associated with this reference bound. + string reference_name = 1; + + // An upper bound (inclusive) on the starting coordinate of any + // variant in the reference sequence. + int64 upper_bound = 2; +} + +// The variant data import request. +message ImportVariantsRequest { + enum Format { + FORMAT_UNSPECIFIED = 0; + + // VCF (Variant Call Format). The VCF files should be uncompressed. gVCF is + // also supported. + FORMAT_VCF = 1; + + // Complete Genomics masterVarBeta format. The masterVarBeta files should + // be bzip2 compressed. + FORMAT_COMPLETE_GENOMICS = 2; + } + + // Required. The variant set to which variant data should be imported. + string variant_set_id = 1; + + // A list of URIs referencing variant files in Google Cloud Storage. URIs can + // include wildcards [as described + // here](https://cloud.google.com/storage/docs/gsutil/addlhelp/WildcardNames). + // Note that recursive wildcards ('**') are not supported. + repeated string source_uris = 2; + + // The format of the variant data being imported. If unspecified, defaults + // to `VCF`. + Format format = 3; + + // Convert reference names to the canonical representation.
+ // hg19 haplotypes (those reference names containing "_hap") + // are not modified in any way. + // All other reference names are modified according to the following rules: + // The reference name is capitalized. + // The "chr" prefix is dropped for all autosomes and sex chromosomes. + // For example "chr17" becomes "17" and "chrX" becomes "X". + // All mitochondrial chromosomes ("chrM", "chrMT", etc.) become "MT". + bool normalize_reference_names = 5; + + // A mapping between info field keys and the InfoMergeOperations to + // be performed on them. This is plumbed down to the MergeVariantRequests + // generated by the resulting import job. + map<string, InfoMergeOperation> info_merge_config = 6; +} + +// The variant data import response. +message ImportVariantsResponse { + // IDs of the call sets created during the import. + repeated string call_set_ids = 1; +} + +// The CreateVariantSet request. +message CreateVariantSetRequest { + // Required. The variant set to be created. Must have a valid `datasetId`. + VariantSet variant_set = 1; +} + +// The variant data export request. +message ExportVariantSetRequest { + enum Format { + FORMAT_UNSPECIFIED = 0; + + // Export the data to Google BigQuery. + FORMAT_BIGQUERY = 1; + } + + // Required. The ID of the variant set that contains variant data which + // should be exported. The caller must have READ access to this variant set. + string variant_set_id = 1; + + // If provided, only variant call information from the specified call sets + // will be exported. By default all variant calls are exported. + repeated string call_set_ids = 2; + + // Required. The Google Cloud project ID that owns the destination + // BigQuery dataset. The caller must have WRITE access to this project. This + // project will also own the resulting export job. + string project_id = 3; + + // The format for the exported data. + Format format = 4; + + // Required. The BigQuery dataset to export data to. This dataset must already + // exist. Note that this is distinct from the Genomics concept of "dataset". + string bigquery_dataset = 5; + + // Required. The BigQuery table to export data to. + // If the table doesn't exist, it will be created. If it already exists, it + // will be overwritten. + string bigquery_table = 6; +} + +// The variant set request. +message GetVariantSetRequest { + // Required. The ID of the variant set. + string variant_set_id = 1; +} + +// The search variant sets request. +message SearchVariantSetsRequest { + // Exactly one dataset ID must be provided here. Only variant sets which + // belong to this dataset will be returned. + repeated string dataset_ids = 1; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `nextPageToken` from the previous response. + string page_token = 2; + + // The maximum number of results to return in a single page. If unspecified, + // defaults to 1024. + int32 page_size = 3; +} + +// The search variant sets response. +message SearchVariantSetsResponse { + // The variant sets belonging to the requested dataset. + repeated VariantSet variant_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// The delete variant set request. +message DeleteVariantSetRequest { + // The ID of the variant set to be deleted.
+// The delete variant set request.
+message DeleteVariantSetRequest {
+  // The ID of the variant set to be deleted.
+  string variant_set_id = 1;
+}
+
+message UpdateVariantSetRequest {
+  // The ID of the variant set to be updated (must already exist).
+  string variant_set_id = 1;
+
+  // The new variant data. Only the variant_set.metadata will be considered
+  // for update.
+  VariantSet variant_set = 2;
+
+  // An optional mask specifying which fields to update. Supported fields:
+  //
+  // * [metadata][google.genomics.v1.VariantSet.metadata].
+  // * [name][google.genomics.v1.VariantSet.name].
+  // * [description][google.genomics.v1.VariantSet.description].
+  //
+  // Leaving `updateMask` unset is equivalent to specifying all mutable
+  // fields.
+  google.protobuf.FieldMask update_mask = 5;
+}
+
+// The variant search request.
+message SearchVariantsRequest {
+  // At most one variant set ID must be provided. Only variants from this
+  // variant set will be returned. If omitted, a call set ID must be included
+  // in the request.
+  repeated string variant_set_ids = 1;
+
+  // Only return variants which have exactly this name.
+  string variant_name = 2;
+
+  // Only return variant calls which belong to call sets with these IDs.
+  // Leaving this blank returns all variant calls. If a variant has no
+  // calls belonging to any of these call sets, it won't be returned at all.
+  // Currently, variants with no calls from any call set will never be returned.
+  repeated string call_set_ids = 3;
+
+  // Required. Only return variants in this reference sequence.
+  string reference_name = 4;
+
+  // The beginning of the window (0-based, inclusive) for which
+  // overlapping variants should be returned. If unspecified, defaults to 0.
+  int64 start = 5;
+
+  // The end of the window, 0-based exclusive. If unspecified or 0, defaults to
+  // the length of the reference.
+  int64 end = 6;
+
+  // The continuation token, which is used to page through large result sets.
+  // To get the next page of results, set this parameter to the value of
+  // `nextPageToken` from the previous response.
+  string page_token = 7;
+
+  // The maximum number of variants to return in a single page. If unspecified,
+  // defaults to 5000. The maximum value is 10000.
+  int32 page_size = 8;
+
+  // The maximum number of calls to return in a single page. Note that this
+  // limit may be exceeded in the event that a matching variant contains more
+  // calls than the requested maximum. If unspecified, defaults to 5000. The
+  // maximum value is 10000.
+  int32 max_calls = 9;
+}
+
+// The variant search response.
+message SearchVariantsResponse {
+  // The list of matching Variants.
+  repeated Variant variants = 1;
+
+  // The continuation token, which is used to page through large result sets.
+  // Provide this value in a subsequent request to return the next page of
+  // results. This field will be empty if there aren't any additional results.
+  string next_page_token = 2;
+}
+
+message CreateVariantRequest {
+  // The variant to be created.
+  Variant variant = 1;
+}
+
+message UpdateVariantRequest {
+  // The ID of the variant to be updated.
+  string variant_id = 1;
+
+  // The new variant data.
+  Variant variant = 2;
+
+  // An optional mask specifying which fields to update. At this time, mutable
+  // fields are [names][google.genomics.v1.Variant.names] and
+  // [info][google.genomics.v1.Variant.info]. Acceptable values are "names" and
+  // "info". If unspecified, all mutable fields will be updated.
+  google.protobuf.FieldMask update_mask = 3;
+}
+
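+// For illustration only, not part of the API definition: a hypothetical
+// SearchVariantsRequest body in the JSON/REST mapping, querying a 0-based,
+// half-open window on reference "17" (IDs and coordinates are made up).
+// Each response carries a `nextPageToken`; passing it back as `pageToken`
+// in the next request returns the following page, until the token comes
+// back empty.
+//
+// ```
+// {
+//   "variantSetIds": ["VARIANT_SET_ID"],
+//   "referenceName": "17",
+//   "start": 41196311,
+//   "end": 41277500,
+//   "pageSize": 5000
+// }
+// ```
+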
+message DeleteVariantRequest {
+  // The ID of the variant to be deleted.
+  string variant_id = 1;
+}
+
+message GetVariantRequest {
+  // The ID of the variant.
+  string variant_id = 1;
+}
+
+message MergeVariantsRequest {
+  // The destination variant set.
+  string variant_set_id = 1;
+
+  // The variants to be merged with existing variants.
+  repeated Variant variants = 2;
+
+  // A mapping between info field keys and the InfoMergeOperations to
+  // be performed on them.
+  map<string, InfoMergeOperation> info_merge_config = 3;
+}
+
+// The call set search request.
+message SearchCallSetsRequest {
+  // Restrict the query to call sets within the given variant sets. At least one
+  // ID must be provided.
+  repeated string variant_set_ids = 1;
+
+  // Only return call sets for which a substring of the name matches this
+  // string.
+  string name = 2;
+
+  // The continuation token, which is used to page through large result sets.
+  // To get the next page of results, set this parameter to the value of
+  // `nextPageToken` from the previous response.
+  string page_token = 3;
+
+  // The maximum number of results to return in a single page. If unspecified,
+  // defaults to 1024.
+  int32 page_size = 4;
+}
+
+// The call set search response.
+message SearchCallSetsResponse {
+  // The list of matching call sets.
+  repeated CallSet call_sets = 1;
+
+  // The continuation token, which is used to page through large result sets.
+  // Provide this value in a subsequent request to return the next page of
+  // results. This field will be empty if there aren't any additional results.
+  string next_page_token = 2;
+}
+
+message CreateCallSetRequest {
+  // The call set to be created.
+  CallSet call_set = 1;
+}
+
+message UpdateCallSetRequest {
+  // The ID of the call set to be updated.
+  string call_set_id = 1;
+
+  // The new call set data.
+  CallSet call_set = 2;
+
+  // An optional mask specifying which fields to update. At this time, the only
+  // mutable field is [name][google.genomics.v1.CallSet.name]. The only
+  // acceptable value is "name". If unspecified, all mutable fields will be
+  // updated.
+  google.protobuf.FieldMask update_mask = 3;
+}
+
+message DeleteCallSetRequest {
+  // The ID of the call set to be deleted.
+  string call_set_id = 1;
+}
+
+message GetCallSetRequest {
+  // The ID of the call set.
+  string call_set_id = 1;
+}
+
+// The stream variants request.
+message StreamVariantsRequest {
+  // The Google Developers Console project ID or number which will be billed
+  // for this access. The caller must have WRITE access to this project.
+  // Required.
+  string project_id = 1;
+
+  // The variant set ID from which to stream variants.
+  string variant_set_id = 2;
+
+  // Only return variant calls which belong to call sets with these IDs.
+  // Leaving this blank returns all variant calls.
+  repeated string call_set_ids = 3;
+
+  // Required. Only return variants in this reference sequence.
+  string reference_name = 4;
+
+  // The beginning of the window (0-based, inclusive) for which
+  // overlapping variants should be returned.
+  int64 start = 5;
+
+  // The end of the window (0-based, exclusive) for which overlapping
+  // variants should be returned.
+  int64 end = 6;
+}
+
+message StreamVariantsResponse {
+  repeated Variant variants = 1;
+}
+
+// Operations to be performed during import on Variant info fields.
+// These operations are set for each info field in the info_merge_config
+// map of ImportVariantsRequest, which is plumbed down to the
+// MergeVariantsRequests generated by the import job.
+enum InfoMergeOperation { + INFO_MERGE_OPERATION_UNSPECIFIED = 0; + + // By default, Variant info fields are persisted if the Variant doesn't + // already exist in the variantset. If the Variant is equivalent to a + // Variant already in the variantset, the incoming Variant's info field + // is ignored in favor of that of the already persisted Variant. + IGNORE_NEW = 1; + + // This operation removes an info field from the incoming Variant + // and persists this info field in each of the incoming Variant's Calls. + MOVE_TO_CALLS = 2; +} diff --git a/google/genomics/v1alpha2/pipelines.proto b/google/genomics/v1alpha2/pipelines.proto new file mode 100644 index 000000000..46d6d8d84 --- /dev/null +++ b/google/genomics/v1alpha2/pipelines.proto @@ -0,0 +1,586 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package google.genomics.v1alpha2; + +import "google/api/annotations.proto"; +import "google/longrunning/operations.proto"; +import "google/protobuf/empty.proto"; +import "google/protobuf/timestamp.proto"; +import "google/rpc/code.proto"; + +option cc_enable_arenas = true; +option java_multiple_files = true; +option java_outer_classname = "PipelinesProto"; +option java_package = "com.google.genomics.v1a"; + + +// A service for running genomics pipelines. +service PipelinesV1Alpha2 { + // Creates a pipeline that can be run later. Create takes a Pipeline that + // has all fields other than `pipelineId` populated, and then returns + // the same pipeline with `pipelineId` populated. This id can be used + // to run the pipeline. + // + // Caller must have WRITE permission to the project. + rpc CreatePipeline(CreatePipelineRequest) returns (Pipeline) { + option (google.api.http) = { post: "/v1alpha2/pipelines" body: "pipeline" }; + } + + // Runs a pipeline. If `pipelineId` is specified in the request, then + // run a saved pipeline. If `ephemeralPipeline` is specified, then run + // that pipeline once without saving a copy. + // + // The caller must have READ permission to the project where the pipeline + // is stored and WRITE permission to the project where the pipeline will be + // run, as VMs will be created and storage will be used. + rpc RunPipeline(RunPipelineRequest) returns (google.longrunning.Operation) { + option (google.api.http) = { post: "/v1alpha2/pipelines:run" body: "*" }; + } + + // Retrieves a pipeline based on ID. + // + // Caller must have READ permission to the project. + rpc GetPipeline(GetPipelineRequest) returns (Pipeline) { + option (google.api.http) = { get: "/v1alpha2/pipelines/{pipeline_id}" }; + } + + // Lists pipelines. + // + // Caller must have READ permission to the project. + rpc ListPipelines(ListPipelinesRequest) returns (ListPipelinesResponse) { + option (google.api.http) = { get: "/v1alpha2/pipelines" }; + } + + // Deletes a pipeline based on ID. + // + // Caller must have WRITE permission to the project. 
+ rpc DeletePipeline(DeletePipelineRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { delete: "/v1alpha2/pipelines/{pipeline_id}" }; + } + + // Gets controller configuration information. Should only be called + // by VMs created by the Pipelines Service and not by end users. + rpc GetControllerConfig(GetControllerConfigRequest) returns (ControllerConfig) { + option (google.api.http) = { get: "/v1alpha2/pipelines:getControllerConfig" }; + } + + // Sets status of a given operation. All timestamps are sent on each + // call, and the whole series of events is replaced, in case + // intermediate calls are lost. Should only be called by VMs created + // by the Pipelines Service and not by end users. + rpc SetOperationStatus(SetOperationStatusRequest) returns (google.protobuf.Empty) { + option (google.api.http) = { put: "/v1alpha2/pipelines:setOperationStatus" body: "*" }; + } +} + +// Describes a GCE resource that is being managed by a running +// [pipeline][google.genomics.v1alpha2.Pipeline]. +message GCE { + // The instance on which the operation is running. + string instance_name = 1; + + // The availability zone in which the instance resides. + string zone = 2; + + // The machine type of the instance. + string machine_type = 3; + + // The names of the disks that were created for this pipeline. + repeated string disk_names = 4; +} + +// Runtime metadata that will be populated in the +// [runtimeMetadata][google.genomics.v1.OperationMetadata.runtime_metadata] +// field of the Operation associated with a RunPipeline execution. +message RuntimeMetadata { + // Execution information specific to Google Compute Engine. + GCE gce = 1; +} + +// The pipeline object. Represents a transformation from a set of input +// parameters to a set of output parameters. The transformation is defined +// as a docker image and command to run within that image. Each pipeline +// is run on a Google Compute Engine VM. A pipeline can be created with the +// `create` method and then later run with the `run` method, or a pipeline can +// be defined and run all at once with the `run` method. +message Pipeline { + // Required. The project in which to create the pipeline. The caller must have + // WRITE access. + string project_id = 1; + + // Required. A user specified pipeline name that does not have to be unique. + // This name can be used for filtering Pipelines in ListPipelines. + string name = 2; + + // User-specified description. + string description = 3; + + // Input parameters of the pipeline. + repeated PipelineParameter input_parameters = 8; + + // Output parameters of the pipeline. + repeated PipelineParameter output_parameters = 9; + + // Required. The executor indicates in which environment the pipeline runs. + oneof executor { + // Specifies the docker run information. + DockerExecutor docker = 5; + } + + // Required. Specifies resource requirements for the pipeline run. + // Required fields: + // + // * + // [minimumCpuCores][google.genomics.v1alpha2.PipelineResources.minimum_cpu_cores] + // + // * + // [minimumRamGb][google.genomics.v1alpha2.PipelineResources.minimum_ram_gb] + PipelineResources resources = 6; + + // Unique pipeline id that is generated by the service when CreatePipeline + // is called. Cannot be specified in the Pipeline used in the + // CreatePipelineRequest, and will be populated in the response to + // CreatePipeline and all subsequent Get and List calls. Indicates that the + // service has registered this pipeline. 
+  string pipeline_id = 7;
+}
+
+// The request to create a pipeline. The pipeline field here should not have
+// `pipelineId` populated, as that will be populated by the server.
+message CreatePipelineRequest {
+  // The pipeline to create. Should not have `pipelineId` populated.
+  Pipeline pipeline = 1;
+}
+
+// The pipeline run arguments.
+message RunPipelineArgs {
+  // Required. The project in which to run the pipeline. The caller must have
+  // WRITE access to all Google Cloud services and resources (e.g. Google
+  // Compute Engine) that will be used.
+  string project_id = 1;
+
+  // Pipeline input arguments; keys are defined in the pipeline documentation.
+  // All input parameters that do not have default values must be specified.
+  // If parameters with defaults are specified here, the defaults will be
+  // overridden.
+  map<string, string> inputs = 2;
+
+  // Pipeline output arguments; keys are defined in the pipeline
+  // documentation. All output parameters without default values
+  // must be specified. If parameters with defaults are specified
+  // here, the defaults will be overridden.
+  map<string, string> outputs = 3;
+
+  // The Google Cloud Service Account that will be used to access data and
+  // services. By default, the compute service account associated with
+  // `projectId` is used.
+  ServiceAccount service_account = 4;
+
+  // Client-specified pipeline operation identifier.
+  string client_id = 5;
+
+  // Specifies resource requirements/overrides for the pipeline run.
+  PipelineResources resources = 6;
+
+  // Required. Logging options. Used by the service to communicate results
+  // to the user.
+  LoggingOptions logging = 7;
+}
+
+// The request to run a pipeline. If `pipelineId` is specified, it
+// refers to a saved pipeline created with CreatePipeline and set as
+// the `pipelineId` of the returned Pipeline object. If
+// `ephemeralPipeline` is specified, that pipeline is run once
+// with the given args and not saved. It is an error to specify both
+// `pipelineId` and `ephemeralPipeline`. `pipelineArgs`
+// must be specified.
+message RunPipelineRequest {
+  oneof pipeline {
+    // The already created pipeline to run.
+    string pipeline_id = 1;
+
+    // A new pipeline object to run once and then delete.
+    Pipeline ephemeral_pipeline = 2;
+  }
+
+  // The arguments to use when running this pipeline.
+  RunPipelineArgs pipeline_args = 3;
+}
+
+// A request to get a saved pipeline by ID.
+message GetPipelineRequest {
+  // Caller must have READ access to the project in which this pipeline
+  // is defined.
+  string pipeline_id = 1;
+}
+
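+// For illustration only, not part of the API definition: a hypothetical
+// RunPipelineRequest body in the JSON/REST mapping that runs a previously
+// saved pipeline. The project, bucket paths, and parameter names are made
+// up; `inputs` and `outputs` keys must match parameter names declared on
+// the pipeline, and `logging.gcsPath` follows the LoggingOptions rules
+// described below.
+//
+// ```
+// {
+//   "pipelineId": "PIPELINE_ID",
+//   "pipelineArgs": {
+//     "projectId": "my-project",
+//     "inputs": {"inputFile": "gs://my-bucket/in.txt"},
+//     "outputs": {"outputFile": "gs://my-bucket/out.txt"},
+//     "logging": {"gcsPath": "gs://my-bucket/logs"},
+//     "resources": {"minimumRamGb": 8}
+//   }
+// }
+// ```
+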
+// A request to list pipelines in a given project. Pipelines can be
+// filtered by name using `namePrefix`: all pipelines with names that
+// begin with `namePrefix` will be returned. Uses standard pagination:
+// `pageSize` indicates how many pipelines to return, and
+// `pageToken` comes from a previous ListPipelinesResponse to
+// indicate offset.
+message ListPipelinesRequest {
+  // Required. The name of the project to search for pipelines. Caller
+  // must have READ access to this project.
+  string project_id = 1;
+
+  // Pipelines with names that match this prefix should be
+  // returned. If unspecified, all pipelines in the project, up to
+  // `pageSize`, will be returned.
+  string name_prefix = 2;
+
+  // Number of pipelines to return at once. Defaults to 256, and max
+  // is 2048.
+  int32 page_size = 3;
+
+  // Token to use to indicate where to start getting results.
+  // If unspecified, returns the first page of results.
+  string page_token = 4;
+}
+
+// The response of ListPipelines. Contains at most `pageSize`
+// pipelines. If it contains `pageSize` pipelines, and more pipelines
+// exist, then `nextPageToken` will be populated and should be
+// used as the `pageToken` argument to a subsequent ListPipelines
+// request.
+message ListPipelinesResponse {
+  // The matched pipelines.
+  repeated Pipeline pipelines = 1;
+
+  // The token to use to get the next page of results.
+  string next_page_token = 2;
+}
+
+// The request to delete a saved pipeline by ID.
+message DeletePipelineRequest {
+  // Caller must have WRITE access to the project in which this pipeline
+  // is defined.
+  string pipeline_id = 1;
+}
+
+// Request to get controller configuration. Should only be used
+// by VMs created by the Pipelines Service and not by end users.
+message GetControllerConfigRequest {
+  // The operation to retrieve controller configuration for.
+  string operation_id = 1;
+
+  uint64 validation_token = 2;
+}
+
+// Stores the information that the controller will fetch from the
+// server in order to run. Should only be used by VMs created by the
+// Pipelines Service and not by end users.
+message ControllerConfig {
+  message RepeatedString {
+    repeated string values = 1;
+  }
+
+  string image = 1;
+
+  string cmd = 2;
+
+  string gcs_log_path = 3;
+
+  string machine_type = 4;
+
+  map<string, string> vars = 5;
+
+  map<string, string> disks = 6;
+
+  map<string, RepeatedString> gcs_sources = 7;
+
+  map<string, RepeatedString> gcs_sinks = 8;
+}
+
+// Stores the list of events and times they occurred for major events in job
+// execution.
+message TimestampEvent {
+  // String indicating the type of event.
+  string description = 1;
+
+  // The time this event occurred.
+  google.protobuf.Timestamp timestamp = 2;
+}
+
+// Request to set operation status. Should only be used by VMs
+// created by the Pipelines Service and not by end users.
+message SetOperationStatusRequest {
+  string operation_id = 1;
+
+  repeated TimestampEvent timestamp_events = 2;
+
+  google.rpc.Code error_code = 3;
+
+  string error_message = 4;
+
+  uint64 validation_token = 5;
+}
+
+// A Google Cloud Service Account.
+message ServiceAccount {
+  // Email address of the service account. Defaults to `default`,
+  // which uses the compute service account associated with the project.
+  string email = 1;
+
+  // List of scopes to be enabled for this service account on the
+  // pipeline virtual machine.
+  // The following scopes are automatically included:
+  // * https://www.googleapis.com/auth/genomics
+  // * https://www.googleapis.com/auth/compute
+  // * https://www.googleapis.com/auth/devstorage.full_control
+  repeated string scopes = 2;
+}
+
+// The logging options for the pipeline run.
+message LoggingOptions {
+  // The location in Google Cloud Storage to which the pipeline logs
+  // will be copied. Can be specified as a fully qualified directory
+  // path, in which case logs will be output with a unique identifier
+  // as the filename in that directory, or as a fully specified path,
+  // which must end in `.log`, in which case that path will be
+  // used, and the user must ensure that logs are not
+  // overwritten. Stdout and stderr logs from the run are also
+  // generated and output as `-stdout.log` and `-stderr.log`.
+  string gcs_path = 1;
+}
+
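+// For illustration only, not part of the API definition: two hypothetical
+// `gcsPath` values and the behavior each implies, per the LoggingOptions
+// description above (bucket names are made up, and the exact generated
+// filenames are not specified here).
+//
+// ```
+// {"gcsPath": "gs://my-bucket/logs/run.log"}
+// {"gcsPath": "gs://my-bucket/logs/"}
+// ```
+//
+// The first form writes the combined log to exactly that object, with
+// `-stdout.log` and `-stderr.log` variants written alongside; the second
+// writes uniquely named log files into that directory.
+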
+// The system resources for the pipeline run.
+message PipelineResources {
+  // A Google Compute Engine disk resource specification.
+  message Disk {
+    // The types of disks that may be attached to VMs.
+    enum Type {
+      // Default disk type. Use one of the other options below.
+      TYPE_UNSPECIFIED = 0;
+
+      // Specifies a Google Compute Engine persistent hard disk. See
+      // https://cloud.google.com/compute/docs/disks/persistent-disks#typeofdisks
+      // for details.
+      PERSISTENT_HDD = 1;
+
+      // Specifies a Google Compute Engine persistent solid-state disk. See
+      // https://cloud.google.com/compute/docs/disks/persistent-disks#typeofdisks
+      // for details.
+      PERSISTENT_SSD = 2;
+
+      // Specifies a Google Compute Engine local SSD.
+      // See https://cloud.google.com/compute/docs/disks/local-ssd for details.
+      LOCAL_SSD = 3;
+    }
+
+    // Required. The name of the disk that can be used in the pipeline
+    // parameters. Must be 1 - 63 characters.
+    // The name "boot" is reserved for system use.
+    string name = 1;
+
+    // Required. The type of the disk to create.
+    Type type = 2;
+
+    // The size of the disk. Defaults to 500 (GB).
+    // This field is not applicable for local SSD.
+    int32 size_gb = 3;
+
+    // The full or partial URL of the persistent disk to attach. See
+    // https://cloud.google.com/compute/docs/reference/latest/instances#resource
+    // and
+    // https://cloud.google.com/compute/docs/disks/persistent-disks#snapshots
+    // for more details.
+    string source = 4;
+
+    // Specifies whether or not to delete the disk when the pipeline
+    // completes. This field is applicable only for newly created disks. See
+    // https://cloud.google.com/compute/docs/reference/latest/instances#resource
+    // for more details.
+    // By default, `autoDelete` is `false`. `autoDelete` will be enabled if set
+    // to `true` at create time or run time.
+    bool auto_delete = 6;
+
+    // Specifies how a source-based persistent disk will be mounted. See
+    // https://cloud.google.com/compute/docs/disks/persistent-disks#use_multi_instances
+    // for more details.
+    // Can only be set at create time.
+    bool read_only = 7;
+
+    // Required at create time and cannot be overridden at run time.
+    // Specifies the path in the docker container where files on
+    // this disk should be located. For example, if `mountPoint`
+    // is `/mnt/disk`, and the parameter has `localPath`
+    // `inputs/file.txt`, the docker container can access the data at
+    // `/mnt/disk/inputs/file.txt`.
+    string mount_point = 8;
+  }
+
+  // The minimum number of cores to use. Defaults to 1.
+  int32 minimum_cpu_cores = 1;
+
+  // At create time means that preemptible machines may be
+  // used for the run. At run time, means they should be used. Cannot
+  // be true at run time if false at create time.
+  // Defaults to `false`.
+  bool preemptible = 2;
+
+  // The minimum amount of RAM to use. Defaults to 3.75 (GB).
+  double minimum_ram_gb = 3;
+
+  // Disks to attach.
+  repeated Disk disks = 4;
+
+  // List of Google Compute Engine availability zones to which resource
+  // creation will be restricted. If empty, any zone may be chosen.
+  repeated string zones = 5;
+
+  // The size of the boot disk. Defaults to 10 (GB).
+  int32 boot_disk_size_gb = 6;
+}
+
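+// For illustration only, not part of the API definition: a hypothetical
+// PipelineResources value in the JSON/REST mapping. The zones, sizes, and
+// the disk name `pd1` are made up; the same `pd1`/`/mnt/disk` pairing is
+// reused in the PipelineParameter example that follows.
+//
+// ```
+// {
+//   "minimumCpuCores": 4,
+//   "minimumRamGb": 15,
+//   "preemptible": true,
+//   "zones": ["us-central1-a", "us-central1-b"],
+//   "disks": [{
+//     "name": "pd1",
+//     "type": "PERSISTENT_SSD",
+//     "sizeGb": 200,
+//     "autoDelete": true,
+//     "mountPoint": "/mnt/disk"
+//   }]
+// }
+// ```
+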
+// Parameters facilitate setting and delivering data into the
+// pipeline's execution environment. They are defined at create time,
+// with optional defaults, and can be overridden at run time.
+//
+// If `localCopy` is unset, then the parameter specifies a string that
+// is passed as-is into the pipeline, as the value of the environment
+// variable with the given name. A default value can be optionally
+// specified at create time. The default can be overridden at run time
+// using the inputs map. If no default is given, a value must be
+// supplied at runtime.
+//
+// If `localCopy` is defined, then the parameter specifies a data
+// source or sink, both in Google Cloud Storage and on the Docker container
+// where the pipeline computation is run. The [service account associated with
+// the Pipeline][google.genomics.v1alpha2.RunPipelineArgs.service_account] (by
+// default the project's Compute Engine service account) must have access to the
+// Google Cloud Storage paths.
+//
+// At run time, the Google Cloud Storage paths can be overridden if a default
+// was provided at create time, or must be set otherwise. The pipeline runner
+// should add a key/value pair to either the inputs or outputs map. The
+// indicated data copies will be carried out before/after pipeline execution,
+// just as if the corresponding arguments were provided to `gsutil cp`.
+//
+// For example: Given the following `PipelineParameter`, specified
+// in the `inputParameters` list:
+//
+// ```
+// {name: "input_file", localCopy: {path: "file.txt", disk: "pd1"}}
+// ```
+//
+// where `disk` is defined in the `PipelineResources` object as:
+//
+// ```
+// {name: "pd1", mountPoint: "/mnt/disk/"}
+// ```
+//
+// We create a disk named `pd1`, mount it on the host VM, and map
+// `/mnt/pd1` to `/mnt/disk` in the docker container. At
+// runtime, an entry for `input_file` would be required in the inputs
+// map, such as:
+//
+// ```
+// inputs["input_file"] = "gs://my-bucket/bar.txt"
+// ```
+//
+// This would generate the following gsutil call:
+//
+// ```
+// gsutil cp gs://my-bucket/bar.txt /mnt/pd1/file.txt
+// ```
+//
+// The file `/mnt/pd1/file.txt` maps to `/mnt/disk/file.txt` in the
+// Docker container. Acceptable paths are:
+//
+// | Google Cloud Storage path | Local path |
+// |---------------------------|------------|
+// | file                      | file       |
+// | glob                      | directory  |
+//
+// For outputs, the direction of the copy is reversed:
+//
+// ```
+// gsutil cp /mnt/disk/file.txt gs://my-bucket/bar.txt
+// ```
+//
+// Acceptable paths are:
+//
+// | Local path | Google Cloud Storage path                                  |
+// |------------|------------------------------------------------------------|
+// | file       | file                                                       |
+// | file       | directory - directory must already exist                   |
+// | glob       | directory - directory will be created if it doesn't exist |
+//
+// One restriction, due to Docker limitations, is that for outputs that are
+// found on the boot disk, the local path cannot be a glob and must be a file.
+message PipelineParameter {
+  // LocalCopy defines how a remote file should be copied to and from the VM.
+  message LocalCopy {
+    // Required. The path within the user's docker container where
+    // this input should be localized to and from, relative to the specified
+    // disk's mount point. For example: `file.txt`.
+    string path = 1;
+
+    // Required. The name of the disk where this parameter is
+    // located. Can be the name of one of the disks specified in the
+    // Resources field, or "boot", which represents the Docker
+    // instance's boot disk and has a mount point of `/`.
+    string disk = 2;
+  }
+
+  // Required. Name of the parameter; the pipeline runner uses this string
+  // as the key to the input and output maps in RunPipeline.
+  string name = 1;
+
+  // Human-readable description.
+  string description = 2;
+
+  // The default value for this parameter. Can be overridden at runtime.
+  // If `localCopy` is present, then this must be a Google Cloud Storage path
+  // beginning with `gs://`.
+  string default_value = 5;
+
+  // If present, this parameter is marked for copying to and from the VM.
+  // `LocalCopy` indicates where on the VM the file should be. The value
+  // given to this parameter (either at runtime or using `defaultValue`)
+  // must be the remote path where the file should be.
+  LocalCopy local_copy = 6;
+}
+
+// The Docker executor specification.
+message DockerExecutor {
+  // Required. Image name from either Docker Hub or Google Container Registry.
+  // Users that run pipelines must have READ access to the image.
+  string image_name = 1;
+
+  // Required. The command string to run. Parameters that do not have
+  // `localCopy` specified should be used as environment variables, while
+  // those that do can be accessed at the defined paths.
+  string cmd = 2;
+}
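+
+// For illustration only, not part of the API definition: a hypothetical
+// DockerExecutor value in the JSON/REST mapping. The image, paths, and
+// parameter names are made up. It assumes an input parameter whose
+// `localCopy` places `file.txt` on a disk mounted at `/mnt/disk`, an output
+// parameter whose `localCopy` collects `out.txt` from the same disk, and a
+// non-localCopy parameter named `GREETING` exposed to the command as an
+// environment variable; the exact shell invocation used to run `cmd` is not
+// specified here.
+//
+// ```
+// {
+//   "imageName": "ubuntu:14.04",
+//   "cmd": "echo ${GREETING}; wc -l /mnt/disk/file.txt > /mnt/disk/out.txt"
+// }
+// ```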