// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/api/field_behavior.proto";
import "google/cloud/documentai/v1beta3/barcode.proto";
import "google/cloud/documentai/v1beta3/geometry.proto";
import "google/protobuf/timestamp.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/date.proto";
import "google/type/datetime.proto";
import "google/type/money.proto";
import "google/type/postal_address.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Document represents the canonical document resource in Document AI. It is an
// interchange format that provides insights into documents and allows for
// collaboration between users and Document AI to iterate and optimize for
// quality.
message Document {
  // For a large document, sharding may be performed to produce several
  // document shards. Each document shard contains this field to detail which
  // shard it is.
  message ShardInfo {
    // The 0-based index of this shard.
    int64 shard_index = 1;

    // Total number of shards.
    int64 shard_count = 2;

    // The index of the first character in
    // [Document.text][google.cloud.documentai.v1beta3.Document.text] in the
    // overall document global text.
    int64 text_offset = 3;
  }

  // Annotation for common text style attributes. This adheres to CSS
  // conventions as much as possible.
  message Style {
    // Font size with unit.
    message FontSize {
      // Font size for the text.
      float size = 1;

      // Unit for the font size. Follows CSS naming (such as `in`, `px`, and
      // `pt`).
      string unit = 2;
    }

    // Text anchor indexing into the
    // [Document.text][google.cloud.documentai.v1beta3.Document.text].
    TextAnchor text_anchor = 1;

    // Text color.
    google.type.Color color = 2;

    // Text background color.
    google.type.Color background_color = 3;

    // [Font weight](https://www.w3schools.com/cssref/pr_font_weight.asp).
    // Possible values are `normal`, `bold`, `bolder`, and `lighter`.
    string font_weight = 4;

    // [Text style](https://www.w3schools.com/cssref/pr_font_font-style.asp).
    // Possible values are `normal`, `italic`, and `oblique`.
    string text_style = 5;

    // [Text
    // decoration](https://www.w3schools.com/cssref/pr_text_text-decoration.asp).
    // Follows CSS standard. <text-decoration-line> <text-decoration-color>
    // <text-decoration-style>
    string text_decoration = 6;

    // Font size.
    FontSize font_size = 7;

    // Font family such as `Arial`, `Times New Roman`.
    // https://www.w3schools.com/cssref/pr_font_font-family.asp
    string font_family = 8;
  }

  // A page in a [Document][google.cloud.documentai.v1beta3.Document].
  message Page {
    // Dimension for the page.
    message Dimension {
      // Page width.
      float width = 1;

      // Page height.
      float height = 2;

      // Dimension unit.
      string unit = 3;
    }

    // Rendered image contents for this page.
    message Image {
      // Raw byte content of the image.
      bytes content = 1;

      // Encoding [media type (MIME
      // type)](https://www.iana.org/assignments/media-types/media-types.xhtml)
      // for the image.
      string mime_type = 2;

      // Width of the image in pixels.
      int32 width = 3;

      // Height of the image in pixels.
      int32 height = 4;
    }

    // Representation for transformation matrix, intended to be compatible and
    // used with OpenCV format for image manipulation.
    message Matrix {
      // Number of rows in the matrix.
      int32 rows = 1;

      // Number of columns in the matrix.
      int32 cols = 2;

      // This encodes information about what data type the matrix uses.
      // For example, 0 (CV_8U) is an unsigned 8-bit image. For the full list
      // of OpenCV primitive data types, please refer to
      // https://docs.opencv.org/4.3.0/d1/d1b/group__core__hal__interface.html
      int32 type = 3;

      // The matrix data.
      bytes data = 4;
    }

    // Visual element describing a layout unit on a page.
    message Layout {
      // Detected human reading orientation.
      enum Orientation {
        // Unspecified orientation.
        ORIENTATION_UNSPECIFIED = 0;

        // Orientation is aligned with page up.
        PAGE_UP = 1;

        // Orientation is aligned with page right.
        // Turn the head 90 degrees clockwise from upright to read.
        PAGE_RIGHT = 2;

        // Orientation is aligned with page down.
        // Turn the head 180 degrees from upright to read.
        PAGE_DOWN = 3;

        // Orientation is aligned with page left.
        // Turn the head 90 degrees counterclockwise from upright to read.
        PAGE_LEFT = 4;
      }

      // Text anchor indexing into the
      // [Document.text][google.cloud.documentai.v1beta3.Document.text].
      TextAnchor text_anchor = 1;

      // Confidence of the current
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] within
      // context of the object this layout is for. e.g. confidence can be for a
      // single token, a table, a visual element, etc. depending on context.
      // Range `[0, 1]`.
      float confidence = 2;

      // The bounding polygon for the
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout].
      BoundingPoly bounding_poly = 3;

      // Detected orientation for the
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout].
      Orientation orientation = 4;
    }

    // A block has a set of lines (collected into paragraphs) that have a
    // common line-spacing and orientation.
    message Block {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [Block][google.cloud.documentai.v1beta3.Document.Page.Block].
      Layout layout = 1;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 2;

      // The history of this annotation.
      Provenance provenance = 3 [deprecated = true];
    }

    // A collection of lines that a human would perceive as a paragraph.
    message Paragraph {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [Paragraph][google.cloud.documentai.v1beta3.Document.Page.Paragraph].
      Layout layout = 1;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 2;

      // The history of this annotation.
      Provenance provenance = 3 [deprecated = true];
    }

    // A collection of tokens that a human would perceive as a line.
    // Does not cross column boundaries, can be horizontal, vertical, etc.
    message Line {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [Line][google.cloud.documentai.v1beta3.Document.Page.Line].
      Layout layout = 1;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 2;

      // The history of this annotation.
      Provenance provenance = 3 [deprecated = true];
    }

    // A detected token.
    message Token {
      // Detected break at the end of a
      // [Token][google.cloud.documentai.v1beta3.Document.Page.Token].
      message DetectedBreak {
        // Enum to denote the type of break found.
        enum Type {
          // Unspecified break type.
          TYPE_UNSPECIFIED = 0;

          // A single whitespace.
          SPACE = 1;

          // A wider whitespace.
          WIDE_SPACE = 2;

          // A hyphen that indicates that a token has been split across lines.
          HYPHEN = 3;
        }

        // Detected break type.
        Type type = 1;
      }

      // Font and other text style attributes.
      message StyleInfo {
        // Font size in points (`1` point is `¹⁄₇₂` inches).
        int32 font_size = 1;

        // Font size in pixels, equal to _unrounded
        // [font_size][google.cloud.documentai.v1beta3.Document.Page.Token.StyleInfo.font_size]_
        // * _resolution_ ÷ `72.0`.
        double pixel_font_size = 2;

        // Letter spacing in points.
        double letter_spacing = 3;

        // Name or style of the font.
        string font_type = 4;

        // Whether the text is bold (equivalent to
        // [font_weight][google.cloud.documentai.v1beta3.Document.Page.Token.StyleInfo.font_weight]
        // is at least `700`).
        bool bold = 5;

        // Whether the text is italic.
        bool italic = 6;

        // Whether the text is underlined.
        bool underlined = 7;

        // Whether the text is strikethrough.
        bool strikeout = 8;

        // Whether the text is a subscript.
        bool subscript = 9;

        // Whether the text is a superscript.
        bool superscript = 10;

        // Whether the text is in small caps.
        bool smallcaps = 11;

        // TrueType weight on a scale `100` (thin) to `1000` (ultra-heavy).
        // Normal is `400`, bold is `700`.
        int32 font_weight = 12;

        // Whether the text is handwritten.
        bool handwritten = 13;

        // Color of the text.
        google.type.Color text_color = 14;

        // Color of the background.
        google.type.Color background_color = 15;
      }

      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [Token][google.cloud.documentai.v1beta3.Document.Page.Token].
      Layout layout = 1;

      // Detected break at the end of a
      // [Token][google.cloud.documentai.v1beta3.Document.Page.Token].
      DetectedBreak detected_break = 2;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 3;

      // The history of this annotation.
      Provenance provenance = 4 [deprecated = true];

      // Text style attributes.
      StyleInfo style_info = 5;
    }

    // A detected symbol.
    message Symbol {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [Symbol][google.cloud.documentai.v1beta3.Document.Page.Symbol].
      Layout layout = 1;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 2;
    }

    // Detected non-text visual elements e.g. checkbox, signature etc. on the
    // page.
    message VisualElement {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [VisualElement][google.cloud.documentai.v1beta3.Document.Page.VisualElement].
      Layout layout = 1;

      // Type of the
      // [VisualElement][google.cloud.documentai.v1beta3.Document.Page.VisualElement].
      string type = 2;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 3;
    }

    // A table representation similar to HTML table structure.
    message Table {
      // A row of table cells.
      message TableRow {
        // Cells that make up this row.
        repeated TableCell cells = 1;
      }

      // A cell representation inside the table.
      message TableCell {
        // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
        // [TableCell][google.cloud.documentai.v1beta3.Document.Page.Table.TableCell].
        Layout layout = 1;

        // How many rows this cell spans.
        int32 row_span = 2;

        // How many columns this cell spans.
        int32 col_span = 3;

        // A list of detected languages together with confidence.
        repeated DetectedLanguage detected_languages = 4;
      }

      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [Table][google.cloud.documentai.v1beta3.Document.Page.Table].
      Layout layout = 1;

      // Header rows of the table.
      repeated TableRow header_rows = 2;

      // Body rows of the table.
      repeated TableRow body_rows = 3;

      // A list of detected languages together with confidence.
      repeated DetectedLanguage detected_languages = 4;

      // The history of this table.
      Provenance provenance = 5 [deprecated = true];
    }

    // A form field detected on the page.
    message FormField {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for the
      // [FormField][google.cloud.documentai.v1beta3.Document.Page.FormField]
      // name. e.g. `Address`, `Email`, `Grand total`, `Phone number`, etc.
      Layout field_name = 1;

      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for the
      // [FormField][google.cloud.documentai.v1beta3.Document.Page.FormField]
      // value.
      Layout field_value = 2;

      // A list of detected languages for name together with confidence.
      repeated DetectedLanguage name_detected_languages = 3;

      // A list of detected languages for value together with confidence.
      repeated DetectedLanguage value_detected_languages = 4;

      // If the value is non-textual, this field represents the type. Current
      // valid values are:
      //
      // - blank (this indicates the `field_value` is normal text)
      // - `unfilled_checkbox`
      // - `filled_checkbox`
      string value_type = 5;

      // Created for Labeling UI to export key text.
      // If corrections were made to the text identified by the
      // `field_name.text_anchor`, this field will contain the correction.
      string corrected_key_text = 6;

      // Created for Labeling UI to export value text.
      // If corrections were made to the text identified by the
      // `field_value.text_anchor`, this field will contain the correction.
      string corrected_value_text = 7;

      // The history of this annotation.
      Provenance provenance = 8;
    }

    // A detected barcode.
    message DetectedBarcode {
      // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for
      // [DetectedBarcode][google.cloud.documentai.v1beta3.Document.Page.DetectedBarcode].
      Layout layout = 1;

      // Detailed barcode information of the
      // [DetectedBarcode][google.cloud.documentai.v1beta3.Document.Page.DetectedBarcode].
      Barcode barcode = 2;
    }

    // Detected language for a structural component.
    message DetectedLanguage {
      // The [BCP-47 language
      // code](https://www.unicode.org/reports/tr35/#Unicode_locale_identifier),
      // such as `en-US` or `sr-Latn`.
      string language_code = 1;

      // Confidence of detected language. Range `[0, 1]`.
      float confidence = 2;
    }

    // Image quality scores for the page image.
    message ImageQualityScores {
      // A detected image quality defect.
      message DetectedDefect {
        // Name of the defect type. Supported values are:
        //
        // - `quality/defect_blurry`
        // - `quality/defect_noisy`
        // - `quality/defect_dark`
        // - `quality/defect_faint`
        // - `quality/defect_text_too_small`
        // - `quality/defect_document_cutoff`
        // - `quality/defect_text_cutoff`
        // - `quality/defect_glare`
        string type = 1;

        // Confidence of detected defect. Range `[0, 1]` where `1` indicates
        // strong confidence that the defect exists.
        float confidence = 2;
      }

      // The overall quality score. Range `[0, 1]` where `1` is perfect quality.
      float quality_score = 1;

      // A list of detected defects.
      repeated DetectedDefect detected_defects = 2;
    }

    // 1-based index for current
    // [Page][google.cloud.documentai.v1beta3.Document.Page] in a parent
    // [Document][google.cloud.documentai.v1beta3.Document]. Useful when a page
    // is taken out of a [Document][google.cloud.documentai.v1beta3.Document]
    // for individual processing.
    int32 page_number = 1;

    // Rendered image for this page. This image is preprocessed to remove any
    // skew, rotation, and distortions such that the annotation bounding boxes
    // can be upright and axis-aligned.
    Image image = 13;

    // Transformation matrices that were applied to the original document image
    // to produce
    // [Page.image][google.cloud.documentai.v1beta3.Document.Page.image].
    repeated Matrix transforms = 14;

    // Physical dimension of the page.
    Dimension dimension = 2;

    // [Layout][google.cloud.documentai.v1beta3.Document.Page.Layout] for the
    // page.
    Layout layout = 3;

    // A list of detected languages together with confidence.
    repeated DetectedLanguage detected_languages = 4;

    // A list of visually detected text blocks on the page.
    // A block has a set of lines (collected into paragraphs) that have a common
    // line-spacing and orientation.
    repeated Block blocks = 5;

    // A list of visually detected text paragraphs on the page.
    // A collection of lines that a human would perceive as a paragraph.
    repeated Paragraph paragraphs = 6;

    // A list of visually detected text lines on the page.
    // A collection of tokens that a human would perceive as a line.
    repeated Line lines = 7;

    // A list of visually detected tokens on the page.
    repeated Token tokens = 8;

    // A list of detected non-text visual elements e.g. checkbox,
    // signature etc. on the page.
    repeated VisualElement visual_elements = 9;

    // A list of visually detected tables on the page.
    repeated Table tables = 10;

    // A list of visually detected form fields on the page.
    repeated FormField form_fields = 11;

    // A list of visually detected symbols on the page.
    repeated Symbol symbols = 12;

    // A list of detected barcodes.
    repeated DetectedBarcode detected_barcodes = 15;

    // Image quality scores.
    ImageQualityScores image_quality_scores = 17;

    // The history of this page.
    Provenance provenance = 16 [deprecated = true];
  }

  // An entity that could be a phrase in the text or a property that belongs to
  // the document. It is a known entity type, such as a person, an organization,
  // or location.
  message Entity {
    // Parsed and normalized entity value.
    message NormalizedValue {
      // An optional structured entity value.
      // Must match entity type defined in schema if
      // known. If this field is present, the `text` field could also be
      // populated.
      oneof structured_value {
        // Money value. See also:
        // https://github.com/googleapis/googleapis/blob/master/google/type/money.proto
        google.type.Money money_value = 2;

        // Date value. Includes year, month, day. See also:
        // https://github.com/googleapis/googleapis/blob/master/google/type/date.proto
        google.type.Date date_value = 3;

        // DateTime value. Includes date, time, and timezone. See also:
        // https://github.com/googleapis/googleapis/blob/master/google/type/datetime.proto
        google.type.DateTime datetime_value = 4;

        // Postal address. See also:
        // https://github.com/googleapis/googleapis/blob/master/google/type/postal_address.proto
        google.type.PostalAddress address_value = 5;

        // Boolean value. Can be used for entities with binary values, or for
        // checkboxes.
        bool boolean_value = 6;

        // Integer value.
        int32 integer_value = 7;

        // Float value.
        float float_value = 8;
      }

      // Optional. An optional field to store a normalized string.
      // For some entity types, one of respective `structured_value` fields may
      // also be populated. Also not all the types of `structured_value` will be
      // normalized. For example, some processors may not generate `float`
      // or `integer` normalized text by default.
      //
      // Below are sample formats mapped to structured values.
      //
      // - Money/Currency type (`money_value`) is in the ISO 4217 text format.
      // - Date type (`date_value`) is in the ISO 8601 text format.
      // - Datetime type (`datetime_value`) is in the ISO 8601 text format.
      string text = 1 [(google.api.field_behavior) = OPTIONAL];
    }

    // Optional. Provenance of the entity.
    // Text anchor indexing into the
    // [Document.text][google.cloud.documentai.v1beta3.Document.text].
    TextAnchor text_anchor = 1 [(google.api.field_behavior) = OPTIONAL];

    // Required. Entity type from a schema e.g. `Address`.
    string type = 2 [(google.api.field_behavior) = REQUIRED];

    // Optional. Text value of the entity e.g. `1600 Amphitheatre Pkwy`.
    string mention_text = 3 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Deprecated. Use `id` field instead.
    string mention_id = 4 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Confidence of detected Schema entity. Range `[0, 1]`.
    float confidence = 5 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Represents the provenance of this entity with respect to the
    // location on the page where it was found.
    PageAnchor page_anchor = 6 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Canonical id. This will be a unique value in the entity list
    // for this document.
    string id = 7 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Normalized entity value. Absent if the extracted value could
    // not be converted or the type (e.g. address) is not supported for certain
    // parsers. This field is also only populated for certain supported document
    // types.
    NormalizedValue normalized_value = 9
        [(google.api.field_behavior) = OPTIONAL];

    // Optional. Entities can be nested to form a hierarchical data structure
    // representing the content in the document.
    repeated Entity properties = 10 [(google.api.field_behavior) = OPTIONAL];

    // Optional. The history of this annotation.
    Provenance provenance = 11 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Whether the entity will be redacted for de-identification
    // purposes.
    bool redacted = 12 [(google.api.field_behavior) = OPTIONAL];
  }

  // Relationship between
  // [Entities][google.cloud.documentai.v1beta3.Document.Entity].
  message EntityRelation {
    // Subject entity id.
    string subject_id = 1;

    // Object entity id.
    string object_id = 2;

    // Relationship description.
    string relation = 3;
  }

  // Text reference indexing into the
  // [Document.text][google.cloud.documentai.v1beta3.Document.text].
  message TextAnchor {
    // A text segment in the
    // [Document.text][google.cloud.documentai.v1beta3.Document.text]. The
    // indices may be out of bounds which indicate that the text extends into
    // another document shard for large sharded documents. See
    // [ShardInfo.text_offset][google.cloud.documentai.v1beta3.Document.ShardInfo.text_offset]
    message TextSegment {
      // [TextSegment][google.cloud.documentai.v1beta3.Document.TextAnchor.TextSegment]
      // start UTF-8 char index in the
      // [Document.text][google.cloud.documentai.v1beta3.Document.text].
      int64 start_index = 1;

      // [TextSegment][google.cloud.documentai.v1beta3.Document.TextAnchor.TextSegment]
      // half open end UTF-8 char index in the
      // [Document.text][google.cloud.documentai.v1beta3.Document.text].
      int64 end_index = 2;
    }

    // The text segments from the
    // [Document.text][google.cloud.documentai.v1beta3.Document.text].
    repeated TextSegment text_segments = 1;

    // Contains the content of the text span so that users do
    // not have to look it up in the text_segments.  It is always
    // populated for formFields.
    string content = 2;
  }

  // Referencing the visual context of the entity in the
  // [Document.pages][google.cloud.documentai.v1beta3.Document.pages]. Page
  // anchors can be cross-page, consist of multiple bounding polygons and
  // optionally reference specific layout element types.
  message PageAnchor {
    // Represents a weak reference to a page element within a document.
    message PageRef {
      // The type of layout that is being referenced.
      enum LayoutType {
        // Layout Unspecified.
        LAYOUT_TYPE_UNSPECIFIED = 0;

        // References a
        // [Page.blocks][google.cloud.documentai.v1beta3.Document.Page.blocks]
        // element.
        BLOCK = 1;

        // References a
        // [Page.paragraphs][google.cloud.documentai.v1beta3.Document.Page.paragraphs]
        // element.
        PARAGRAPH = 2;

        // References a
        // [Page.lines][google.cloud.documentai.v1beta3.Document.Page.lines]
        // element.
        LINE = 3;

        // References a
        // [Page.tokens][google.cloud.documentai.v1beta3.Document.Page.tokens]
        // element.
        TOKEN = 4;

        // References a
        // [Page.visual_elements][google.cloud.documentai.v1beta3.Document.Page.visual_elements]
        // element.
        VISUAL_ELEMENT = 5;

        // References a
        // [Page.tables][google.cloud.documentai.v1beta3.Document.Page.tables]
        // element.
        TABLE = 6;

        // References a
        // [Page.form_fields][google.cloud.documentai.v1beta3.Document.Page.form_fields]
        // element.
        FORM_FIELD = 7;
      }

      // Required. Index into the
      // [Document.pages][google.cloud.documentai.v1beta3.Document.pages]
      // element, for example using
      // `[Document.pages][page_refs.page]` to locate the related page element.
      // This field is skipped when its value is the default `0`. See
      // https://developers.google.com/protocol-buffers/docs/proto3#json.
      int64 page = 1 [(google.api.field_behavior) = REQUIRED];

      // Optional. The type of the layout element that is being referenced if
      // any.
      LayoutType layout_type = 2 [(google.api.field_behavior) = OPTIONAL];

      // Optional. Deprecated.  Use
      // [PageRef.bounding_poly][google.cloud.documentai.v1beta3.Document.PageAnchor.PageRef.bounding_poly]
      // instead.
      string layout_id = 3
          [deprecated = true, (google.api.field_behavior) = OPTIONAL];

      // Optional. Identifies the bounding polygon of a layout element on the
      // page.
      BoundingPoly bounding_poly = 4 [(google.api.field_behavior) = OPTIONAL];

      // Optional. Confidence of detected page element, if applicable. Range
      // `[0, 1]`.
      float confidence = 5 [(google.api.field_behavior) = OPTIONAL];
    }

    // One or more references to visual page elements.
    repeated PageRef page_refs = 1;
  }

  // Structure to identify provenance relationships between annotations in
  // different revisions.
  message Provenance {
    // The parent element the current element is based on. Used for
    // referencing/aligning, removal and replacement operations.
    message Parent {
      // The index into the current revision's `parent_ids` list.
      int32 revision = 1;

      // The index of the parent item in the corresponding item list (e.g. list
      // of entities, properties within entities, etc.) in the parent revision.
      int32 index = 3;

      // The id of the parent provenance.
      int32 id = 2 [deprecated = true];
    }

    // If a processor or agent does an explicit operation on existing elements.
    enum OperationType {
      // Operation type unspecified. If no operation is specified a provenance
      // entry is simply used to match against a `parent`.
      OPERATION_TYPE_UNSPECIFIED = 0;

      // Add an element.
      ADD = 1;

      // Remove an element identified by `parent`.
      REMOVE = 2;

      // Updates any fields within the given provenance scope of the message. It
      // overwrites the fields rather than replacing them.  Use this when you
      // want to update a field value of an entity without also updating all the
      // child properties.
      UPDATE = 7;

      // Currently unused. Replace an element identified by `parent`.
      REPLACE = 3;

      // Deprecated. Request human review for the element identified by
      // `parent`.
      EVAL_REQUESTED = 4 [deprecated = true];

      // Deprecated. Element is reviewed and approved at human review,
      // confidence will be set to 1.0.
      EVAL_APPROVED = 5 [deprecated = true];

      // Deprecated. Element is skipped in the validation process.
      EVAL_SKIPPED = 6 [deprecated = true];
    }

    // The index of the revision that produced this element.
    int32 revision = 1 [deprecated = true];

    // The Id of this operation.  Needs to be unique within the scope of the
    // revision.
    int32 id = 2 [deprecated = true];

    // References to the original elements that are replaced.
    repeated Parent parents = 3;

    // The type of provenance operation.
    OperationType type = 4;
  }

  // Contains past or forward revisions of this document.
  message Revision {
    // Human Review information of the document.
    message HumanReview {
      // Human review state. e.g. `requested`, `succeeded`, `rejected`.
      string state = 1;

      // A message providing more details about the current state of processing.
      // For example, the rejection reason when the state is `rejected`.
      string state_message = 2;
    }

    // Who/what made the change.
    oneof source {
      // If the change was made by a person specify the name or id of that
      // person.
      string agent = 4;

      // If the annotation was made by processor identify the processor by its
      // resource name.
      string processor = 5;
    }

    // Id of the revision, internally generated by doc proto storage.
    // Unique within the context of the document.
    string id = 1;

    // The revisions that this revision is based on.  This can include one or
    // more parents (when documents are merged). This field represents the
    // index into the `revisions` field.
    repeated int32 parent = 2 [deprecated = true];

    // The revisions that this revision is based on. Must include all the ids
    // that have anything to do with this revision - e.g. there are
    // `provenance.parent.revision` fields that index into this field.
    repeated string parent_ids = 7;

    // The time that the revision was created, internally generated by
    // doc proto storage at the time of create.
    google.protobuf.Timestamp create_time = 3;

    // Human Review information of this revision.
    HumanReview human_review = 6;
  }

  // This message is used for text changes aka. OCR corrections.
  message TextChange {
    // Provenance of the correction.
    // Text anchor indexing into the
    // [Document.text][google.cloud.documentai.v1beta3.Document.text].  There
    // can only be a single `TextAnchor.text_segments` element.  If the start
    // and end index of the text segment are the same, the text change is
    // inserted before that index.
    TextAnchor text_anchor = 1;

    // The text that replaces the text identified in the `text_anchor`.
    string changed_text = 2;

    // The history of this annotation.
    repeated Provenance provenance = 3 [deprecated = true];
  }

  // Original source document from the user.
  oneof source {
    // Optional. Currently supports Google Cloud Storage URI of the form
    // `gs://bucket_name/object_name`. Object versioning is not supported.
    // For more information, refer to [Google Cloud Storage Request
    // URIs](https://cloud.google.com/storage/docs/reference-uris).
    string uri = 1 [(google.api.field_behavior) = OPTIONAL];

    // Optional. Inline document content, represented as a stream of bytes.
    // Note: As with all `bytes` fields, protocol buffers use a pure binary
    // representation, whereas JSON representations use base64.
    bytes content = 2 [(google.api.field_behavior) = OPTIONAL];
  }

  // An IANA published [media type (MIME
  // type)](https://www.iana.org/assignments/media-types/media-types.xhtml).
  string mime_type = 3;

  // Optional. UTF-8 encoded text in reading order from the document.
  string text = 4 [(google.api.field_behavior) = OPTIONAL];

  // Styles for the
  // [Document.text][google.cloud.documentai.v1beta3.Document.text].
  repeated Style text_styles = 5 [deprecated = true];

  // Visual page layout for the
  // [Document][google.cloud.documentai.v1beta3.Document].
  repeated Page pages = 6;

  // A list of entities detected on
  // [Document.text][google.cloud.documentai.v1beta3.Document.text]. For
  // document shards, entities in this list may cross shard boundaries.
  repeated Entity entities = 7;

  // Placeholder.  Relationship among
  // [Document.entities][google.cloud.documentai.v1beta3.Document.entities].
  repeated EntityRelation entity_relations = 8;

  // Placeholder.  A list of text corrections made to
  // [Document.text][google.cloud.documentai.v1beta3.Document.text].  This is
  // usually used for annotating corrections to OCR mistakes.  Text changes for
  // a given revision may not overlap with each other.
  repeated TextChange text_changes = 14;

  // Information about the sharding if this document is a sharded part of a
  // larger document. If the document is not sharded, this message is not
  // specified.
  ShardInfo shard_info = 9;

  // Any error that occurred while processing this document.
  google.rpc.Status error = 10;

  // Placeholder. Revision history of this document.
  repeated Revision revisions = 13;
}