// Source: googleapis/google/cloud/documentai/v1beta3/document_io.proto

// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.cloud.documentai.v1beta3;

import "google/protobuf/field_mask.proto";

option csharp_namespace = "Google.Cloud.DocumentAI.V1Beta3";
option go_package = "cloud.google.com/go/documentai/apiv1beta3/documentaipb;documentaipb";
option java_multiple_files = true;
option java_outer_classname = "DocumentIoProto";
option java_package = "com.google.cloud.documentai.v1beta3";
option php_namespace = "Google\\Cloud\\DocumentAI\\V1beta3";
option ruby_package = "Google::Cloud::DocumentAI::V1beta3";

// Payload message of raw document content (bytes).
//
// Carries the document inline in the message itself, together with the MIME
// type that describes how the bytes should be interpreted.
message RawDocument {
  // Inline document content, as raw bytes.
  bytes content = 1;

  // An IANA MIME type (RFC6838) indicating the nature and format of the
  // [content][google.cloud.documentai.v1beta3.RawDocument.content].
  string mime_type = 2;
}

// Specifies a document stored on Cloud Storage.
//
// Identifies a single Cloud Storage object by its URI and states the MIME
// type of that object's content.
message GcsDocument {
  // The Cloud Storage object URI.
  // NOTE(review): presumably of the form `gs://bucket/object` -- confirm
  // against the service documentation.
  string gcs_uri = 1;

  // An IANA MIME type (RFC6838) of the content.
  string mime_type = 2;
}

// Specifies a set of documents on Cloud Storage.
//
// Each document is listed explicitly, as opposed to
// [GcsPrefix][google.cloud.documentai.v1beta3.GcsPrefix], which selects
// documents by a common URI prefix.
message GcsDocuments {
  // The list of documents, each described by a
  // [GcsDocument][google.cloud.documentai.v1beta3.GcsDocument].
  repeated GcsDocument documents = 1;
}

// Specifies all documents on Cloud Storage with a common prefix.
message GcsPrefix {
  // The URI prefix shared by all of the documents to select.
  string gcs_uri_prefix = 1;
}

// The common config to specify a set of documents used as input.
//
// The input is described by exactly one kind of source: either a Cloud
// Storage prefix or an explicit list of Cloud Storage documents.
message BatchDocumentsInputConfig {
  // The source of the input documents. The members are mutually exclusive:
  // at most one of them is set at a time.
  oneof source {
    // The set of documents that match the specified Cloud Storage `gcs_prefix`.
    GcsPrefix gcs_prefix = 1;

    // The set of documents individually specified on Cloud Storage.
    GcsDocuments gcs_documents = 2;
  }
}

// Config that controls the output of documents. All documents will be written
// as a JSON file.
message DocumentOutputConfig {
  // The configuration used when outputting documents to Cloud Storage.
  message GcsOutputConfig {
    // The sharding config for the output document. Describes how the pages of
    // an output document are divided into shards.
    message ShardingConfig {
      // The number of pages per shard.
      int32 pages_per_shard = 1;

      // The number of overlapping pages between consecutive shards.
      int32 pages_overlap = 2;
    }

    // The Cloud Storage URI (a directory) of the output.
    string gcs_uri = 1;

    // Specifies which fields to include in the output documents.
    // Only top-level document fields and fields of `pages` are supported, so
    // each path must be in the form of `{document_field_name}` or
    // `pages.{page_field_name}`.
    google.protobuf.FieldMask field_mask = 2;

    // Specifies the sharding config for the output document.
    ShardingConfig sharding_config = 3;
  }

  // The destination of the results. Cloud Storage is the only destination
  // declared in this oneof.
  oneof destination {
    // Output config to write the results to Cloud Storage.
    GcsOutputConfig gcs_output_config = 1;
  }
}

// Config for Document OCR.
//
// Groups the optional switches and hints that adjust OCR behavior. Every
// boolean option is off unless explicitly set to true (proto3 default).
message OcrConfig {
  // Hints for the OCR engine.
  message Hints {
    // List of BCP-47 language codes to use for OCR. In most cases, not
    // specifying it yields the best results since it enables automatic language
    // detection. For languages based on the Latin alphabet, setting hints is
    // not needed. In rare cases, when the language of the text in the
    // image is known, setting a hint will help get better results (although it
    // will be a significant hindrance if the hint is wrong).
    repeated string language_hints = 1;
  }

  // NOTE(review): field numbers 1 and 7 are not used by any field visible in
  // this message. Confirm whether they were previously assigned and should be
  // declared `reserved`.

  // Hints for the OCR model.
  Hints hints = 2;

  // Enables special handling for PDFs with existing text information. Results
  // in better text extraction quality in such PDF inputs.
  bool enable_native_pdf_parsing = 3;

  // Enables intelligent document quality scores after OCR. Can help with
  // diagnosing why OCR responses are of poor quality for a given input.
  // Adds additional latency comparable to regular OCR to the process call.
  bool enable_image_quality_scores = 4;

  // A list of advanced OCR options to further fine-tune OCR behavior. Current
  // valid values are:
  //
  // - `legacy_layout`: a heuristics layout detection algorithm, which serves as
  // an alternative to the current ML-based layout detection algorithm.
  // Customers can choose the best suitable layout algorithm based on their
  // situation.
  repeated string advanced_ocr_options = 5;

  // Includes symbol-level OCR information if set to true.
  bool enable_symbol = 6;

  // Turns on the font id model and returns font style information.
  bool compute_style_info = 8;
}