|
|
|
@ -20,6 +20,9 @@ import "google/api/annotations.proto"; |
|
|
|
|
import "google/api/client.proto"; |
|
|
|
|
import "google/api/field_behavior.proto"; |
|
|
|
|
import "google/api/resource.proto"; |
|
|
|
|
import "google/cloud/aiplatform/v1/io.proto"; |
|
|
|
|
import "google/cloud/aiplatform/v1/operation.proto"; |
|
|
|
|
import "google/longrunning/operations.proto"; |
|
|
|
|
|
|
|
|
|
option csharp_namespace = "Google.Cloud.AIPlatform.V1"; |
|
|
|
|
option go_package = "cloud.google.com/go/aiplatform/apiv1/aiplatformpb;aiplatformpb"; |
|
|
|
@ -43,6 +46,19 @@ service EvaluationService { |
|
|
|
|
body: "*" |
|
|
|
|
}; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Evaluates a dataset based on a set of given metrics.
//
// The call returns a long-running operation; its response type is
// EvaluateDatasetResponse and its metadata type is
// EvaluateDatasetOperationMetadata.
rpc EvaluateDataset(EvaluateDatasetRequest)
    returns (google.longrunning.Operation) {
  // Types carried by the returned long-running operation.
  option (google.longrunning.operation_info) = {
    response_type: "EvaluateDatasetResponse"
    metadata_type: "EvaluateDatasetOperationMetadata"
  };
  // HTTP binding: POST on the location resource, whole request as the body.
  option (google.api.http) = {
    post: "/v1/{location=projects/*/locations/*}:evaluateDataset"
    body: "*"
  };
}
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Pairwise prediction autorater preference. |
|
|
|
@ -60,6 +76,161 @@ enum PairwiseChoice { |
|
|
|
|
TIE = 3; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Metadata reported by the long-running operation of
// EvaluationService.EvaluateDataset.
message EvaluateDatasetOperationMetadata {
  // Generic metadata of the operation.
  GenericOperationMetadata generic_metadata = 1;
}
|
|
|
|
|
|
|
|
|
// Response type of the long-running operation started by
// EvaluationService.EvaluateDataset.
message EvaluateDatasetResponse {
  // Output only. Information about the output produced by
  // EvaluationService.EvaluateDataset.
  OutputInfo output_info = 3 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Information about the output of EvaluationService.EvaluateDataset.
message OutputInfo {
  // Location that the evaluation output is written to.
  oneof output_location {
    // Output only. Full path of the Cloud Storage directory that was created
    // and into which the evaluation results and aggregation results are
    // written.
    string gcs_output_directory = 1 [
      (google.api.field_behavior) = OUTPUT_ONLY
    ];
  }
}
|
|
|
|
|
|
|
|
|
// Request message for EvaluationService.EvaluateDataset.
message EvaluateDatasetRequest {
  // Required. Resource name of the Location in which to evaluate the
  // dataset.
  // Format: `projects/{project}/locations/{location}`
  string location = 1 [
    (google.api.resource_reference) = {
      type: "locations.googleapis.com/Location"
    },
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Dataset that the evaluation runs on.
  EvaluationDataset dataset = 2 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Metrics that the evaluation computes.
  repeated Metric metrics = 3 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Configuration of the evaluation output.
  OutputConfig output_config = 4 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. Autorater configuration used for the evaluation.
  AutoraterConfig autorater_config = 5 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
|
|
|
|
|
|
|
|
|
// Configuration of the evaluation output.
message OutputConfig {
  // Destination of the evaluation output.
  oneof destination {
    // Cloud Storage destination that receives the evaluation output.
    GcsDestination gcs_destination = 1;
  }
}
|
|
|
|
|
|
|
|
|
// A metric used for dataset-level evaluation: one metric spec plus the
// aggregations to apply.
message Metric {
  // Aggregation metrics that EvaluationService.EvaluateDataset supports.
  enum AggregationMetric {
    // No aggregation metric specified.
    AGGREGATION_METRIC_UNSPECIFIED = 0;

    // Aggregate using the average.
    AVERAGE = 1;

    // Aggregate using the mode.
    MODE = 2;

    // Aggregate using the standard deviation.
    STANDARD_DEVIATION = 3;

    // Aggregate using the variance.
    VARIANCE = 4;

    // Aggregate using the minimum.
    MINIMUM = 5;

    // Aggregate using the maximum.
    MAXIMUM = 6;

    // Aggregate using the median.
    MEDIAN = 7;

    // Aggregate using the 90th percentile.
    PERCENTILE_P90 = 8;

    // Aggregate using the 95th percentile.
    PERCENTILE_P95 = 9;

    // Aggregate using the 99th percentile.
    PERCENTILE_P99 = 10;
  }

  // Metric spec used for the evaluation; exactly one member may be set.
  oneof metric_spec {
    // Spec for a pointwise metric.
    PointwiseMetricSpec pointwise_metric_spec = 2;

    // Spec for a pairwise metric.
    PairwiseMetricSpec pairwise_metric_spec = 3;

    // Spec for the exact match metric.
    ExactMatchSpec exact_match_spec = 4;

    // Spec for the bleu metric.
    BleuSpec bleu_spec = 5;

    // Spec for the rouge metric.
    RougeSpec rouge_spec = 6;
  }

  // Optional. Aggregation metrics to use.
  repeated AggregationMetric aggregation_metrics = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
|
|
|
|
|
|
|
|
|
// Dataset that an evaluation runs on.
message EvaluationDataset {
  // Source of the dataset.
  oneof source {
    // Cloud Storage source that holds the dataset.
    GcsSource gcs_source = 1;

    // BigQuery source that holds the dataset.
    BigQuerySource bigquery_source = 2;
  }
}
|
|
|
|
|
|
|
|
|
// Autorater configuration. Applies to both EvaluateInstances and
// EvaluateDataset.
message AutoraterConfig {
  // Optional. Number of samples taken for each instance in the dataset.
  // Defaults to 4 when not specified. The minimum value is 1 and the maximum
  // value is 32.
  optional int32 sampling_count = 1 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Whether to flip the candidate and baseline responses.
  // Only applicable to the pairwise metric. If enabled, also provide
  // PairwiseMetricSpec.candidate_response_field_name and
  // PairwiseMetricSpec.baseline_response_field_name. When rendering
  // PairwiseMetricSpec.metric_prompt_template, the candidate and baseline
  // fields will be flipped for half of the samples to reduce bias.
  optional bool flip_enabled = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];

  // Optional. Fully qualified name of the publisher model or of the tuned
  // autorater endpoint to use.
  //
  // Publisher model format:
  // `projects/{project}/locations/{location}/publishers/*/models/*`
  //
  // Tuned model endpoint format:
  // `projects/{project}/locations/{location}/endpoints/{endpoint}`
  string autorater_model = 3 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
|
|
|
|
|
|
|
|
|
// Request message for EvaluationService.EvaluateInstances. |
|
|
|
|
message EvaluateInstancesRequest { |
|
|
|
|
// Instances and specs for evaluation. |
|
|
|
@ -146,6 +317,24 @@ message EvaluateInstancesRequest { |
|
|
|
|
|
|
|
|
|
// Input for Metricx metric. |
|
|
|
|
MetricxInput metricx_input = 32; |
|
|
|
|
|
|
|
|
|
// Input for trajectory exact match metric. |
|
|
|
|
TrajectoryExactMatchInput trajectory_exact_match_input = 33; |
|
|
|
|
|
|
|
|
|
// Input for trajectory in order match metric. |
|
|
|
|
TrajectoryInOrderMatchInput trajectory_in_order_match_input = 34; |
|
|
|
|
|
|
|
|
|
// Input for trajectory match any order metric. |
|
|
|
|
TrajectoryAnyOrderMatchInput trajectory_any_order_match_input = 35; |
|
|
|
|
|
|
|
|
|
// Input for trajectory precision metric. |
|
|
|
|
TrajectoryPrecisionInput trajectory_precision_input = 37; |
|
|
|
|
|
|
|
|
|
// Input for trajectory recall metric. |
|
|
|
|
TrajectoryRecallInput trajectory_recall_input = 38; |
|
|
|
|
|
|
|
|
|
// Input for trajectory single tool use metric. |
|
|
|
|
TrajectorySingleToolUseInput trajectory_single_tool_use_input = 39; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Required. The resource name of the Location to evaluate the instances. |
|
|
|
@ -156,6 +345,10 @@ message EvaluateInstancesRequest { |
|
|
|
|
type: "locations.googleapis.com/Location" |
|
|
|
|
} |
|
|
|
|
]; |
|
|
|
|
|
|
|
|
|
// Optional. Autorater config used for evaluation. |
|
|
|
|
AutoraterConfig autorater_config = 30 |
|
|
|
|
[(google.api.field_behavior) = OPTIONAL]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Response message for EvaluationService.EvaluateInstances. |
|
|
|
@ -249,6 +442,24 @@ message EvaluateInstancesResponse { |
|
|
|
|
|
|
|
|
|
// Result for Metricx metric. |
|
|
|
|
MetricxResult metricx_result = 30; |
|
|
|
|
|
|
|
|
|
// Result for trajectory exact match metric. |
|
|
|
|
TrajectoryExactMatchResults trajectory_exact_match_results = 31; |
|
|
|
|
|
|
|
|
|
// Result for trajectory in order match metric. |
|
|
|
|
TrajectoryInOrderMatchResults trajectory_in_order_match_results = 32; |
|
|
|
|
|
|
|
|
|
// Result for trajectory any order match metric. |
|
|
|
|
TrajectoryAnyOrderMatchResults trajectory_any_order_match_results = 33; |
|
|
|
|
|
|
|
|
|
// Result for trajectory precision metric. |
|
|
|
|
TrajectoryPrecisionResults trajectory_precision_results = 35; |
|
|
|
|
|
|
|
|
|
// Results for trajectory recall metric. |
|
|
|
|
TrajectoryRecallResults trajectory_recall_results = 36; |
|
|
|
|
|
|
|
|
|
// Results for trajectory single tool use metric. |
|
|
|
|
TrajectorySingleToolUseResults trajectory_single_tool_use_results = 37; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
@ -1010,6 +1221,10 @@ message PointwiseMetricSpec { |
|
|
|
|
// Required. Metric prompt template for pointwise metric. |
|
|
|
|
optional string metric_prompt_template = 1 |
|
|
|
|
[(google.api.field_behavior) = REQUIRED]; |
|
|
|
|
|
|
|
|
|
// Optional. System instructions for pointwise metric. |
|
|
|
|
optional string system_instruction = 2 |
|
|
|
|
[(google.api.field_behavior) = OPTIONAL]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Spec for pointwise metric result. |
|
|
|
@ -1047,6 +1262,18 @@ message PairwiseMetricSpec { |
|
|
|
|
// Required. Metric prompt template for pairwise metric. |
|
|
|
|
optional string metric_prompt_template = 1 |
|
|
|
|
[(google.api.field_behavior) = REQUIRED]; |
|
|
|
|
|
|
|
|
|
// Optional. The field name of the candidate response. |
|
|
|
|
string candidate_response_field_name = 2 |
|
|
|
|
[(google.api.field_behavior) = OPTIONAL]; |
|
|
|
|
|
|
|
|
|
// Optional. The field name of the baseline response. |
|
|
|
|
string baseline_response_field_name = 3 |
|
|
|
|
[(google.api.field_behavior) = OPTIONAL]; |
|
|
|
|
|
|
|
|
|
// Optional. System instructions for pairwise metric. |
|
|
|
|
optional string system_instruction = 4 |
|
|
|
|
[(google.api.field_behavior) = OPTIONAL]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Spec for pairwise metric result. |
|
|
|
@ -1315,3 +1542,256 @@ message MetricxResult { |
|
|
|
|
// Output only. MetricX score. Range depends on version. |
|
|
|
|
optional float score = 1 [(google.api.field_behavior) = OUTPUT_ONLY]; |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Input of the TrajectoryExactMatch metric: the metric spec together with the
// instances to evaluate.
message TrajectoryExactMatchInput {
  // Required. Spec for the TrajectoryExactMatch metric.
  TrajectoryExactMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Repeated TrajectoryExactMatch instances.
  repeated TrajectoryExactMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Spec for the TrajectoryExactMatch metric. The metric returns 1 if the tool
// calls in the reference trajectory exactly match the predicted trajectory,
// else 0. The spec currently has no fields.
message TrajectoryExactMatchSpec {
}
|
|
|
|
|
|
|
|
|
// A single TrajectoryExactMatch instance: a predicted trajectory and the
// reference trajectory it is compared with.
message TrajectoryExactMatchInstance {
  // Required. Predicted tool call trajectory.
  optional Trajectory predicted_trajectory = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Reference tool call trajectory.
  optional Trajectory reference_trajectory = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Results of the TrajectoryExactMatch metric.
message TrajectoryExactMatchResults {
  // Output only. TrajectoryExactMatch metric values.
  repeated TrajectoryExactMatchMetricValue
      trajectory_exact_match_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
|
|
|
|
|
|
|
|
|
// Value of the TrajectoryExactMatch metric for one instance.
message TrajectoryExactMatchMetricValue {
  // Output only. The TrajectoryExactMatch score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Input of the TrajectoryInOrderMatch metric: the metric spec together with
// the instances to evaluate.
message TrajectoryInOrderMatchInput {
  // Required. Spec for the TrajectoryInOrderMatch metric.
  TrajectoryInOrderMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Repeated TrajectoryInOrderMatch instances.
  repeated TrajectoryInOrderMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Spec for the TrajectoryInOrderMatch metric. The metric returns 1 if the
// tool calls in the reference trajectory appear in the predicted trajectory
// in the same order, else 0. The spec currently has no fields.
message TrajectoryInOrderMatchSpec {
}
|
|
|
|
|
|
|
|
|
// A single TrajectoryInOrderMatch instance: a predicted trajectory and the
// reference trajectory it is compared with.
message TrajectoryInOrderMatchInstance {
  // Required. Predicted tool call trajectory.
  optional Trajectory predicted_trajectory = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Reference tool call trajectory.
  optional Trajectory reference_trajectory = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Results of the TrajectoryInOrderMatch metric.
message TrajectoryInOrderMatchResults {
  // Output only. TrajectoryInOrderMatch metric values.
  repeated TrajectoryInOrderMatchMetricValue
      trajectory_in_order_match_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
|
|
|
|
|
|
|
|
|
// Value of the TrajectoryInOrderMatch metric for one instance.
message TrajectoryInOrderMatchMetricValue {
  // Output only. The TrajectoryInOrderMatch score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Input of the TrajectoryAnyOrderMatch metric: the metric spec together with
// the instances to evaluate.
message TrajectoryAnyOrderMatchInput {
  // Required. Spec for the TrajectoryAnyOrderMatch metric.
  TrajectoryAnyOrderMatchSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Repeated TrajectoryAnyOrderMatch instances.
  repeated TrajectoryAnyOrderMatchInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Spec for the TrajectoryAnyOrderMatch metric. The metric returns 1 if all
// tool calls in the reference trajectory appear in the predicted trajectory
// in any order, else 0. The spec currently has no fields.
message TrajectoryAnyOrderMatchSpec {
}
|
|
|
|
|
|
|
|
|
// A single TrajectoryAnyOrderMatch instance: a predicted trajectory and the
// reference trajectory it is compared with.
message TrajectoryAnyOrderMatchInstance {
  // Required. Predicted tool call trajectory.
  optional Trajectory predicted_trajectory = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Reference tool call trajectory.
  optional Trajectory reference_trajectory = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Results of the TrajectoryAnyOrderMatch metric.
message TrajectoryAnyOrderMatchResults {
  // Output only. TrajectoryAnyOrderMatch metric values.
  repeated TrajectoryAnyOrderMatchMetricValue
      trajectory_any_order_match_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
|
|
|
|
|
|
|
|
|
// Value of the TrajectoryAnyOrderMatch metric for one instance.
message TrajectoryAnyOrderMatchMetricValue {
  // Output only. The TrajectoryAnyOrderMatch score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Input of the TrajectoryPrecision metric: the metric spec together with the
// instances to evaluate.
message TrajectoryPrecisionInput {
  // Required. Spec for the TrajectoryPrecision metric.
  TrajectoryPrecisionSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Repeated TrajectoryPrecision instances.
  repeated TrajectoryPrecisionInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Spec for the TrajectoryPrecision metric. The metric returns a float score
// based on the average precision of individual tool calls. The spec
// currently has no fields.
message TrajectoryPrecisionSpec {
}
|
|
|
|
|
|
|
|
|
// A single TrajectoryPrecision instance: a predicted trajectory and the
// reference trajectory it is compared with.
message TrajectoryPrecisionInstance {
  // Required. Predicted tool call trajectory.
  optional Trajectory predicted_trajectory = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Reference tool call trajectory.
  optional Trajectory reference_trajectory = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Results of the TrajectoryPrecision metric.
message TrajectoryPrecisionResults {
  // Output only. TrajectoryPrecision metric values.
  repeated TrajectoryPrecisionMetricValue
      trajectory_precision_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
|
|
|
|
|
|
|
|
|
// Value of the TrajectoryPrecision metric for one instance.
message TrajectoryPrecisionMetricValue {
  // Output only. The TrajectoryPrecision score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Input of the TrajectoryRecall metric: the metric spec together with the
// instances to evaluate.
message TrajectoryRecallInput {
  // Required. Spec for the TrajectoryRecall metric.
  TrajectoryRecallSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Repeated TrajectoryRecall instances.
  repeated TrajectoryRecallInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Spec for the TrajectoryRecall metric. The metric returns a float score
// based on the average recall of individual tool calls. The spec currently
// has no fields.
message TrajectoryRecallSpec {
}
|
|
|
|
|
|
|
|
|
// A single TrajectoryRecall instance: a predicted trajectory and the
// reference trajectory it is compared with.
message TrajectoryRecallInstance {
  // Required. Predicted tool call trajectory.
  optional Trajectory predicted_trajectory = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Reference tool call trajectory.
  optional Trajectory reference_trajectory = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Results of the TrajectoryRecall metric.
message TrajectoryRecallResults {
  // Output only. TrajectoryRecall metric values.
  repeated TrajectoryRecallMetricValue trajectory_recall_metric_values = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Value of the TrajectoryRecall metric for one instance.
message TrajectoryRecallMetricValue {
  // Output only. The TrajectoryRecall score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// Input of the TrajectorySingleToolUse metric: the metric spec together with
// the instances to evaluate.
message TrajectorySingleToolUseInput {
  // Required. Spec for the TrajectorySingleToolUse metric.
  TrajectorySingleToolUseSpec metric_spec = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Required. Repeated TrajectorySingleToolUse instances.
  repeated TrajectorySingleToolUseInstance instances = 2 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Spec for the TrajectorySingleToolUse metric. The metric returns 1 if the
// tool is present in the predicted trajectory, else 0.
message TrajectorySingleToolUseSpec {
  // Required. Name of the tool to check for in the predicted trajectory.
  optional string tool_name = 1 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// A single TrajectorySingleToolUse instance; it carries only the predicted
// trajectory.
message TrajectorySingleToolUseInstance {
  // Required. Predicted tool call trajectory.
  optional Trajectory predicted_trajectory = 1 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// Results of the TrajectorySingleToolUse metric.
message TrajectorySingleToolUseResults {
  // Output only. TrajectorySingleToolUse metric values.
  repeated TrajectorySingleToolUseMetricValue
      trajectory_single_tool_use_metric_values = 1 [
        (google.api.field_behavior) = OUTPUT_ONLY
      ];
}
|
|
|
|
|
|
|
|
|
// Value of the TrajectorySingleToolUse metric for one instance.
message TrajectorySingleToolUseMetricValue {
  // Output only. The TrajectorySingleToolUse score.
  optional float score = 1 [
    (google.api.field_behavior) = OUTPUT_ONLY
  ];
}
|
|
|
|
|
|
|
|
|
// A trajectory, expressed as a list of tool calls.
message Trajectory {
  // Required. Tool calls that make up the trajectory.
  repeated ToolCall tool_calls = 1 [
    (google.api.field_behavior) = REQUIRED
  ];
}
|
|
|
|
|
|
|
|
|
// A single tool call: the tool name and, optionally, the tool input.
message ToolCall {
  // Required. Name of the tool.
  optional string tool_name = 1 [
    (google.api.field_behavior) = REQUIRED
  ];

  // Optional. Input of the tool.
  optional string tool_input = 2 [
    (google.api.field_behavior) = OPTIONAL
  ];
}
|
|
|
|