diff --git a/docs/en/guides/triton-inference-server.md b/docs/en/guides/triton-inference-server.md
index 7395ccef1..09f7516b1 100644
--- a/docs/en/guides/triton-inference-server.md
+++ b/docs/en/guides/triton-inference-server.md
@@ -80,6 +80,33 @@ The Triton Model Repository is a storage location where Triton can access and lo
 
     # Create config file
     (triton_model_path / "config.pbtxt").touch()
+
+    # (Optional) Enable TensorRT for GPU inference
+    # First run will be slow due to TensorRT engine conversion
+    data = """
+    optimization {
+      execution_accelerators {
+        gpu_execution_accelerator {
+          name: "tensorrt"
+          parameters {
+            key: "precision_mode"
+            value: "FP16"
+          }
+          parameters {
+            key: "max_workspace_size_bytes"
+            value: "3221225472"
+          }
+          parameters {
+            key: "trt_engine_cache_enable"
+            value: "1"
+          }
+        }
+      }
+    }
+    """
+
+    with open(triton_model_path / "config.pbtxt", "w") as f:
+        f.write(data)
     ```
 
 ## Running Triton Inference Server
@@ -94,7 +121,7 @@ import time
 from tritonclient.http import InferenceServerClient
 
 # Define image https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
-tag = "nvcr.io/nvidia/tritonserver:23.09-py3"  # 6.4 GB
+tag = "nvcr.io/nvidia/tritonserver:24.09-py3"  # 8.57 GB
 
 # Pull the image
 subprocess.call(f"docker pull {tag}", shell=True)
@@ -187,7 +214,7 @@ Setting up [Ultralytics YOLO11](https://docs.ultralytics.com/models/yolov8/) wit
     from tritonclient.http import InferenceServerClient
 
     # Define image https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver
-    tag = "nvcr.io/nvidia/tritonserver:23.09-py3"
+    tag = "nvcr.io/nvidia/tritonserver:24.09-py3"
 
     subprocess.call(f"docker pull {tag}", shell=True)
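Once the server is running with the updated `config.pbtxt`, the TensorRT accelerator settings can be confirmed by querying the loaded model configuration over the Triton HTTP client. A minimal sketch, assuming the server is reachable on `localhost:8000` and the model is registered under the name `yolo` as in the guide:

```python
from tritonclient.http import InferenceServerClient

# Connect to the local Triton HTTP endpoint (8000 is Triton's default HTTP port)
client = InferenceServerClient(url="localhost:8000", verbose=False, ssl=False)

# "yolo" is the model name used in the guide; adjust if yours differs
assert client.is_model_ready("yolo"), "Model is not loaded"

# The returned dict mirrors config.pbtxt; inspect the optimization section
# to verify the TensorRT execution accelerator settings took effect
config = client.get_model_config("yolo")
print(config.get("optimization", {}))
```

Note that the first request after enabling TensorRT is slower while the engine is built; the `trt_engine_cache_enable` parameter avoids repeating that conversion on subsequent runs.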