Merge branch 'main' into exp

exp-a
Laughing-q 8 months ago
commit 4ccd40b944
Files changed (number of changed lines in parentheses):
  1. docker/Dockerfile (2)
  2. docker/Dockerfile-jetson (4)
  3. docs/en/datasets/detect/african-wildlife.md (3)
  4. docs/en/datasets/detect/brain-tumor.md (3)
  5. docs/en/datasets/detect/index.md (6)
  6. docs/en/datasets/detect/lvis.md (98)
  7. docs/en/datasets/index.md (1)
  8. docs/en/datasets/segment/carparts-seg.md (11)
  9. docs/en/guides/heatmaps.md (96)
  10. docs/en/guides/index.md (2)
  11. docs/en/guides/nvidia-jetson.md (256)
  12. docs/en/guides/object-counting.md (45)
  13. docs/en/guides/queue-management.md (152)
  14. docs/en/guides/workouts-monitoring.md (12)
  15. docs/en/help/CI.md (14)
  16. docs/en/integrations/edge-tpu.md (10)
  17. docs/en/integrations/index.md (6)
  18. docs/en/integrations/paddlepaddle.md (10)
  19. docs/en/integrations/tf-graphdef.md (16)
  20. docs/en/integrations/tf-savedmodel.md (3)
  21. docs/en/integrations/tfjs.md (118)
  22. docs/en/models/fast-sam.md (2)
  23. docs/en/models/yolo-world.md (106)
  24. docs/en/models/yolov9.md (48)
  25. docs/en/modes/predict.md (2)
  26. docs/en/modes/train.md (37)
  27. docs/en/reference/data/augment.md (4)
  28. docs/en/reference/data/build.md (4)
  29. docs/en/reference/data/dataset.md (10)
  30. docs/en/reference/data/utils.md (8)
  31. docs/en/reference/models/yolo/world/train.md (15)
  32. docs/en/reference/models/yolo/world/train_world.md (11)
  33. docs/en/reference/solutions/queue_management.md (16)
  34. docs/en/reference/utils/torch_utils.md (4)
  35. docs/en/usage/cfg.md (37)
  36. docs/en/usage/simple-utilities.md (30)
  37. docs/mkdocs_github_authors.yaml (2)
  38. examples/YOLOv8-ONNXRuntime-CPP/inference.cpp (14)
  39. examples/YOLOv8-ONNXRuntime-CPP/inference.h (1)
  40. mkdocs.yml (8)
  41. pyproject.toml (3)
  42. tests/test_engine.py (3)
  43. tests/test_integrations.py (1)
  44. tests/test_python.py (59)
  45. ultralytics/__init__.py (5)
  46. ultralytics/cfg/__init__.py (6)
  47. ultralytics/cfg/datasets/lvis.yaml (1239)
  48. ultralytics/cfg/default.yaml (4)
  49. ultralytics/cfg/models/v9/yolov9c-seg.yaml (38)
  50. ultralytics/cfg/models/v9/yolov9c.yaml (6)
  51. ultralytics/cfg/models/v9/yolov9e-seg.yaml (61)
  52. ultralytics/cfg/models/v9/yolov9e.yaml (9)
  53. ultralytics/data/__init__.py (15)
  54. ultralytics/data/augment.py (150)
  55. ultralytics/data/base.py (42)
  56. ultralytics/data/build.py (28)
  57. ultralytics/data/converter.py (37)
  58. ultralytics/data/dataset.py (258)
  59. ultralytics/data/explorer/explorer.py (21)
  60. ultralytics/data/explorer/gui/dash.py (6)
  61. ultralytics/data/explorer/utils.py (5)
  62. ultralytics/data/loaders.py (23)
  63. ultralytics/data/split_dota.py (2)
  64. ultralytics/data/utils.py (41)
  65. ultralytics/engine/exporter.py (11)
  66. ultralytics/engine/model.py (15)
  67. ultralytics/engine/results.py (4)
  68. ultralytics/engine/trainer.py (162)
  69. ultralytics/engine/tuner.py (2)
  70. ultralytics/engine/validator.py (4)
  71. ultralytics/hub/auth.py (6)
  72. ultralytics/hub/session.py (6)
  73. ultralytics/hub/utils.py (14)
  74. ultralytics/models/fastsam/model.py (2)
  75. ultralytics/models/fastsam/prompt.py (14)
  76. ultralytics/models/nas/model.py (2)
  77. ultralytics/models/rtdetr/val.py (2)
  78. ultralytics/models/sam/model.py (2)
  79. ultralytics/models/sam/modules/tiny_encoder.py (6)
  80. ultralytics/models/sam/modules/transformer.py (2)
  81. ultralytics/models/sam/predict.py (29)
  82. ultralytics/models/yolo/__init__.py (4)
  83. ultralytics/models/yolo/classify/train.py (5)
  84. ultralytics/models/yolo/detect/train.py (2)
  85. ultralytics/models/yolo/detect/val.py (55)
  86. ultralytics/models/yolo/model.py (3)
  87. ultralytics/models/yolo/obb/val.py (2)
  88. ultralytics/models/yolo/world/__init__.py (5)
  89. ultralytics/models/yolo/world/train.py (92)
  90. ultralytics/models/yolo/world/train_world.py (108)
  91. ultralytics/nn/autobackend.py (12)
  92. ultralytics/nn/modules/__init__.py (16)
  93. ultralytics/nn/modules/block.py (6)
  94. ultralytics/nn/modules/conv.py (2)
  95. ultralytics/nn/modules/head.py (19)
  96. ultralytics/nn/tasks.py (56)
  97. ultralytics/solutions/ai_gym.py (2)
  98. ultralytics/solutions/heatmap.py (129)
  99. ultralytics/solutions/object_counter.py (141)
  100. ultralytics/solutions/queue_management.py (187)
Some files were not shown because too many files have changed in this diff.

@ -3,7 +3,7 @@
# Image is CUDA-optimized for YOLOv8 single/multi-GPU training and inference
# Start FROM PyTorch image https://hub.docker.com/r/pytorch/pytorch or nvcr.io/nvidia/pytorch:23.03-py3
FROM pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime
FROM pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime
RUN pip install --no-cache nvidia-tensorrt --index-url https://pypi.ngc.nvidia.com
# Downloads to user config dir

@ -28,8 +28,8 @@ RUN grep -v "opencv-python" pyproject.toml > temp.toml && mv temp.toml pyproject
# Install pip packages manually for TensorRT compatibility https://github.com/NVIDIA/TensorRT/issues/2567
RUN python3 -m pip install --upgrade pip wheel
RUN pip install --no-cache tqdm matplotlib pyyaml psutil pandas onnx "numpy==1.23"
RUN pip install --no-cache -e .
RUN pip install --no-cache tqdm matplotlib pyyaml psutil pandas onnx
RUN pip install --no-cache -e ".[export]"
# Set environment variables
ENV OMP_NUM_THREADS=1

@ -75,7 +75,6 @@ To train a YOLOv8n model on the African wildlife dataset for 100 epochs with an
# Start prediction with a finetuned *.pt model
yolo detect predict model='path/to/best.pt' imgsz=640 source="https://ultralytics.com/assets/african-wildlife-sample.jpg"
```
## Sample Images and Annotations
@ -89,4 +88,4 @@ This example illustrates the variety and complexity of images in the African wil
## Citations and Acknowledgments
The dataset has been released under the [AGPL-3.0 License](https://github.com/ultralytics/ultralytics/blob/main/LICENSE).
The dataset has been released under the [AGPL-3.0 License](https://github.com/ultralytics/ultralytics/blob/main/LICENSE).

@ -74,7 +74,6 @@ To train a YOLOv8n model on the brain tumor dataset for 100 epochs with an image
# Start prediction with a finetuned *.pt model
yolo detect predict model='path/to/best.pt' imgsz=640 source="https://ultralytics.com/assets/brain-tumor-sample.jpg"
```
## Sample Images and Annotations
@ -88,4 +87,4 @@ This example highlights the diversity and intricacy of images within the brain t
## Citations and Acknowledgments
The dataset has been released under the [AGPL-3.0 License](https://github.com/ultralytics/ultralytics/blob/main/LICENSE).
The dataset has been released under the [AGPL-3.0 License](https://github.com/ultralytics/ultralytics/blob/main/LICENSE).

@ -40,9 +40,10 @@ The label file corresponding to the above image contains 2 persons (class `0`) a
<p align="center"><img width="428" src="https://user-images.githubusercontent.com/26833433/112467037-d2568c00-8d66-11eb-8796-55402ac0d62f.png" alt="Example label file"></p>
When using the Ultralytics YOLO format, organize your training and validation images and labels as shown in the example below.
When using the Ultralytics YOLO format, organize your training and validation images and labels as shown in the [COCO8 dataset](coco8.md) example below.
<p align="center"><img width="800" src="https://github.com/IvorZhu331/ultralytics/assets/26833433/a55ec82d-2bb5-40f9-ac5c-f935e7eb9f07" alt="Example dataset directory structure"></p>
<p align="center"><img width="700" src="https://user-images.githubusercontent.com/26833433/134436012-65111ad1-9541-4853-81a6-f19a3468b75f.png" alt="Example dataset directory structure"></p>
## Usage
@ -74,6 +75,7 @@ Here is a list of the supported datasets and a brief description for each:
- [**Argoverse**](argoverse.md): A collection of sensor data collected from autonomous vehicles. It contains 3D tracking annotations for car objects.
- [**COCO**](coco.md): Common Objects in Context (COCO) is a large-scale object detection, segmentation, and captioning dataset with 80 object categories.
- [**LVIS**](lvis.md): LVIS is a large-scale object detection, segmentation, and captioning dataset with 1203 object categories.
- [**COCO8**](coco8.md): A smaller subset of the COCO dataset, COCO8 is more lightweight and faster to train.
- [**GlobalWheat2020**](globalwheat2020.md): A dataset containing images of wheat heads for the Global Wheat Challenge 2020.
- [**Objects365**](objects365.md): A large-scale object detection dataset with 365 object categories and 600k images, aimed at advancing object detection research.

@ -0,0 +1,98 @@
---
comments: true
description: Learn how LVIS, a leading dataset for object detection and segmentation, integrates with Ultralytics. Discover ways to use it for training YOLO models.
keywords: Ultralytics, LVIS dataset, object detection, YOLO, YOLO model training, image segmentation, computer vision, deep learning models
---
# LVIS Dataset
The [LVIS dataset](https://www.lvisdataset.org/) is a large-scale dataset with fine-grained, vocabulary-level annotations developed and released by Facebook AI Research (FAIR). It is primarily used as a research benchmark for object detection and instance segmentation with a large vocabulary of categories, aiming to drive further advancements in the field of computer vision.
<p align="center">
<img width="640" src="https://github.com/ultralytics/ultralytics/assets/26833433/40230a80-e7bc-4310-a860-4cc0ef4bb02a" alt="LVIS Dataset example images">
</p>
## Key Features
- LVIS contains 160k images and 2M instance annotations for object detection, segmentation, and captioning tasks.
- The dataset comprises 1203 object categories, including common objects like cars, bicycles, and animals, as well as more specific categories such as umbrellas, handbags, and sports equipment.
- Annotations include object bounding boxes, segmentation masks, and captions for each image.
- LVIS provides standardized evaluation metrics like mean Average Precision (mAP) for object detection, and mean Average Recall (mAR) for segmentation tasks, making it suitable for comparing model performance.
- LVIS uses exactly the same images as the [COCO](./coco.md) dataset, but with different splits and different annotations.
## Dataset Structure
The LVIS dataset is split into four subsets:
1. **Train**: This subset contains 100k images for training object detection, segmentation, and captioning models.
2. **Val**: This subset has 20k images used for validation purposes during model training.
3. **Minival**: This subset is exactly the same as the COCO val2017 set, which has 5k images used for validation purposes during model training.
4. **Test**: This subset consists of 20k images used for testing and benchmarking the trained models. Ground truth annotations for this subset are not publicly available, and the results are submitted to the [LVIS evaluation server](https://eval.ai/web/challenges/challenge-page/675/overview) for performance evaluation.
## Applications
The LVIS dataset is widely used for training and evaluating deep learning models in object detection (such as YOLO, Faster R-CNN, and SSD) and instance segmentation (such as Mask R-CNN). The dataset's diverse set of object categories, large number of annotated images, and standardized evaluation metrics make it an essential resource for computer vision researchers and practitioners.
## Dataset YAML
A YAML (Yet Another Markup Language) file is used to define the dataset configuration. It contains information about the dataset's paths, classes, and other relevant information. In the case of the LVIS dataset, the `lvis.yaml` file is maintained at [https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/lvis.yaml](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/lvis.yaml).
!!! Example "ultralytics/cfg/datasets/lvis.yaml"
```yaml
--8<-- "ultralytics/cfg/datasets/lvis.yaml"
```
## Usage
To train a YOLOv8n model on the LVIS dataset for 100 epochs with an image size of 640, you can use the following code snippets. For a comprehensive list of available arguments, refer to the model [Training](../../modes/train.md) page.
!!! Example "Train Example"
=== "Python"
```python
from ultralytics import YOLO
# Load a model
model = YOLO('yolov8n.pt') # load a pretrained model (recommended for training)
# Train the model
results = model.train(data='lvis.yaml', epochs=100, imgsz=640)
```
=== "CLI"
```bash
# Start training from a pretrained *.pt model
yolo detect train data=lvis.yaml model=yolov8n.pt epochs=100 imgsz=640
```
## Sample Images and Annotations
The LVIS dataset contains a diverse set of images with various object categories and complex scenes. Here are some examples of images from the dataset, along with their corresponding annotations:
![LVIS Dataset sample image](https://github.com/ultralytics/ultralytics/assets/26833433/38cc033a-68b0-47f3-a5b8-4ef554362e40)
- **Mosaiced Image**: This image demonstrates a training batch composed of mosaiced dataset images. Mosaicing is a technique used during training that combines multiple images into a single image to increase the variety of objects and scenes within each training batch. This helps improve the model's ability to generalize to different object sizes, aspect ratios, and contexts.
The example showcases the variety and complexity of the images in the LVIS dataset and the benefits of using mosaicing during the training process.
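Mosaic is controlled by training hyperparameters rather than by the dataset itself. As a hedged illustration, assuming the standard Ultralytics `mosaic` and `close_mosaic` training arguments (not specific to LVIS), it might be tuned like this:
```python
from ultralytics import YOLO

# Load a pretrained model
model = YOLO('yolov8n.pt')

# `mosaic` sets the probability of the mosaic augmentation, while `close_mosaic`
# disables it for the final N epochs so training finishes on un-mosaiced images
# (values shown here are illustrative, not recommendations)
results = model.train(data='lvis.yaml', epochs=100, imgsz=640, mosaic=1.0, close_mosaic=10)
```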
## Citations and Acknowledgments
If you use the LVIS dataset in your research or development work, please cite the following paper:
!!! Quote ""
=== "BibTeX"
```bibtex
@inproceedings{gupta2019lvis,
title={{LVIS}: A Dataset for Large Vocabulary Instance Segmentation},
author={Gupta, Agrim and Dollar, Piotr and Girshick, Ross},
booktitle={Proceedings of the {IEEE} Conference on Computer Vision and Pattern Recognition},
year={2019}
}
```
We would like to acknowledge the LVIS Consortium for creating and maintaining this valuable resource for the computer vision community. For more information about the LVIS dataset and its creators, visit the [LVIS dataset website](https://www.lvisdataset.org/).

@ -36,6 +36,7 @@ Bounding box object detection is a computer vision technique that involves detec
- [Argoverse](detect/argoverse.md): A dataset containing 3D tracking and motion forecasting data from urban environments with rich annotations.
- [COCO](detect/coco.md): A large-scale dataset designed for object detection, segmentation, and captioning with over 200K labeled images.
- [LVIS](detect/lvis.md): A large-scale object detection, segmentation, and captioning dataset with 1203 object categories.
- [COCO8](detect/coco8.md): Contains the first 4 images from COCO train and COCO val, suitable for quick tests.
- [Global Wheat 2020](detect/globalwheat2020.md): A dataset of wheat head images collected from around the world for object detection and localization tasks.
- [Objects365](detect/objects365.md): A high-quality, large-scale dataset for object detection with 365 object categories and over 600K annotated images.

@ -10,6 +10,17 @@ The [Roboflow](https://roboflow.com/?ref=ultralytics) [Carparts Segmentation Dat
Whether you're working on automotive research, developing AI solutions for vehicle maintenance, or exploring computer vision applications, the Carparts Segmentation Dataset serves as a valuable resource for enhancing accuracy and efficiency in your projects.
<p align="center">
<br>
<iframe loading="lazy" width="720" height="405" src="https://www.youtube.com/embed/eHuzCNZeu0g"
title="YouTube video player" frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowfullscreen>
</iframe>
<br>
<strong>Watch:</strong> Carparts Instance Segmentation Using Ultralytics HUB
</p>
## Dataset Structure
The data distribution within the Carparts Segmentation Dataset is organized as outlined below:

@ -65,7 +65,8 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
imw=w,
imh=h,
view_img=True,
shape="circle")
shape="circle",
classes_names=model.names)
while cap.isOpened():
success, im0 = cap.read()
@ -110,7 +111,8 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
imh=h,
view_img=True,
shape="circle",
count_reg_pts=line_points)
count_reg_pts=line_points,
classes_names=model.names)
while cap.isOpened():
success, im0 = cap.read()
@ -126,6 +128,51 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
video_writer.release()
cv2.destroyAllWindows()
```
=== "Polygon Counting"
```python
from ultralytics import YOLO
from ultralytics.solutions import heatmap
import cv2

model = YOLO("yolov8n.pt")
cap = cv2.VideoCapture("path/to/video/file.mp4")
assert cap.isOpened(), "Error reading video file"
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, cv2.CAP_PROP_FRAME_HEIGHT, cv2.CAP_PROP_FPS))

# Video writer
video_writer = cv2.VideoWriter("heatmap_output.avi",
                               cv2.VideoWriter_fourcc(*'mp4v'),
                               fps,
                               (w, h))

# Define polygon points
region_points = [(20, 400), (1080, 404), (1080, 360), (20, 360), (20, 400)]

# Init heatmap
heatmap_obj = heatmap.Heatmap()
heatmap_obj.set_args(colormap=cv2.COLORMAP_PARULA,
                     imw=w,
                     imh=h,
                     view_img=True,
                     shape="circle",
                     count_reg_pts=region_points,
                     classes_names=model.names)

while cap.isOpened():
    success, im0 = cap.read()
    if not success:
        print("Video frame is empty or video processing has been successfully completed.")
        break
    tracks = model.track(im0, persist=True, show=False)
    im0 = heatmap_obj.generate_heatmap(im0, tracks)
    video_writer.write(im0)

cap.release()
video_writer.release()
cv2.destroyAllWindows()
```
=== "Region Counting"
@ -155,7 +202,8 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
imh=h,
view_img=True,
shape="circle",
count_reg_pts=region_points)
count_reg_pts=region_points,
classes_names=model.names)
while cap.isOpened():
success, im0 = cap.read()
@ -190,7 +238,8 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
imw=w,
imh=h,
view_img=True,
shape="circle")
shape="circle",
classes_names=model.names)
results = model.track(im0, persist=True)
im0 = heatmap_obj.generate_heatmap(im0, tracks=results)
@ -223,7 +272,8 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
imw=w,
imh=h,
view_img=True,
shape="circle")
shape="circle",
classes_names=model.names)
while cap.isOpened():
success, im0 = cap.read()
@ -243,22 +293,26 @@ A heatmap generated with [Ultralytics YOLOv8](https://github.com/ultralytics/ult
### Arguments `set_args`
| Name | Type | Default | Description |
|-----------------------|----------------|-------------------|-----------------------------------------------------------|
| `view_img` | `bool` | `False` | Display the frame with heatmap |
| `colormap` | `cv2.COLORMAP` | `None` | cv2.COLORMAP for heatmap |
| `imw` | `int` | `None` | Width of Heatmap |
| `imh` | `int` | `None` | Height of Heatmap |
| `heatmap_alpha` | `float` | `0.5` | Heatmap alpha value |
| `count_reg_pts` | `list` | `None` | Object counting region points |
| `count_txt_thickness` | `int` | `2` | Count values text size |
| `count_txt_color` | `RGB Color` | `(0, 0, 0)` | Foreground color for Object counts text |
| `count_color` | `RGB Color` | `(255, 255, 255)` | Background color for Object counts text |
| `count_reg_color` | `RGB Color` | `(255, 0, 255)` | Counting region color |
| `region_thickness` | `int` | `5` | Counting region thickness value |
| `decay_factor` | `float` | `0.99` | Decay factor for heatmap area removal after specific time |
| `shape` | `str` | `circle` | Heatmap shape for display "rect" or "circle" supported |
| `line_dist_thresh` | `int` | `15` | Euclidean Distance threshold for line counter |
| Name | Type | Default | Description |
|----------------------|----------------|---------------------|-----------------------------------------------------------|
| `view_img` | `bool` | `False` | Display the frame with heatmap |
| `colormap` | `cv2.COLORMAP` | `None` | cv2.COLORMAP for heatmap |
| `imw` | `int` | `None` | Width of Heatmap |
| `imh` | `int` | `None` | Height of Heatmap |
| `line_thickness` | `int` | `2` | Increase bounding boxes and count text thickness |
| `view_in_counts` | `bool` | `True` | Display in-counts only on video frame |
| `view_out_counts` | `bool` | `True` | Display out-counts only on video frame |
| `classes_names` | `dict` | `model.model.names` | Dictionary of Class Names |
| `heatmap_alpha` | `float` | `0.5` | Heatmap alpha value |
| `count_reg_pts` | `list` | `None` | Object counting region points |
| `count_txt_color` | `RGB Color` | `(0, 0, 0)` | Foreground color for Object counts text |
| `count_reg_color` | `RGB Color` | `(255, 0, 255)` | Counting region color |
| `region_thickness` | `int` | `5` | Counting region thickness value |
| `decay_factor` | `float` | `0.99` | Decay factor for heatmap area removal after specific time |
| `shape` | `str` | `circle` | Heatmap shape for display "rect" or "circle" supported |
| `line_dist_thresh` | `int` | `15` | Euclidean Distance threshold for line counter |
| `count_bg_color` | `RGB Color` | `(255, 255, 255)` | Count highlighter color |
| `cls_txtdisplay_gap` | `int` | `50` | Display gap between each class count |
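For orientation, here is a rough sketch of how the newly documented display arguments might be passed together. The `Heatmap` class and import path are assumed from the examples above, and the argument values are illustrative rather than definitive:
```python
import cv2
from ultralytics import YOLO
from ultralytics.solutions import heatmap

model = YOLO("yolov8n.pt")

heatmap_obj = heatmap.Heatmap()
heatmap_obj.set_args(colormap=cv2.COLORMAP_PARULA,
                     imw=1280,                        # heatmap width (illustrative)
                     imh=720,                         # heatmap height (illustrative)
                     view_img=True,
                     view_in_counts=True,             # show in-counts on the frame
                     view_out_counts=True,            # show out-counts on the frame
                     line_thickness=2,                # box and count text thickness
                     classes_names=model.names,       # class-name dictionary
                     count_bg_color=(255, 255, 255),  # count highlighter color
                     cls_txtdisplay_gap=50,           # vertical gap between per-class counts
                     shape="circle")
```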
### Arguments `model.track`

@ -35,6 +35,7 @@ Here's a compilation of in-depth guides to help you master different aspects of
- [Conda Quickstart](conda-quickstart.md) 🚀 NEW: Step-by-step guide to setting up a [Conda](https://anaconda.org/conda-forge/ultralytics) environment for Ultralytics. Learn how to install and start using the Ultralytics package efficiently with Conda.
- [Docker Quickstart](docker-quickstart.md) 🚀 NEW: Complete guide to setting up and using Ultralytics YOLO models with [Docker](https://hub.docker.com/r/ultralytics/ultralytics). Learn how to install Docker, manage GPU support, and run YOLO models in isolated containers for consistent development and deployment.
- [Raspberry Pi](raspberry-pi.md) 🚀 NEW: Quickstart tutorial to run YOLO models on the latest Raspberry Pi hardware.
- [NVIDIA Jetson](nvidia-jetson.md) 🚀 NEW: Quickstart guide for deploying YOLO models on NVIDIA Jetson devices.
- [Triton Inference Server Integration](triton-inference-server.md) 🚀 NEW: Dive into the integration of Ultralytics YOLOv8 with NVIDIA's Triton Inference Server for scalable and efficient deep learning inference deployments.
- [YOLO Thread-Safe Inference](yolo-thread-safe-inference.md) 🚀 NEW: Guidelines for performing inference with YOLO models in a thread-safe manner. Learn the importance of thread safety and best practices to prevent race conditions and ensure consistent predictions.
- [Isolating Segmentation Objects](isolating-segmentation-objects.md) 🚀 NEW: Step-by-step recipe and explanation on how to extract and/or isolate objects from images using Ultralytics Segmentation.
@ -55,6 +56,7 @@ Here's a compilation of in-depth guides to help you master different aspects of
- [VisionEye View Objects Mapping](vision-eye.md) 🚀 NEW: This feature aims to enable computers to discern and focus on specific objects, much like the way the human eye observes details from a particular viewpoint.
- [Speed Estimation](speed-estimation.md) 🚀 NEW: Speed estimation in computer vision relies on analyzing object motion through techniques like [object tracking](https://docs.ultralytics.com/modes/track/), crucial for applications like autonomous vehicles and traffic monitoring.
- [Distance Calculation](distance-calculation.md) 🚀 NEW: Distance calculation, which involves measuring the separation between two objects within a defined space, is a crucial aspect of many vision applications. In the context of Ultralytics YOLOv8, the method employed uses the bounding box centroid to determine the distance between user-highlighted bounding boxes.
- [Queue Management](queue-management.md) 🚀 NEW: Queue management is the practice of efficiently controlling and directing the flow of people or tasks, often through strategic planning and technology implementation, to minimize wait times and improve overall productivity.
## Contribute to Our Guides

@ -0,0 +1,256 @@
---
comments: true
description: Quick start guide to setting up YOLOv8 on an NVIDIA Jetson device with comprehensive benchmarks.
keywords: Ultralytics, YOLO, NVIDIA, Jetson, TensorRT, quick start guide, hardware setup, machine learning, AI
---
# Quick Start Guide: NVIDIA Jetson with Ultralytics YOLOv8
This comprehensive guide provides a detailed walkthrough for deploying Ultralytics YOLOv8 on [NVIDIA Jetson](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/) devices. Additionally, it showcases performance benchmarks to demonstrate the capabilities of YOLOv8 on these small and powerful devices.
<img width="1024" src="https://github.com/ultralytics/ultralytics/assets/20147381/c68fb2eb-371a-43e5-b7b8-2b869d90bc07" alt="NVIDIA Jetson Ecosystem">
!!! Note
This guide has been tested with the [Seeed Studio reComputer J4012](https://www.seeedstudio.com/reComputer-J4012-p-5586.html), which is based on the NVIDIA Jetson Orin NX 16GB running the latest stable JetPack release, [JP5.1.3](https://developer.nvidia.com/embedded/jetpack-sdk-513). This guide is not guaranteed to work on older Jetson devices such as the Jetson Nano (which only supports up to JP4.6.4). However, it is expected to work on all Jetson Orin, Xavier NX, and AGX Xavier devices running JP5.1.3.
## What is NVIDIA Jetson?
NVIDIA Jetson is a series of embedded computing boards designed to bring accelerated AI (artificial intelligence) computing to edge devices. These compact and powerful devices are built around NVIDIA's GPU architecture and are capable of running complex AI algorithms and deep learning models directly on the device, without needing to rely on cloud computing resources. Jetson boards are often used in robotics, autonomous vehicles, industrial automation, and other applications where AI inference needs to be performed locally with low latency and high efficiency. Additionally, these boards are based on the ARM64 architecture and run at lower power compared to traditional GPU computing devices.
## NVIDIA Jetson Series Comparison
[Jetson Orin](https://www.nvidia.com/en-us/autonomous-machines/embedded-systems/jetson-orin/) is the latest iteration of the NVIDIA Jetson family, based on the NVIDIA Ampere architecture, which brings drastically improved AI performance compared to previous generations. The table below compares a few of the Jetson devices in the ecosystem.
| | Jetson AGX Orin 64GB | Jetson Orin NX 16GB | Jetson Orin Nano 8GB | Jetson AGX Xavier | Jetson Xavier NX | Jetson Nano |
|-------------------|------------------------------------------------------------------|-----------------------------------------------------------------|---------------------------------------------------------------|-------------------------------------------------------------|--------------------------------------------------------------|---------------------------------------------|
| AI Performance | 275 TOPS | 100 TOPS | 40 TOPs | 32 TOPS | 21 TOPS | 472 GFLOPS |
| GPU | 2048-core NVIDIA Ampere architecture GPU with 64 Tensor Cores | 1024-core NVIDIA Ampere architecture GPU with 32 Tensor Cores | 1024-core NVIDIA Ampere architecture GPU with 32 Tensor Cores | 512-core NVIDIA Volta architecture GPU with 64 Tensor Cores | 384-core NVIDIA Volta™ architecture GPU with 48 Tensor Cores | 128-core NVIDIA Maxwell™ architecture GPU |
| GPU Max Frequency | 1.3 GHz | 918 MHz | 625 MHz | 1377 MHz | 1100 MHz | 921MHz |
| CPU | 12-core NVIDIA Arm® Cortex A78AE v8.2 64-bit CPU 3MB L2 + 6MB L3 | 8-core NVIDIA Arm® Cortex A78AE v8.2 64-bit CPU 2MB L2 + 4MB L3 | 6-core Arm® Cortex®-A78AE v8.2 64-bit CPU 1.5MB L2 + 4MB L3 | 8-core NVIDIA Carmel Arm®v8.2 64-bit CPU 8MB L2 + 4MB L3 | 6-core NVIDIA Carmel Arm®v8.2 64-bit CPU 6MB L2 + 4MB L3 | Quad-Core Arm® Cortex®-A57 MPCore processor |
| CPU Max Frequency | 2.2 GHz | 2.0 GHz | 1.5 GHz | 2.2 GHz | 1.9 GHz | 1.43GHz |
| Memory            | 64GB 256-bit LPDDR5 204.8GB/s | 16GB 128-bit LPDDR5 102.4GB/s | 8GB 128-bit LPDDR5 68 GB/s | 32GB 256-bit LPDDR4x 136.5GB/s | 8GB 128-bit LPDDR4x 59.7GB/s | 4GB 64-bit LPDDR4 25.6GB/s |
For a more detailed comparison table, please visit the **Technical Specifications** section of [official NVIDIA Jetson page](https://developer.nvidia.com/embedded/jetson-modules).
## What is NVIDIA JetPack?
[NVIDIA JetPack SDK](https://developer.nvidia.com/embedded/jetpack), which powers the Jetson modules, is the most comprehensive solution; it provides a full development environment for building end-to-end accelerated AI applications and shortens time to market. JetPack includes Jetson Linux with bootloader, Linux kernel, Ubuntu desktop environment, and a complete set of libraries for acceleration of GPU computing, multimedia, graphics, and computer vision. It also includes samples, documentation, and developer tools for both host computer and developer kit, and supports higher level SDKs such as DeepStream for streaming video analytics, Isaac for robotics, and Riva for conversational AI.
## Flash JetPack to NVIDIA Jetson
The first step after getting your hands on an NVIDIA Jetson device is to flash NVIDIA JetPack to the device. There are several different ways of flashing NVIDIA Jetson devices.
1. If you own an official NVIDIA Development Kit such as the Jetson Orin Nano Developer Kit, you can visit [this link](https://developer.nvidia.com/embedded/learn/get-started-jetson-orin-nano-devkit) to download an image and prepare an SD card with JetPack for booting the device.
2. If you own any other NVIDIA Development Kit, you can visit [this link](https://docs.nvidia.com/sdk-manager/install-with-sdkm-jetson/index.html) to flash JetPack to the device using [SDK Manager](https://developer.nvidia.com/sdk-manager).
3. If you own a Seeed Studio reComputer J4012 device, you can visit [this link](https://wiki.seeedstudio.com/reComputer_J4012_Flash_Jetpack) to flash JetPack to the included SSD.
4. If you own any other third party device powered by the NVIDIA Jetson module, it is recommended to follow command-line flashing by visiting [this link](https://docs.nvidia.com/jetson/archives/r35.5.0/DeveloperGuide/IN/QuickStart.html).
!!! Note
For methods 3 and 4 above, after flashing the system and booting the device, please run `sudo apt update && sudo apt install nvidia-jetpack -y` on the device terminal to install all the remaining JetPack components needed.
## Start with Docker
The fastest way to get started with Ultralytics YOLOv8 on NVIDIA Jetson is to run with the pre-built Docker image for Jetson.
Execute the command below to pull the Docker container and run it on Jetson. This is based on the [l4t-pytorch](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/l4t-pytorch) Docker image, which contains PyTorch and Torchvision in a Python3 environment.
```sh
t=ultralytics/ultralytics:latest-jetson && sudo docker pull $t && sudo docker run -it --ipc=host --runtime=nvidia $t
```
## Start without Docker
### Install Ultralytics Package
Here we will install the ultralytics package on the Jetson with optional dependencies so that we can export the PyTorch models to other formats. We will mainly focus on [NVIDIA TensorRT exports](https://docs.ultralytics.com/integrations/tensorrt) because TensorRT will ensure we get the maximum performance out of the Jetson devices.
1. Update the packages list, install pip, and upgrade it to the latest version
```sh
sudo apt update
sudo apt install python3-pip -y
pip install -U pip
```
2. Install `ultralytics` pip package with optional dependencies
```sh
pip install ultralytics[export]
```
3. Reboot the device
```sh
sudo reboot
```
### Install PyTorch and Torchvision
The above ultralytics installation will install Torch and Torchvision. However, these two packages installed via pip are not compatible with the Jetson platform, which is based on the ARM64 architecture. Therefore, we need to manually install a pre-built PyTorch pip wheel and compile/install Torchvision from source.
1. Uninstall currently installed PyTorch and Torchvision
```sh
pip uninstall torch torchvision
```
2. Install PyTorch 2.1.0 according to JP5.1.3
```sh
sudo apt-get install -y libopenblas-base libopenmpi-dev
wget https://developer.download.nvidia.com/compute/redist/jp/v512/pytorch/torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl -O torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl
pip install torch-2.1.0a0+41361538.nv23.06-cp38-cp38-linux_aarch64.whl
```
3. Install Torchvision v0.16.2 according to PyTorch v2.1.0
```sh
sudo apt install -y libjpeg-dev zlib1g-dev
git clone https://github.com/pytorch/vision torchvision
cd torchvision
git checkout v0.16.2
python3 setup.py install --user
```
Visit [this page](https://forums.developer.nvidia.com/t/pytorch-for-jetson/72048) to access all the available versions of PyTorch for different JetPack versions. For a more detailed PyTorch and Torchvision compatibility list, please check [here](https://github.com/pytorch/vision).
## Use TensorRT on NVIDIA Jetson
Out of all the model export formats supported by Ultralytics, TensorRT delivers the best inference performance on NVIDIA Jetson devices, and we recommend using TensorRT with Jetson. We also have a detailed document on TensorRT [here](https://docs.ultralytics.com/integrations/tensorrt).
## Convert Model to TensorRT and Run Inference
The YOLOv8n model in PyTorch format is converted to TensorRT to run inference with the exported model.
!!! Example
=== "Python"
```python
from ultralytics import YOLO
# Load a YOLOv8n PyTorch model
model = YOLO('yolov8n.pt')
# Export the model
model.export(format='engine') # creates 'yolov8n.engine'
# Load the exported TensorRT model
trt_model = YOLO('yolov8n.engine')
# Run inference
results = trt_model('https://ultralytics.com/images/bus.jpg')
```
=== "CLI"
```bash
# Export a YOLOv8n PyTorch model to TensorRT format
yolo export model=yolov8n.pt format=engine # creates 'yolov8n.engine'
# Run inference with the exported model
yolo predict model=yolov8n.engine source='https://ultralytics.com/images/bus.jpg'
```
## Arguments
| Key | Value | Description |
|----------|------------|------------------------------------------------------|
| `format` | `'engine'` | format to export to |
| `imgsz` | `640` | image size as scalar or (h, w) list, i.e. (640, 480) |
| `half` | `False` | FP16 quantization |
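As a short illustration of combining these arguments (the values are illustrative; `half=True` enables the FP16 quantization described in the table):
```python
from ultralytics import YOLO

# Load a YOLOv8n PyTorch model
model = YOLO('yolov8n.pt')

# Export to TensorRT with an explicit image size and FP16 quantization
model.export(format='engine', imgsz=640, half=True)  # creates 'yolov8n.engine'

# Run inference with the FP16 engine
trt_model = YOLO('yolov8n.engine')
results = trt_model('https://ultralytics.com/images/bus.jpg')
```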
## NVIDIA Jetson Orin YOLOv8 Benchmarks
The YOLOv8 benchmarks below were run by the Ultralytics team on three model formats, measuring speed and accuracy: PyTorch, TorchScript and TensorRT. Benchmarks were run on a Seeed Studio reComputer J4012 powered by the Jetson Orin NX 16GB device at FP32 precision with a default input image size of 640.
<div style="text-align: center;">
<img width="800" src="https://github.com/ultralytics/ultralytics/assets/20147381/202950fa-c24a-43ec-90c8-4d7b6a6c406e" alt="NVIDIA Jetson Ecosystem">
</div>
| Model | Format | Status | Size (MB) | mAP50-95(B) | Inference time (ms/im) |
|---------|-------------|--------|-----------|-------------|------------------------|
| YOLOv8n | PyTorch | ✅ | 6.2 | 0.4473 | 14.3 |
| YOLOv8n | TorchScript | ✅ | 12.4 | 0.4520 | 13.3 |
| YOLOv8n | TensorRT | ✅ | 13.6 | 0.4520 | 8.7 |
| YOLOv8s | PyTorch | ✅ | 21.5 | 0.5868 | 18 |
| YOLOv8s | TorchScript | ✅ | 43.0 | 0.5971 | 23.9 |
| YOLOv8s | TensorRT | ✅ | 44.0 | 0.5965 | 14.82 |
| YOLOv8m | PyTorch | ✅ | 49.7 | 0.6101 | 36.4 |
| YOLOv8m | TorchScript | ✅ | 99.2 | 0.6125 | 53.34 |
| YOLOv8m | TensorRT | ✅ | 100.3 | 0.6123 | 33.28 |
| YOLOv8l | PyTorch | ✅ | 83.7 | 0.6588 | 61.3 |
| YOLOv8l | TorchScript | ✅ | 167.2 | 0.6587 | 85.21 |
| YOLOv8l | TensorRT | ✅ | 168.3 | 0.6591 | 51.34 |
| YOLOv8x | PyTorch | ✅ | 130.5 | 0.6650 | 93 |
| YOLOv8x | TorchScript | ✅ | 260.7 | 0.6651 | 135.3 |
| YOLOv8x | TensorRT | ✅ | 261.8 | 0.6645 | 84.5 |
This table represents the benchmark results for five different models (YOLOv8n, YOLOv8s, YOLOv8m, YOLOv8l, YOLOv8x) across three different formats (PyTorch, TorchScript, TensorRT), giving us the status, size, mAP50-95(B) metric, and inference time for each combination.
Visit [this link](https://www.seeedstudio.com/blog/2023/03/30/yolov8-performance-benchmarks-on-nvidia-jetson-devices) to explore more benchmarking efforts by Seeed Studio running on different versions of NVIDIA Jetson hardware.
## Reproduce Our Results
To reproduce the above Ultralytics benchmarks on all export [formats](../modes/export.md) run this code:
!!! Example
=== "Python"
```python
from ultralytics import YOLO
# Load a YOLOv8n PyTorch model
model = YOLO('yolov8n.pt')
# Benchmark YOLOv8n speed and accuracy on the COCO128 dataset for all export formats
results = model.benchmarks(data='coco128.yaml', imgsz=640)
```
=== "CLI"
```bash
# Benchmark YOLOv8n speed and accuracy on the COCO128 dataset for all export formats
yolo benchmark model=yolov8n.pt data=coco128.yaml imgsz=640
```
Note that benchmarking results might vary based on the exact hardware and software configuration of a system, as well as the current workload of the system at the time the benchmarks are run. For the most reliable results use a dataset with a large number of images, i.e. `data='coco128.yaml'` (128 val images), or `data='coco.yaml'` (5000 val images).
!!! Note
Currently only PyTorch, TorchScript and TensorRT work with the benchmarking tools. We will update it to support other export formats in the future.
## Best Practices when using NVIDIA Jetson
When using NVIDIA Jetson, there are a couple of best practices to follow in order to get maximum performance when running YOLOv8.
1. Enable MAX Power Mode
Enabling MAX Power Mode on the Jetson will make sure all CPU and GPU cores are turned on.
```sh
sudo nvpmodel -m 0
```
2. Enable Jetson Clocks
Enabling Jetson Clocks will make sure all CPU and GPU cores are clocked at their maximum frequency.
```sh
sudo jetson_clocks
```
3. Install Jetson Stats Application
We can use the jetson-stats application to monitor the temperatures of the system components and check other system details, such as CPU, GPU, and RAM utilization, change power modes, set the clocks to maximum, and check JetPack information.
```sh
sudo apt update
sudo pip install jetson-stats
sudo reboot
jtop
```
<img width="1024" src="https://github.com/ultralytics/ultralytics/assets/20147381/f7017975-6eaa-4d02-8007-ab52314cebfd" alt="Jetson Stats">
## Next Steps
Congratulations on successfully setting up YOLOv8 on your NVIDIA Jetson! For further learning and support, explore more guides in the [Ultralytics YOLOv8 Docs](../index.md)!

@ -62,7 +62,8 @@ Object counting with [Ultralytics YOLOv8](https://github.com/ultralytics/ultraly
counter.set_args(view_img=True,
reg_pts=region_points,
classes_names=model.names,
draw_tracks=True)
draw_tracks=True,
line_thickness=2)
while cap.isOpened():
success, im0 = cap.read()
@ -105,7 +106,8 @@ Object counting with [Ultralytics YOLOv8](https://github.com/ultralytics/ultraly
counter.set_args(view_img=True,
reg_pts=region_points,
classes_names=model.names,
draw_tracks=True)
draw_tracks=True,
line_thickness=2)
while cap.isOpened():
success, im0 = cap.read()
@ -148,7 +150,8 @@ Object counting with [Ultralytics YOLOv8](https://github.com/ultralytics/ultraly
counter.set_args(view_img=True,
reg_pts=line_points,
classes_names=model.names,
draw_tracks=True)
draw_tracks=True,
line_thickness=2)
while cap.isOpened():
success, im0 = cap.read()
@ -191,7 +194,8 @@ Object counting with [Ultralytics YOLOv8](https://github.com/ultralytics/ultraly
counter.set_args(view_img=True,
reg_pts=line_points,
classes_names=model.names,
draw_tracks=True)
draw_tracks=True,
line_thickness=2)
while cap.isOpened():
success, im0 = cap.read()
@ -215,23 +219,22 @@ Object counting with [Ultralytics YOLOv8](https://github.com/ultralytics/ultraly
### Optional Arguments `set_args`
| Name | Type | Default | Description |
|-----------------------|-------------|----------------------------|-----------------------------------------------|
| `view_img` | `bool` | `False` | Display frames with counts |
| `view_in_counts` | `bool` | `True` | Display in-counts only on video frame |
| `view_out_counts` | `bool` | `True` | Display out-counts only on video frame |
| `line_thickness` | `int` | `2` | Increase bounding boxes thickness |
| `reg_pts` | `list` | `[(20, 400), (1260, 400)]` | Points defining the Region Area |
| `classes_names` | `dict` | `model.model.names` | Dictionary of Class Names |
| `region_color` | `RGB Color` | `(255, 0, 255)` | Color of the Object counting Region or Line |
| `track_thickness` | `int` | `2` | Thickness of Tracking Lines |
| `draw_tracks` | `bool` | `False` | Enable drawing Track lines |
| `track_color` | `RGB Color` | `(0, 255, 0)` | Color for each track line |
| `line_dist_thresh` | `int` | `15` | Euclidean Distance threshold for line counter |
| `count_txt_thickness` | `int` | `2` | Thickness of Object counts text |
| `count_txt_color` | `RGB Color` | `(0, 0, 0)` | Foreground color for Object counts text |
| `count_color` | `RGB Color` | `(255, 255, 255)` | Background color for Object counts text |
| `region_thickness` | `int` | `5` | Thickness for object counter region or line |
| Name | Type | Default | Description |
|-----------------------|-------------|----------------------------|--------------------------------------------------|
| `view_img` | `bool` | `False` | Display frames with counts |
| `view_in_counts` | `bool` | `True` | Display in-counts only on video frame |
| `view_out_counts` | `bool` | `True` | Display out-counts only on video frame |
| `line_thickness` | `int` | `2` | Increase bounding boxes and count text thickness |
| `reg_pts` | `list` | `[(20, 400), (1260, 400)]` | Points defining the Region Area |
| `classes_names` | `dict` | `model.model.names` | Dictionary of Class Names |
| `count_reg_color` | `RGB Color` | `(255, 0, 255)` | Color of the Object counting Region or Line |
| `track_thickness` | `int` | `2` | Thickness of Tracking Lines |
| `draw_tracks` | `bool` | `False` | Enable drawing Track lines |
| `track_color` | `RGB Color` | `(0, 255, 0)` | Color for each track line |
| `line_dist_thresh` | `int` | `15` | Euclidean Distance threshold for line counter |
| `count_txt_color` | `RGB Color` | `(255, 255, 255)` | Foreground color for Object counts text |
| `region_thickness` | `int` | `5` | Thickness for object counter region or line |
| `count_bg_color` | `RGB Color` | `(255, 255, 255)` | Count highlighter color |
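For reference, a minimal sketch wiring several of these arguments into a counter; the `object_counter.ObjectCounter` import is assumed from the full guide (only fragments of it appear in this diff), and the values are illustrative:
```python
from ultralytics import YOLO
from ultralytics.solutions import object_counter

model = YOLO("yolov8n.pt")
region_points = [(20, 400), (1260, 400)]  # a simple counting line (illustrative)

counter = object_counter.ObjectCounter()
counter.set_args(view_img=True,
                 reg_pts=region_points,
                 classes_names=model.names,
                 count_reg_color=(255, 0, 255),    # region/line color
                 count_txt_color=(0, 0, 0),        # count text foreground color
                 count_bg_color=(255, 255, 255),   # count highlighter color
                 line_thickness=2,
                 draw_tracks=True)
```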
### Arguments `model.track`

@ -0,0 +1,152 @@
---
comments: true
description: Queue Management Using Ultralytics YOLOv8
keywords: Ultralytics, YOLOv8, Queue Management, Object Counting, Object Tracking, Object Detection, Notebook, IPython Kernel, CLI, Python SDK
---
# Queue Management using Ultralytics YOLOv8 🚀
## What is Queue Management?
Queue management using [Ultralytics YOLOv8](https://github.com/ultralytics/ultralytics/) involves organizing and controlling lines of people or vehicles to reduce wait times and enhance efficiency. It's about optimizing queues to improve customer satisfaction and system performance in various settings like retail, banks, airports, and healthcare facilities.
## Advantages of Queue Management?
- **Reduced Waiting Times:** Queue management systems efficiently organize queues, minimizing wait times for customers. This leads to improved satisfaction levels as customers spend less time waiting and more time engaging with products or services.
- **Increased Efficiency:** Implementing queue management allows businesses to allocate resources more effectively. By analyzing queue data and optimizing staff deployment, businesses can streamline operations, reduce costs, and improve overall productivity.
## Real World Applications
| Logistics | Retail |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------:|
| ![Queue management at airport ticket counter using Ultralytics YOLOv8](https://github.com/RizwanMunawar/RizwanMunawar/assets/62513924/10487e76-bf60-4a9c-a0f3-5a75a05fa7a3) | ![Queue monitoring in crowd using Ultralytics YOLOv8](https://github.com/RizwanMunawar/RizwanMunawar/assets/62513924/dcc6d2ca-5576-434d-83c6-e57fe07bc693) |
| Queue management at an airport ticket counter using Ultralytics YOLOv8 | Queue monitoring in crowd using Ultralytics YOLOv8 |
!!! Example "Queue Management using YOLOv8 Example"
=== "Queue Manager"
```python
import cv2

from ultralytics import YOLO
from ultralytics.solutions import queue_management

model = YOLO("yolov8n.pt")
cap = cv2.VideoCapture("path/to/video/file.mp4")
assert cap.isOpened(), "Error reading video file"

w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH,
                                       cv2.CAP_PROP_FRAME_HEIGHT,
                                       cv2.CAP_PROP_FPS))

video_writer = cv2.VideoWriter("queue_management.avi",
                               cv2.VideoWriter_fourcc(*'mp4v'),
                               fps,
                               (w, h))

queue_region = [(20, 400), (1080, 404), (1080, 360), (20, 360)]

queue = queue_management.QueueManager()
queue.set_args(classes_names=model.names,
               reg_pts=queue_region,
               line_thickness=3,
               fontsize=1.0,
               region_color=(255, 144, 31))

while cap.isOpened():
    success, im0 = cap.read()

    if success:
        tracks = model.track(im0, show=False, persist=True,
                             verbose=False)
        out = queue.process_queue(im0, tracks)

        video_writer.write(im0)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        continue

    print("Video frame is empty or video processing has been successfully completed.")
    break

cap.release()
cv2.destroyAllWindows()
```
=== "Queue Manager Specific Classes"
```python
import cv2

from ultralytics import YOLO
from ultralytics.solutions import queue_management

model = YOLO("yolov8n.pt")
cap = cv2.VideoCapture("path/to/video/file.mp4")
assert cap.isOpened(), "Error reading video file"

w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH,
                                       cv2.CAP_PROP_FRAME_HEIGHT,
                                       cv2.CAP_PROP_FPS))

video_writer = cv2.VideoWriter("queue_management.avi",
                               cv2.VideoWriter_fourcc(*'mp4v'),
                               fps,
                               (w, h))

queue_region = [(20, 400), (1080, 404), (1080, 360), (20, 360)]

queue = queue_management.QueueManager()
queue.set_args(classes_names=model.names,
               reg_pts=queue_region,
               line_thickness=3,
               fontsize=1.0,
               region_color=(255, 144, 31))

while cap.isOpened():
    success, im0 = cap.read()

    if success:
        tracks = model.track(im0, show=False, persist=True,
                             verbose=False, classes=0)  # Only person class
        out = queue.process_queue(im0, tracks)

        video_writer.write(im0)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
        continue

    print("Video frame is empty or video processing has been successfully completed.")
    break

cap.release()
cv2.destroyAllWindows()
```
### Optional Arguments `set_args`
| Name | Type | Default | Description |
|---------------------|-------------|----------------------------|---------------------------------------------|
| `view_img` | `bool` | `False` | Display frames with counts |
| `view_queue_counts` | `bool` | `True` | Display Queue counts only on video frame |
| `line_thickness` | `int` | `2` | Increase bounding boxes thickness |
| `reg_pts` | `list` | `[(20, 400), (1260, 400)]` | Points defining the Region Area |
| `classes_names` | `dict` | `model.model.names` | Dictionary of Class Names |
| `region_color` | `RGB Color` | `(255, 0, 255)` | Color of the Object counting Region or Line |
| `track_thickness` | `int` | `2` | Thickness of Tracking Lines |
| `draw_tracks` | `bool` | `False` | Enable drawing Track lines |
| `track_color` | `RGB Color` | `(0, 255, 0)` | Color for each track line |
| `count_txt_color` | `RGB Color` | `(255, 255, 255)` | Foreground color for Object counts text |
| `region_thickness` | `int` | `5` | Thickness for object counter region or line |
| `fontsize` | `float` | `0.6` | Font size of counting text |
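As a hedged example of the display-related options in this table, building on the `QueueManager` setup shown above (values are illustrative):
```python
from ultralytics import YOLO
from ultralytics.solutions import queue_management

model = YOLO("yolov8n.pt")
queue_region = [(20, 400), (1080, 404), (1080, 360), (20, 360)]

queue = queue_management.QueueManager()
queue.set_args(classes_names=model.names,
               reg_pts=queue_region,
               view_img=True,            # display frames with counts
               view_queue_counts=True,   # overlay the queue count on the frame
               line_thickness=2,
               region_thickness=5,
               region_color=(255, 144, 31),
               fontsize=0.8)
```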
### Arguments `model.track`
| Name | Type | Default | Description |
|-----------|---------|----------------|-------------------------------------------------------------|
| `source` | `im0` | `None` | source directory for images or videos |
| `persist` | `bool` | `False` | persisting tracks between frames |
| `tracker` | `str` | `botsort.yaml` | Tracking method 'bytetrack' or 'botsort' |
| `conf` | `float` | `0.3` | Confidence Threshold |
| `iou` | `float` | `0.5` | IOU Threshold |
| `classes` | `list` | `None` | filter results by class, i.e. classes=0, or classes=[0,2,3] |
| `verbose` | `bool` | `True` | Display the object tracking results |

@ -8,6 +8,18 @@ keywords: Ultralytics, YOLOv8, Object Detection, Pose Estimation, PushUps, PullU
Monitoring workouts through pose estimation with [Ultralytics YOLOv8](https://github.com/ultralytics/ultralytics/) enhances exercise assessment by accurately tracking key body landmarks and joints in real-time. This technology provides instant feedback on exercise form, tracks workout routines, and measures performance metrics, optimizing training sessions for users and trainers alike.
<p align="center">
<br>
<iframe loading="lazy" width="720" height="405" src="https://www.youtube.com/embed/LGGxqLZtvuw"
title="YouTube video player" frameborder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowfullscreen>
</iframe>
<br>
<strong>Watch:</strong> Workouts Monitoring using Ultralytics YOLOv8 | Pushups, Pullups, Ab Workouts
</p>
## Advantages of Workouts Monitoring?
- **Optimized Performance:** Tailoring workouts based on monitoring data for better results.

@ -22,13 +22,13 @@ Here's a brief description of our CI actions:
Below is the table showing the status of these CI tests for our main repositories:
| Repository | CI | Docker Deployment | Broken Links | CodeQL | PyPi and Docs Publishing |
|-----------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [yolov3](https://github.com/ultralytics/yolov3) | [![YOLOv3 CI](https://github.com/ultralytics/yolov3/actions/workflows/ci-testing.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/ci-testing.yml) | [![Publish Docker Images](https://github.com/ultralytics/yolov3/actions/workflows/docker.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/docker.yml) | [![Check Broken links](https://github.com/ultralytics/yolov3/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/links.yml) | [![CodeQL](https://github.com/ultralytics/yolov3/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/codeql-analysis.yml) | |
| [yolov5](https://github.com/ultralytics/yolov5) | [![YOLOv5 CI](https://github.com/ultralytics/yolov5/actions/workflows/ci-testing.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/ci-testing.yml) | [![Publish Docker Images](https://github.com/ultralytics/yolov5/actions/workflows/docker.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/docker.yml) | [![Check Broken links](https://github.com/ultralytics/yolov5/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/links.yml) | [![CodeQL](https://github.com/ultralytics/yolov5/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/codeql-analysis.yml) | |
| [ultralytics](https://github.com/ultralytics/ultralytics) | [![ultralytics CI](https://github.com/ultralytics/ultralytics/actions/workflows/ci.yaml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/ci.yaml) | [![Publish Docker Images](https://github.com/ultralytics/ultralytics/actions/workflows/docker.yaml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/docker.yaml) | [![Check Broken links](https://github.com/ultralytics/ultralytics/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/links.yml) | [![CodeQL](https://github.com/ultralytics/ultralytics/actions/workflows/codeql.yaml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/codeql.yaml) | [![Publish to PyPI and Deploy Docs](https://github.com/ultralytics/ultralytics/actions/workflows/publish.yml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/publish.yml) |
| [hub](https://github.com/ultralytics/hub) | [![HUB CI](https://github.com/ultralytics/hub/actions/workflows/ci.yaml/badge.svg)](https://github.com/ultralytics/hub/actions/workflows/ci.yaml) | | [![Check Broken links](https://github.com/ultralytics/hub/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/hub/actions/workflows/links.yml) | | |
| [docs](https://github.com/ultralytics/docs) | | | [![Check Broken links](https://github.com/ultralytics/docs/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/docs/actions/workflows/links.yml)[![Check Domains](https://github.com/ultralytics/docs/actions/workflows/check_domains.yml/badge.svg)](https://github.com/ultralytics/docs/actions/workflows/check_domains.yml) | | [![pages-build-deployment](https://github.com/ultralytics/docs/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/ultralytics/docs/actions/workflows/pages/pages-build-deployment) |
| Repository | CI | Docker Deployment | Broken Links | CodeQL | PyPi and Docs Publishing |
|-----------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| [yolov3](https://github.com/ultralytics/yolov3) | [![YOLOv3 CI](https://github.com/ultralytics/yolov3/actions/workflows/ci-testing.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/ci-testing.yml) | [![Publish Docker Images](https://github.com/ultralytics/yolov3/actions/workflows/docker.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/docker.yml) | [![Check Broken links](https://github.com/ultralytics/yolov3/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/links.yml) | [![CodeQL](https://github.com/ultralytics/yolov3/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/ultralytics/yolov3/actions/workflows/codeql-analysis.yml) | |
| [yolov5](https://github.com/ultralytics/yolov5) | [![YOLOv5 CI](https://github.com/ultralytics/yolov5/actions/workflows/ci-testing.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/ci-testing.yml) | [![Publish Docker Images](https://github.com/ultralytics/yolov5/actions/workflows/docker.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/docker.yml) | [![Check Broken links](https://github.com/ultralytics/yolov5/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/links.yml) | [![CodeQL](https://github.com/ultralytics/yolov5/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/ultralytics/yolov5/actions/workflows/codeql-analysis.yml) | |
| [ultralytics](https://github.com/ultralytics/ultralytics) | [![ultralytics CI](https://github.com/ultralytics/ultralytics/actions/workflows/ci.yaml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/ci.yaml) | [![Publish Docker Images](https://github.com/ultralytics/ultralytics/actions/workflows/docker.yaml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/docker.yaml) | [![Check Broken links](https://github.com/ultralytics/ultralytics/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/links.yml) | [![CodeQL](https://github.com/ultralytics/ultralytics/actions/workflows/codeql.yaml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/codeql.yaml) | [![Publish to PyPI and Deploy Docs](https://github.com/ultralytics/ultralytics/actions/workflows/publish.yml/badge.svg)](https://github.com/ultralytics/ultralytics/actions/workflows/publish.yml) |
| [hub](https://github.com/ultralytics/hub) | [![HUB CI](https://github.com/ultralytics/hub/actions/workflows/ci.yaml/badge.svg)](https://github.com/ultralytics/hub/actions/workflows/ci.yaml) | | [![Check Broken links](https://github.com/ultralytics/hub/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/hub/actions/workflows/links.yml) | | |
| [docs](https://github.com/ultralytics/docs) | | | [![Check Broken links](https://github.com/ultralytics/docs/actions/workflows/links.yml/badge.svg)](https://github.com/ultralytics/docs/actions/workflows/links.yml)[![Check Domains](https://github.com/ultralytics/docs/actions/workflows/check_domains.yml/badge.svg)](https://github.com/ultralytics/docs/actions/workflows/check_domains.yml) | | [![pages-build-deployment](https://github.com/ultralytics/docs/actions/workflows/pages/pages-build-deployment/badge.svg)](https://github.com/ultralytics/docs/actions/workflows/pages/pages-build-deployment) |
Each badge shows the status of the last run of the corresponding CI test on the `main` branch of the respective repository. If a test fails, the badge will display a "failing" status, and if it passes, it will display a "passing" status.

@ -24,7 +24,7 @@ The Edge TPU works with quantized models. Quantization makes models smaller and
Here are the key features that make TFLite Edge TPU a great model format choice for developers:
- **Optimized Performance on Edge Devices**: The TFLite Edge TPU achieves high-speed neural networking performance through quantization, model optimization, hardware acceleration, and compiler optimization. Its minimalistic architecture contributes to its smaller size and cost-efficiency.
- **High Computational Throughput**: TFLite Edge TPU combines specialized hardware acceleration and efficient runtime execution to achieve high computational throughput. It is well-suited for deploying machine learning models with stringent performance requirements on edge devices.
@ -38,9 +38,9 @@ TFLite Edge TPU offers various deployment options for machine learning models, i
- **On-Device Deployment**: TensorFlow Edge TPU models can be directly deployed on mobile and embedded devices. On-device deployment allows the models to execute directly on the hardware, eliminating the need for cloud connectivity.
- **Edge Computing with Cloud TensorFlow TPUs**: In scenarios where edge devices have limited processing capabilities, TensorFlow Edge TPUs can offload inference tasks to cloud servers equipped with TPUs.
- **Hybrid Deployment**: A hybrid approach combines on-device and cloud deployment and offers a versatile and scalable solution for deploying machine learning models. Advantages include on-device processing for quick responses and cloud computing for more complex computations.
## Exporting YOLOv8 Models to TFLite Edge TPU
@ -99,7 +99,7 @@ For more details about supported export options, visit the [Ultralytics document
## Deploying Exported YOLOv8 TFLite Edge TPU Models
After successfully exporting your Ultralytics YOLOv8 models to TFLite Edge TPU format, you can now deploy them. The primary and recommended first step for running a TFLite Edge TPU model is to use the `YOLO("model_edgetpu.tflite")` method, as outlined in the previous usage code snippet.
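As a minimal, illustrative sketch of that first step (assuming the exported file is named `model_edgetpu.tflite` and an Edge TPU runtime is installed):

```python
from ultralytics import YOLO

# Load the exported Edge TPU model (file name assumed from the export step above)
edgetpu_model = YOLO("model_edgetpu.tflite")

# Run inference on an example image
results = edgetpu_model("https://ultralytics.com/images/bus.jpg")
```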
However, for in-depth instructions on deploying your TFLite Edge TPU models, take a look at the following resources:
@ -111,7 +111,7 @@ However, for in-depth instructions on deploying your TFLite Edge TPU models, tak
## Summary
In this guide, we’ve learned how to export Ultralytics YOLOv8 models to TFLite Edge TPU format. By following the steps mentioned above, you can increase the speed and power of your computer vision applications.
For further details on usage, visit the [Edge TPU official website](https://cloud.google.com/edge-tpu).

@ -64,15 +64,17 @@ Welcome to the Ultralytics Integrations page! This page provides an overview of
- [CoreML](coreml.md): CoreML, developed by [Apple](https://www.apple.com/), is a framework designed for efficiently integrating machine learning models into applications across iOS, macOS, watchOS, and tvOS, using Apple's hardware for effective and secure model deployment.
- [TF SavedModel](tf-savedmodel.md): Developed by [Google](https://www.google.com), TF SavedModel is a universal serialization format for TensorFlow models, enabling easy sharing and deployment across a wide range of platforms, from servers to edge devices.
- [TF GraphDef](tf-graphdef.md): Developed by [Google](https://www.google.com), GraphDef is TensorFlow's format for representing computation graphs, enabling optimized execution of machine learning models across diverse hardware.
- [TFLite](tflite.md): Developed by [Google](https://www.google.com), TFLite is a lightweight framework for deploying machine learning models on mobile and edge devices, ensuring fast, efficient inference with minimal memory footprint.
- [TFLite Edge TPU](edge-tpu.md): Developed by [Google](https://www.google.com) for optimizing TensorFlow Lite models on Edge TPUs, this model format ensures high-speed, efficient edge computing.
- [TF.js](tfjs.md): Developed by [Google](https://www.google.com) to facilitate machine learning in browsers and Node.js, TF.js allows JavaScript-based deployment of ML models.
- [PaddlePaddle](paddlepaddle.md): An open-source deep learning platform by [Baidu](https://www.baidu.com/), PaddlePaddle enables the efficient deployment of AI models and focuses on the scalability of industrial applications.
- [NCNN](ncnn.md): Developed by [Tencent](http://www.tencent.com/), NCNN is an efficient neural network inference framework tailored for mobile devices. It enables direct deployment of AI models into apps, optimizing performance across various mobile platforms.
### Export Formats

@ -16,7 +16,7 @@ The ability to export to PaddlePaddle model format allows you to optimize your [
<img width="75%" src="https://github.com/PaddlePaddle/Paddle/blob/develop/doc/imgs/logo.png?raw=true" alt="PaddlePaddle Logo">
</p>
Developed by Baidu, [PaddlePaddle](https://www.paddlepaddle.org.cn/en) (**PA**rallel **D**istributed **D**eep **LE**arning) is China's first open-source deep learning platform. Unlike some frameworks built mainly for research, PaddlePaddle prioritizes ease of use and smooth integration across industries.
It offers tools and resources similar to popular frameworks like TensorFlow and PyTorch, making it accessible for developers of all experience levels. From farming and factories to service businesses, PaddlePaddle's large developer community of over 4.77 million is helping create and deploy AI applications.
@ -26,11 +26,11 @@ By exporting your Ultralytics YOLOv8 models to PaddlePaddle format, you can tap
PaddlePaddle models offer a range of key features that contribute to their flexibility, performance, and scalability across diverse deployment scenarios:
- **Dynamic-to-Static Graph**: PaddlePaddle supports [dynamic-to-static compilation](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/jit/index_en.html), where models can be translated into a static computational graph. This enables optimizations that reduce runtime overhead and boost inference performance.
- **Operator Fusion**: PaddlePaddle, like TensorRT, uses [operator fusion](https://developer.nvidia.com/gtc/2020/video/s21436-vid) to streamline computation and reduce overhead. The framework minimizes memory transfers and computational steps by merging compatible operations, resulting in faster inference.
- **Quantization**: PaddlePaddle supports [quantization techniques](https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/quantization/PTQ_en.html), including post-training quantization and quantization-aware training. These techniques allow for the use of lower-precision data representations, effectively boosting performance and reducing model size.
## Deployment Options in PaddlePaddle
@ -103,7 +103,7 @@ For more details about supported export options, visit the [Ultralytics document
## Deploying Exported YOLOv8 PaddlePaddle Models
After successfully exporting your Ultralytics YOLOv8 models to PaddlePaddle format, you can now deploy them. The primary and recommended first step for running a PaddlePaddle model is to use the `YOLO("./model_paddle_model")` method, as outlined in the previous usage code snippet.
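A minimal sketch of that first step, assuming the export created a `model_paddle_model` directory in the working directory:

```python
from ultralytics import YOLO

# Load the exported PaddlePaddle model directory (path assumed from the export step above)
paddle_model = YOLO("./model_paddle_model")

# Run inference on an example image
results = paddle_model("https://ultralytics.com/images/bus.jpg")
```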
However, for in-depth instructions on deploying your PaddlePaddle models in various other settings, take a look at the following resources:

@ -10,6 +10,10 @@ When you are deploying cutting-edge computer vision models, like YOLOv8, in diff
In this guide, we'll walk you step by step through how to export your [Ultralytics YOLOv8](https://github.com/ultralytics/ultralytics) models to the TF GraphDef model format. By converting your model, you can streamline deployment and use YOLOv8's computer vision capabilities in a broader range of applications and platforms.
<p align="center">
<img width="640" src="https://github.com/RizwanMunawar/RizwanMunawar/assets/62513924/2d793b51-19f2-49e0-bf4b-5208f2eb5993" alt="TensorFlow GraphDef">
</p>
## Why Should You Export to TF GraphDef?
TF GraphDef is a powerful component of the TensorFlow ecosystem that was developed by Google. It can be used to optimize and deploy models like YOLOv8. Exporting to TF GraphDef lets us move models from research to real-world applications. It allows models to run in environments without the full TensorFlow framework.
@ -20,17 +24,17 @@ GraphDef models can use hardware accelerators such as GPUs, TPUs, and AI chips,
## Key Features of TF GraphDef Models
TF GraphDef offers distinct features for streamlining model deployment and optimization.
Here's a look at its key characteristics:
- **Model Serialization**: TF GraphDef provides a way to serialize and store TensorFlow models in a platform-independent format. This serialized representation allows you to load and execute your models without the original Python codebase, making deployment easier.
- **Graph Optimization**: TF GraphDef enables the optimization of computational graphs. These optimizations can boost performance by streamlining execution flow, reducing redundancies, and tailoring operations to suit specific hardware.
- **Deployment Flexibility**: Models exported to the GraphDef format can be used in various environments, including resource-constrained devices, web browsers, and systems with specialized hardware. This opens up possibilities for wider deployment of your TensorFlow models.
- **Production Focus**: GraphDef is designed for production deployment. It supports efficient execution, serialization features, and optimizations that align with real-world use cases.
## Deployment Options with TF GraphDef
@ -44,7 +48,7 @@ Here's how you can deploy with TF GraphDef efficiently across various platforms.
- **Web Browsers:** TensorFlow.js enables the deployment of TF GraphDef models directly within web browsers. It paves the way for real-time object detection applications running on the client side, using the capabilities of YOLOv8 through JavaScript.
- **Specialized Hardware:** TF GraphDef's platform-agnostic nature allows it to target custom hardware, such as accelerators and TPUs (Tensor Processing Units). These devices can provide performance advantages for computationally intensive models.
## Exporting YOLOv8 Models to TF GraphDef
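The export snippet itself is elided from this hunk; below is a minimal sketch of the standard Ultralytics pattern, where `format="pb"` produces a TF GraphDef file (file names shown are illustrative):

```python
from ultralytics import YOLO

# Load a YOLOv8 model
model = YOLO("yolov8n.pt")

# Export to TF GraphDef ('pb') format, e.g. producing 'yolov8n.pb'
model.export(format="pb")

# Load the exported GraphDef model and run inference
pb_model = YOLO("yolov8n.pb")
results = pb_model("https://ultralytics.com/images/bus.jpg")
```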

@ -32,7 +32,7 @@ Here are the key features that make TF SavedModel a great option for AI develope
## Deployment Options with TF SavedModel
Before we dive into the process of exporting YOLOv8 models to the TF SavedModel format, let's explore some typical deployment scenarios where this format is used.
TF SavedModel provides a range of options to deploy your machine learning models:
@ -63,7 +63,6 @@ To install the required package, run:
For detailed instructions and best practices related to the installation process, check our [Ultralytics Installation guide](../quickstart.md). While installing the required packages for YOLOv8, if you encounter any difficulties, consult our [Common Issues guide](../guides/yolo-common-issues.md) for solutions and tips.
### Usage
Before diving into the usage instructions, it's important to note that while all [Ultralytics YOLOv8 models](../models/index.md) are available for exporting, you can ensure that the model you select supports export functionality [here](../modes/export.md).
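The usage snippet is elided from this hunk; here is a minimal sketch following the same pattern as the other export guides (the output directory name is assumed):

```python
from ultralytics import YOLO

# Load a YOLOv8 model
model = YOLO("yolov8n.pt")

# Export to TF SavedModel format (typically creates 'yolov8n_saved_model/')
model.export(format="saved_model")

# Load the exported SavedModel and run inference
tf_model = YOLO("./yolov8n_saved_model")
results = tf_model("https://ultralytics.com/images/bus.jpg")
```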

@ -0,0 +1,118 @@
---
comments: true
description: A guide that shows how to export Ultralytics YOLOv8 models to the TF.js model format for streamlined browser deployment and optimized model performance.
keywords: Ultralytics YOLOv8, TensorFlow.js, TF.js, Model Deployment, Node.js, Model Format, Export Format, Model Conversion
---
# Export to TF.js Model Format From a YOLOv8 Model
Deploying machine learning models directly in the browser or on Node.js can be tricky. You’ll need to make sure your model format is optimized for faster performance so that the model can be used to run interactive applications locally on the user’s device. The TensorFlow.js, or TF.js, model format is designed to use minimal power while delivering fast performance.
The ‘export to TF.js model format’ feature allows you to optimize your [Ultralytics YOLOv8](https://github.com/ultralytics/ultralytics) models for high-speed and locally-run object detection inference. In this guide, we'll walk you through converting your models to the TF.js format, making it easier for your models to perform well on various local browsers and Node.js applications.
## Why Should You Export to TF.js?
Exporting your machine learning models to TensorFlow.js, developed by the TensorFlow team as part of the broader TensorFlow ecosystem, offers numerous advantages for deploying machine learning applications. It helps enhance user privacy and security by keeping sensitive data on the device. The image below shows the TensorFlow.js architecture, and how machine learning models are converted and deployed on both web browsers and Node.js.
<p align="center">
<img width="100%" src="https://res.cloudinary.com/practicaldev/image/fetch/s--oepXBlvm--/c_limit%2Cf_auto%2Cfl_progressive%2Cq_auto%2Cw_880/https://dev-to-uploads.s3.amazonaws.com/uploads/articles/m7r4grt0zkrgyx62xxx3.png" alt="TF.js Architecture">
</p>
Running models locally also reduces latency and provides a more responsive user experience. TensorFlow.js also comes with offline capabilities, allowing users to use your application even without an internet connection. TF.js is designed to run complex models efficiently on devices with limited resources, and it is engineered for scalability with GPU acceleration support.
## Key Features of TF.js
Here are the key features that make TF.js a powerful tool for developers:
- **Cross-Platform Support:** TensorFlow.js can be used in both browser and Node.js environments, providing flexibility in deployment across different platforms. It lets developers build and deploy applications more easily.
- **Support for Multiple Backends:** TensorFlow.js supports various backends for computation including CPU, WebGL for GPU acceleration, WebAssembly (WASM) for near-native execution speed, and WebGPU for advanced browser-based machine learning capabilities.
- **Offline Capabilities:** With TensorFlow.js, models can run in the browser without the need for an internet connection, making it possible to develop applications that are functional offline.
## Deployment Options with TensorFlow.js
Before we dive into the process of exporting YOLOv8 models to the TF.js format, let's explore some typical deployment scenarios where this format is used.
TF.js provides a range of options to deploy your machine learning models:
- **In-Browser ML Applications:** You can build web applications that run machine learning models directly in the browser. The need for server-side computation is eliminated and the server load is reduced.
- **Node.js Applications:** TensorFlow.js also supports deployment in Node.js environments, enabling the development of server-side machine learning applications. It is particularly useful for applications that require the processing power of a server or access to server-side data.
- **Chrome Extensions:** An interesting deployment scenario is the creation of Chrome extensions with TensorFlow.js. For instance, you can develop an extension that allows users to right-click on an image within any webpage to classify it using a pre-trained ML model. TensorFlow.js can be integrated into everyday web browsing experiences to provide immediate insights or augmentations based on machine learning.
## Exporting YOLOv8 Models to TensorFlow.js
You can expand model compatibility and deployment flexibility by converting YOLOv8 models to TF.js.
### Installation
To install the required package, run:
!!! Tip "Installation"
=== "CLI"
```bash
# Install the required package for YOLOv8
pip install ultralytics
```
For detailed instructions and best practices related to the installation process, check our [Ultralytics Installation guide](../quickstart.md). While installing the required packages for YOLOv8, if you encounter any difficulties, consult our [Common Issues guide](../guides/yolo-common-issues.md) for solutions and tips.
### Usage
Before diving into the usage instructions, it's important to note that while all [Ultralytics YOLOv8 models](../models/index.md) are available for exporting, you can ensure that the model you select supports export functionality [here](../modes/export.md).
!!! Example "Usage"
=== "Python"
```python
from ultralytics import YOLO
# Load the YOLOv8 model
model = YOLO('yolov8n.pt')
# Export the model to TF.js format
model.export(format='tfjs') # creates '/yolov8n_web_model'
# Load the exported TF.js model
tfjs_model = YOLO('./yolov8n_web_model')
# Run inference
results = tfjs_model('https://ultralytics.com/images/bus.jpg')
```
=== "CLI"
```bash
# Export a YOLOv8n PyTorch model to TF.js format
yolo export model=yolov8n.pt format=tfjs # creates '/yolov8n_web_model'
# Run inference with the exported model
yolo predict model='./yolov8n_web_model' source='https://ultralytics.com/images/bus.jpg'
```
For more details about supported export options, visit the [Ultralytics documentation page on deployment options](../guides/model-deployment-options.md).
## Deploying Exported YOLOv8 TensorFlow.js Models
Now that you have exported your YOLOv8 model to the TF.js format, the next step is to deploy it. The primary and recommended first step for running a TF.js model is to use the `YOLO("./yolov8n_web_model")` method, as previously shown in the usage code snippet.
However, for in-depth instructions on deploying your TF.js models, take a look at the following resources:
- **[Chrome Extension](https://www.tensorflow.org/js/tutorials/deployment/web_ml_in_chrome)**: Here’s the developer documentation for how to deploy your TF.js models to a Chrome extension.
- **[Run TensorFlow.js in Node.js](https://www.tensorflow.org/js/guide/nodejs)**: A TensorFlow blog post on running TensorFlow.js in Node.js directly.
- **[Deploying TensorFlow.js - Node Project on Cloud Platform](https://www.tensorflow.org/js/guide/node_in_cloud)**: A TensorFlow blog post on deploying a TensorFlow.js model on a Cloud Platform.
## Summary
In this guide, we learned how to export Ultralytics YOLOv8 models to the TensorFlow.js format. By exporting to TF.js, you gain the flexibility to optimize, deploy, and scale your YOLOv8 models on a wide range of platforms.
For further details on usage, visit the [TensorFlow.js official documentation](https://www.tensorflow.org/js/guide).
For more information on integrating Ultralytics YOLOv8 with other platforms and frameworks, don't forget to check out our [integration guide page](index.md). It's packed with great resources to help you make the most of YOLOv8 in your projects.

@ -147,7 +147,7 @@ FastSAM is also available directly from the [https://github.com/CASIA-IVA-Lab/Fa
4. Install the CLIP model:
```shell
pip install git+https://github.com/openai/CLIP.git
pip install git+https://github.com/ultralytics/CLIP.git
```
### Example Usage

@ -36,16 +36,16 @@ This section details the models available with their specific pre-trained weight
All the YOLOv8-World weights have been directly migrated from the official [YOLO-World](https://github.com/AILab-CVC/YOLO-World) repository, highlighting their excellent contributions.
| Model Type | Pre-trained Weights | Tasks Supported | Inference | Validation | Training | Export |
|-----------------|-------------------------------------------------------------------------------------------------------|----------------------------------------|-----------|------------|----------|--------|
| YOLOv8s-world | [yolov8s-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8s-worldv2 | [yolov8s-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| YOLOv8m-world | [yolov8m-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8m-worldv2 | [yolov8m-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| YOLOv8l-world | [yolov8l-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8l-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8l-worldv2 | [yolov8l-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8l-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| YOLOv8x-world | [yolov8x-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8x-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8x-worldv2 | [yolov8x-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8x-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| Model Type | Pre-trained Weights | Tasks Supported | Inference | Validation | Training | Export |
|-----------------|---------------------------------------------------------------------------------------------------------|----------------------------------------|-----------|------------|----------|--------|
| YOLOv8s-world | [yolov8s-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8s-worldv2 | [yolov8s-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| YOLOv8m-world | [yolov8m-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8m-worldv2 | [yolov8m-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| YOLOv8l-world | [yolov8l-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8l-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8l-worldv2 | [yolov8l-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8l-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
| YOLOv8x-world | [yolov8x-world.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8x-world.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ❌ |
| YOLOv8x-worldv2 | [yolov8x-worldv2.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8x-worldv2.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ❌ | ✅ |
## Zero-shot Transfer on COCO Dataset
@ -64,6 +64,39 @@ This section details the models available with their specific pre-trained weight
The YOLO-World models are easy to integrate into your Python applications. Ultralytics provides a user-friendly Python API and CLI commands to streamline development.
### Train Usage
!!! Tip "Tip"
We strongly recommend using the `yolov8-worldv2` model for custom training, because it supports deterministic training and can also easily be exported to other formats, e.g. ONNX/TensorRT.
Training is straightforward with the `train` method, as illustrated below:
!!! Example
=== "Python"
PyTorch pretrained `*.pt` models, as well as configuration `*.yaml` files, can be passed to the `YOLOWorld()` class to create a model instance in Python:
```python
from ultralytics import YOLOWorld
# Load a pretrained YOLOv8s-worldv2 model
model = YOLOWorld('yolov8s-worldv2.pt')
# Train the model on the COCO8 example dataset for 100 epochs
results = model.train(data='coco8.yaml', epochs=100, imgsz=640)
# Run inference with the trained model on the 'bus.jpg' image
results = model('path/to/bus.jpg')
```
=== "CLI"
```bash
# Load a pretrained YOLOv8s-worldv2 model and train it on the COCO8 example dataset for 100 epochs
yolo train model=yolov8s-worldv2.yaml data=coco8.yaml epochs=100 imgsz=640
```
### Predict Usage
Object detection is straightforward with the `predict` method, as illustrated below:
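The full example is elided from this hunk; a minimal sketch of the `predict` call with a pretrained YOLO-World model:

```python
from ultralytics import YOLOWorld

# Load a pretrained YOLOv8s-world model
model = YOLOWorld("yolov8s-world.pt")

# Run prediction on an example image
results = model.predict("path/to/bus.jpg")

# Show the first result
results[0].show()
```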
@ -196,6 +229,59 @@ You can also save a model after setting custom classes. By doing this you create
This approach provides a powerful means of customizing state-of-the-art object detection models for specific tasks, making advanced AI more accessible and applicable to a broader range of practical applications.
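A minimal sketch of setting custom classes and saving the specialized model (class names and file names are illustrative):

```python
from ultralytics import YOLO, YOLOWorld

# Load a pretrained model and restrict it to a custom vocabulary
model = YOLOWorld("yolov8s-world.pt")
model.set_classes(["person", "bus"])

# Save a model version that embeds the custom classes
model.save("custom_yolov8s.pt")

# The saved model can then be loaded like any other YOLO model
custom_model = YOLO("custom_yolov8s.pt")
results = custom_model.predict("path/to/bus.jpg")
```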
## Reproduce official results from scratch (Experimental)
### Prepare datasets
- Train data
| Dataset | Type | Samples | Boxes | Annotation Files |
|-------------------------------------------------------------------|-----------|---------|-------|--------------------------------------------------------------------------------------------------------------------------------------------|
| [Objects365v1](https://opendatalab.com/OpenDataLab/Objects365_v1) | Detection | 609k | 9621k | [objects365_train.json](https://opendatalab.com/OpenDataLab/Objects365_v1) |
| [GQA](https://nlp.stanford.edu/data/gqa/images.zip) | Grounding | 621k | 3681k | [final_mixed_train_no_coco.json](https://huggingface.co/GLIPModel/GLIP/blob/main/mdetr_annotations/final_mixed_train_no_coco.json) |
| [Flickr30k](https://shannon.cs.illinois.edu/DenotationGraph/) | Grounding | 149k | 641k | [final_flickr_separateGT_train.json](https://huggingface.co/GLIPModel/GLIP/blob/main/mdetr_annotations/final_flickr_separateGT_train.json) |
- Val data
| Dataset | Type | Annotation Files |
|---------------------------------------------------------------------------------------------------------|-----------|--------------------------------------------------------------------------------------------------------|
| [LVIS minival](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/lvis.yaml) | Detection | [minival.txt](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/cfg/datasets/lvis.yaml) |
### Launch training from scratch
!!! Note
`WorldTrainerFromScratch` is highly customized to allow training YOLO-World models on both detection datasets and grounding datasets simultaneously. For more details, please check out [ultralytics.models.yolo.world.train_world.py](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/yolo/world/train_world.py).
!!! Example
=== "Python"
```python
from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
from ultralytics import YOLOWorld
data = dict(
train=dict(
yolo_data=["Objects365.yaml"],
grounding_data=[
dict(
img_path="../datasets/flickr30k/images",
json_file="../datasets/flickr30k/final_flickr_separateGT_train.json",
),
dict(
img_path="../datasets/GQA/images",
json_file="../datasets/GQA/final_mixed_train_no_coco.json",
),
],
),
val=dict(yolo_data=["lvis.yaml"]),
)
model = YOLOWorld("yolov8s-worldv2.yaml")
model.train(data=data, batch=128, epochs=100, trainer=WorldTrainerFromScratch)
```
## Citations and Acknowledgements
We extend our gratitude to the [Tencent AILab Computer Vision Center](https://ai.tencent.com/) for their pioneering work in real-time open-vocabulary object detection with YOLO-World:

@ -58,21 +58,37 @@ The performance of YOLOv9 on the [COCO dataset](../datasets/detect/coco.md) exem
**Table 1. Comparison of State-of-the-Art Real-Time Object Detectors**
| Model | size<br><sup>(pixels) | AP<sup>val<br>50-95 | AP<sup>val<br>50 | AP<sup>val<br>75 | params<br><sup>(M) | FLOPs<br><sup>(B) |
|---------------------------------------------------------------------------------------|-----------------------|---------------------|------------------|------------------|--------------------|-------------------|
| YOLOv9-S | 640 | 46.8 | 63.4 | 50.7 | 7.2 | 26.7 |
| YOLOv9-M | 640 | 51.4 | 68.1 | 56.1 | 20.1 | 76.8 |
| [YOLOv9-C](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9c.pt) | 640 | 53.0 | 70.2 | 57.8 | 25.5 | 102.8 |
| [YOLOv9-E](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9e.pt) | 640 | 55.6 | 72.8 | 60.6 | 58.1 | 192.5 |
??? question "When will other model scales be available?"
YOLOv9's iterations, ranging from the smaller S variant to the extensive E model, demonstrate improvements not only in accuracy (AP metrics) but also in efficiency with a reduced number of parameters and computational needs (FLOPs). This table underscores YOLOv9's ability to deliver high precision while maintaining or reducing the computational overhead compared to prior versions and competing models.
Although metrics are shown for the various model scales in the table below, **only** the configurations for `YOLOv9c` and `YOLOv9e` have been published so far. The Ultralytics Team will work swiftly to add other configurations as they become available, so be sure to check back here regularly for updates.
!!! tip "Performance"
=== "Detection (COCO)"
| Model | size<br><sup>(pixels) | mAP<sup>val<br>50-95 | mAP<sup>val<br>50 | params<br><sup>(M) | FLOPs<br><sup>(B) |
|---------------------------------------------------------------------------------------|-----------------------|----------------------|-------------------|--------------------|-------------------|
| YOLOv9t | 640 | 38.3 | 53.1 | 2.0 | 7.7 |
| YOLOv9s | 640 | 46.8 | 63.4 | 7.2 | 26.7 |
| YOLOv9m | 640 | 51.4 | 68.1 | 20.1 | 76.8 |
| [YOLOv9c](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9c.pt) | 640 | 53.0 | 70.2 | 25.5 | 102.8 |
| [YOLOv9e](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9e.pt) | 640 | 55.6 | 72.8 | 58.1 | 192.5 |
=== "Segmentation (COCO)"
| Model | size<br><sup>(pixels) | mAP<sup>box<br>50-95 | mAP<sup>mask<br>50-95 | params<br><sup>(M) | FLOPs<br><sup>(B) |
|-----------------------------------------------------------------------------------------------|-----------------------|----------------------|-----------------------|--------------------|-------------------|
| [YOLOv9c-seg](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9c-seg.pt) | 640 | 52.4 | 42.2 | 27.9 | 159.4 |
| [YOLOv9e-seg](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9e-seg.pt) | 640 | 55.1 | 44.3 | 60.5 | 248.4 |
YOLOv9's iterations, ranging from the tiny `t` variant to the extensive `e` model, demonstrate improvements not only in accuracy (mAP metrics) but also in efficiency with a reduced number of parameters and computational needs (FLOPs). This table underscores YOLOv9's ability to deliver high precision while maintaining or reducing the computational overhead compared to prior versions and competing models.
Comparatively, YOLOv9 exhibits remarkable gains:
- **Lightweight Models**: YOLOv9-S surpasses the YOLO MS-S in parameter efficiency and computational load while achieving an improvement of 0.4∼0.6% in AP.
- **Medium to Large Models**: YOLOv9-M and YOLOv9-E show notable advancements in balancing the trade-off between model complexity and detection performance, offering significant reductions in parameters and computations against the backdrop of improved accuracy.
- **Lightweight Models**: YOLOv9s surpasses the YOLO MS-S in parameter efficiency and computational load while achieving an improvement of 0.4∼0.6% in AP.
- **Medium to Large Models**: YOLOv9m and YOLOv9e show notable advancements in balancing the trade-off between model complexity and detection performance, offering significant reductions in parameters and computations against the backdrop of improved accuracy.
The YOLOv9-C model, in particular, highlights the effectiveness of the architecture's optimizations. It operates with 42% fewer parameters and 21% less computational demand than YOLOv7 AF, yet it achieves comparable accuracy, demonstrating YOLOv9's significant efficiency improvements. Furthermore, the YOLOv9-E model sets a new standard for large models, with 15% fewer parameters and 25% less computational need than [YOLOv8x](yolov8.md), alongside a substantial 1.7% improvement in AP.
The YOLOv9c model, in particular, highlights the effectiveness of the architecture's optimizations. It operates with 42% fewer parameters and 21% less computational demand than YOLOv7 AF, yet it achieves comparable accuracy, demonstrating YOLOv9's significant efficiency improvements. Furthermore, the YOLOv9e model sets a new standard for large models, with 15% fewer parameters and 25% less computational need than [YOLOv8x](yolov8.md), alongside an incremental 1.7% improvement in AP.
These results showcase YOLOv9's strategic advancements in model design, emphasizing its enhanced efficiency without compromising on the precision essential for real-time object detection tasks. The model not only pushes the boundaries of performance metrics but also emphasizes the importance of computational efficiency, making it a pivotal development in the field of computer vision.
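As a rough way to check the detection figures above with the released weights (a sketch, assuming the COCO dataset is available locally or can be downloaded via `coco.yaml`):

```python
from ultralytics import YOLO

# Load the released YOLOv9c weights
model = YOLO("yolov9c.pt")

# Validate on COCO using the standard dataset configuration
metrics = model.val(data="coco.yaml")
print(metrics.box.map)  # mAP 50-95
```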
@ -125,13 +141,17 @@ This example provides simple YOLOv9 training and inference examples. For full do
The YOLOv9 series offers a range of models, each optimized for high-performance [Object Detection](../tasks/detect.md). These models cater to varying computational needs and accuracy requirements, making them versatile for a wide array of applications.
| Model Type | Pre-trained Weights | Tasks Supported | Inference | Validation | Training | Export |
|------------|-----------------------------------------------------------------------------------------|----------------------------------------|-----------|------------|----------|--------|
| YOLOv9-C | [yolov9c.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9c.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ✅ | ✅ |
| YOLOv9-E | [yolov9e.pt](https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov9e.pt) | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ✅ | ✅ |
| Model | Filenames | Tasks | Inference | Validation | Training | Export |
|------------|-----------------------------------|----------------------------------------------|-----------|------------|----------|--------|
| YOLOv9 | `yolov9c.pt` `yolov9e.pt` | [Object Detection](../tasks/detect.md) | ✅ | ✅ | ✅ | ✅ |
| YOLOv9-seg | `yolov9c-seg.pt` `yolov9e-seg.pt` | [Instance Segmentation](../tasks/segment.md) | ✅ | ✅ | ✅ | ✅ |
This table provides a detailed overview of the YOLOv9 model variants, highlighting their capabilities in object detection tasks and their compatibility with various operational modes such as [Inference](../modes/predict.md), [Validation](../modes/val.md), [Training](../modes/train.md), and [Export](../modes/export.md). This comprehensive support ensures that users can fully leverage the capabilities of YOLOv9 models in a broad range of object detection scenarios.
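Both variants load through the same `YOLO` interface; a minimal sketch:

```python
from ultralytics import YOLO

# Object detection with YOLOv9
det_model = YOLO("yolov9c.pt")
det_results = det_model("https://ultralytics.com/images/bus.jpg")

# Instance segmentation with YOLOv9-seg
seg_model = YOLO("yolov9c-seg.pt")
seg_results = seg_model("https://ultralytics.com/images/bus.jpg")
```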
!!! note
Training YOLOv9 models will require _more_ resources **and** take longer than an equivalently sized [YOLOv8 model](yolov8.md).
## Citations and Acknowledgements
We would like to acknowledge the YOLOv9 authors for their significant contributions in the field of real-time object detection:

@ -364,7 +364,7 @@ Inference arguments:
|-----------------|----------------|------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `source` | `str` | `'ultralytics/assets'` | Specifies the data source for inference. Can be an image path, video file, directory, URL, or device ID for live feeds. Supports a wide range of formats and sources, enabling flexible application across different types of input. |
| `conf` | `float` | `0.25` | Sets the minimum confidence threshold for detections. Objects detected with confidence below this threshold will be disregarded. Adjusting this value can help reduce false positives. |
| `iou`           | `float`        | `0.7`                  | Intersection Over Union (IoU) threshold for Non-Maximum Suppression (NMS). Lower values result in fewer detections by eliminating overlapping boxes, useful for reducing duplicates.                                                     |
| `imgsz` | `int or tuple` | `640` | Defines the image size for inference. Can be a single integer `640` for square resizing or a (height, width) tuple. Proper sizing can improve detection accuracy and processing speed. |
| `half` | `bool` | `False` | Enables half-precision (FP16) inference, which can speed up model inference on supported GPUs with minimal impact on accuracy. |
| `device` | `str` | `None` | Specifies the device for inference (e.g., `cpu`, `cuda:0` or `0`). Allows users to select between CPU, a specific GPU, or other compute devices for model execution. |

@ -230,24 +230,25 @@ The training settings for YOLO models encompass various hyperparameters and conf
Augmentation techniques are essential for improving the robustness and performance of YOLO models by introducing variability into the training data, helping the model generalize better to unseen data. The following table outlines the purpose and effect of each augmentation argument:
| Argument | Type | Default | Range | Description |
|----------------|---------|---------------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `hsv_h` | `float` | `0.015` | `0.0 - 1.0` | Adjusts the hue of the image by a fraction of the color wheel, introducing color variability. Helps the model generalize across different lighting conditions. |
| `hsv_s` | `float` | `0.7` | `0.0 - 1.0` | Alters the saturation of the image by a fraction, affecting the intensity of colors. Useful for simulating different environmental conditions. |
| `hsv_v` | `float` | `0.4` | `0.0 - 1.0` | Modifies the value (brightness) of the image by a fraction, helping the model to perform well under various lighting conditions. |
| `degrees` | `float` | `0.0` | `-180 - +180` | Rotates the image randomly within the specified degree range, improving the model's ability to recognize objects at various orientations. |
| `translate` | `float` | `0.1` | `0.0 - 1.0` | Translates the image horizontally and vertically by a fraction of the image size, aiding in learning to detect partially visible objects. |
| `scale` | `float` | `0.5` | `>=0.0` | Scales the image by a gain factor, simulating objects at different distances from the camera. |
| `shear` | `float` | `0.0` | `-180 - +180` | Shears the image by a specified degree, mimicking the effect of objects being viewed from different angles. |
| `perspective` | `float` | `0.0` | `0.0 - 0.001` | Applies a random perspective transformation to the image, enhancing the model's ability to understand objects in 3D space. |
| `flipud` | `float` | `0.0` | `0.0 - 1.0` | Flips the image upside down with the specified probability, increasing the data variability without affecting the object's characteristics. |
| `fliplr` | `float` | `0.5` | `0.0 - 1.0` | Flips the image left to right with the specified probability, useful for learning symmetrical objects and increasing dataset diversity. |
| `bgr` | `float` | `0.0` | `0.0 - 1.0` | Flips the image channels from RGB to BGR with the specified probability, useful for increasing robustness to incorrect channel ordering. |
| `mosaic` | `float` | `1.0` | `0.0 - 1.0` | Combines four training images into one, simulating different scene compositions and object interactions. Highly effective for complex scene understanding. |
| `mixup` | `float` | `0.0` | `0.0 - 1.0` | Blends two images and their labels, creating a composite image. Enhances the model's ability to generalize by introducing label noise and visual variability. |
| `copy_paste` | `float` | `0.0` | `0.0 - 1.0` | Copies objects from one image and pastes them onto another, useful for increasing object instances and learning object occlusion. |
| `auto_augment` | `str` | `randaugment` | - | Automatically applies a predefined augmentation policy (`randaugment`, `autoaugment`, `augmix`), optimizing for classification tasks by diversifying the visual features. |
| `erasing` | `float` | `0.4` | `0.0 - 1.0` | Randomly erases a portion of the image during classification training, encouraging the model to focus on less obvious features for recognition. |
| Argument | Type | Default | Range | Description |
|-----------------|---------|---------------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `hsv_h` | `float` | `0.015` | `0.0 - 1.0` | Adjusts the hue of the image by a fraction of the color wheel, introducing color variability. Helps the model generalize across different lighting conditions. |
| `hsv_s` | `float` | `0.7` | `0.0 - 1.0` | Alters the saturation of the image by a fraction, affecting the intensity of colors. Useful for simulating different environmental conditions. |
| `hsv_v` | `float` | `0.4` | `0.0 - 1.0` | Modifies the value (brightness) of the image by a fraction, helping the model to perform well under various lighting conditions. |
| `degrees` | `float` | `0.0` | `-180 - +180` | Rotates the image randomly within the specified degree range, improving the model's ability to recognize objects at various orientations. |
| `translate` | `float` | `0.1` | `0.0 - 1.0` | Translates the image horizontally and vertically by a fraction of the image size, aiding in learning to detect partially visible objects. |
| `scale` | `float` | `0.5` | `>=0.0` | Scales the image by a gain factor, simulating objects at different distances from the camera. |
| `shear` | `float` | `0.0` | `-180 - +180` | Shears the image by a specified degree, mimicking the effect of objects being viewed from different angles. |
| `perspective` | `float` | `0.0` | `0.0 - 0.001` | Applies a random perspective transformation to the image, enhancing the model's ability to understand objects in 3D space. |
| `flipud` | `float` | `0.0` | `0.0 - 1.0` | Flips the image upside down with the specified probability, increasing the data variability without affecting the object's characteristics. |
| `fliplr` | `float` | `0.5` | `0.0 - 1.0` | Flips the image left to right with the specified probability, useful for learning symmetrical objects and increasing dataset diversity. |
| `bgr` | `float` | `0.0` | `0.0 - 1.0` | Flips the image channels from RGB to BGR with the specified probability, useful for increasing robustness to incorrect channel ordering. |
| `mosaic` | `float` | `1.0` | `0.0 - 1.0` | Combines four training images into one, simulating different scene compositions and object interactions. Highly effective for complex scene understanding. |
| `mixup` | `float` | `0.0` | `0.0 - 1.0` | Blends two images and their labels, creating a composite image. Enhances the model's ability to generalize by introducing label noise and visual variability. |
| `copy_paste` | `float` | `0.0` | `0.0 - 1.0` | Copies objects from one image and pastes them onto another, useful for increasing object instances and learning object occlusion. |
| `auto_augment` | `str` | `randaugment` | - | Automatically applies a predefined augmentation policy (`randaugment`, `autoaugment`, `augmix`), optimizing for classification tasks by diversifying the visual features. |
| `erasing` | `float` | `0.4` | `0.0 - 0.9` | Randomly erases a portion of the image during classification training, encouraging the model to focus on less obvious features for recognition. |
| `crop_fraction` | `float` | `1.0` | `0.1 - 1.0` | Crops the classification image to a fraction of its size to emphasize central features and adapt to object scales, reducing background distractions. |
These settings can be adjusted to meet the specific requirements of the dataset and task at hand. Experimenting with different values can help find the optimal augmentation strategy that leads to the best model performance.
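For example, these arguments can be passed directly to `train()` as overrides; a minimal sketch with illustrative values:

```python
from ultralytics import YOLO

model = YOLO("yolov8n.pt")

# Override selected augmentation hyperparameters for a training run
model.train(
    data="coco8.yaml",
    epochs=10,
    imgsz=640,
    hsv_h=0.02,  # slightly stronger hue jitter
    degrees=10.0,  # allow small random rotations
    mosaic=0.5,  # apply mosaic augmentation to half of the batches
    fliplr=0.5,  # horizontal flip probability
)
```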

@ -59,6 +59,10 @@ keywords: Ultralytics, Data Augmentation, BaseTransform, MixUp, RandomHSV, Lette
<br><br>
## ::: ultralytics.data.augment.RandomLoadText
<br><br>
## ::: ultralytics.data.augment.ClassifyLetterBox
<br><br>

@ -27,6 +27,10 @@ keywords: Ultralytics, YOLO v3, Data build, DataLoader, InfiniteDataLoader, seed
<br><br>
## ::: ultralytics.data.build.build_grounding
<br><br>
## ::: ultralytics.data.build.build_dataloader
<br><br>

@ -19,14 +19,18 @@ keywords: Ultralytics, YOLO, YOLODataset, SemanticDataset, data handling, data m
<br><br>
## ::: ultralytics.data.dataset.SemanticDataset
## ::: ultralytics.data.dataset.YOLOMultiModalDataset
<br><br>
## ::: ultralytics.data.dataset.load_dataset_cache_file
## ::: ultralytics.data.dataset.GroundingDataset
<br><br>
## ::: ultralytics.data.dataset.save_dataset_cache_file
## ::: ultralytics.data.dataset.YOLOConcatDataset
<br><br>
## ::: ultralytics.data.dataset.SemanticDataset
<br><br>

@ -66,3 +66,11 @@ keywords: Ultralytics, data utils, YOLO, img2label_paths, exif_size, polygon2mas
## ::: ultralytics.data.utils.autosplit
<br><br>
## ::: ultralytics.data.utils.load_dataset_cache_file
<br><br>
## ::: ultralytics.data.utils.save_dataset_cache_file
<br><br>

@ -0,0 +1,15 @@
# Reference for `ultralytics/models/yolo/world/train.py`
!!! Note
This file is available at [https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/yolo/world/train.py](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/yolo/world/train.py). If you spot a problem please help fix it by [contributing](https://docs.ultralytics.com/help/contributing/) a [Pull Request](https://github.com/ultralytics/ultralytics/edit/main/ultralytics/models/yolo/world/train.py) 🛠. Thank you 🙏!
<br><br>
## ::: ultralytics.models.yolo.world.train.WorldTrainer
<br><br>
## ::: ultralytics.models.yolo.world.train.on_pretrain_routine_end
<br><br>

@ -0,0 +1,11 @@
# Reference for `ultralytics/models/yolo/world/train_world.py`
!!! Note
This file is available at [https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/yolo/world/train_world.py](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/models/yolo/world/train_world.py). If you spot a problem please help fix it by [contributing](https://docs.ultralytics.com/help/contributing/) a [Pull Request](https://github.com/ultralytics/ultralytics/edit/main/ultralytics/models/yolo/world/train_world.py) 🛠. Thank you 🙏!
<br><br>
## ::: ultralytics.models.yolo.world.train_world.WorldTrainerFromScratch
<br><br>

@ -0,0 +1,16 @@
---
description: Discover Ultralytics YOLO's AI Queue Management for retail, using advanced machine learning to enhance customer experience with real-time queue analysis and wait time predictions.
keywords: Ultralytics, YOLO, AI Queue Management, retail analytics, queue detection, wait time prediction, machine learning, YOLOv8, customer experience
---
# Reference for `ultralytics/solutions/queue_management.py`
!!! Note
This file is available at [https://github.com/ultralytics/ultralytics/blob/main/ultralytics/solutions/queue_management.py](https://github.com/ultralytics/ultralytics/blob/main/ultralytics/solutions/queue_management.py). If you spot a problem please help fix it by [contributing](https://docs.ultralytics.com/help/contributing/) a [Pull Request](https://github.com/ultralytics/ultralytics/edit/main/ultralytics/solutions/queue_management.py) 🛠. Thank you 🙏!
<br><br>
## ::: ultralytics.solutions.queue_management.QueueManager
<br><br>

@ -115,6 +115,10 @@ keywords: Ultralytics, Torch Utils, Model EMA, Early Stopping, Smart Inference,
<br><br>
## ::: ultralytics.utils.torch_utils.convert_optimizer_state_dict_to_fp16
<br><br>
## ::: ultralytics.utils.torch_utils.profile
<br><br>

@ -229,24 +229,25 @@ It is crucial to thoughtfully configure these settings to ensure the exported mo
Augmentation techniques are essential for improving the robustness and performance of YOLO models by introducing variability into the training data, helping the model generalize better to unseen data. The following table outlines the purpose and effect of each augmentation argument:
| Argument | Type | Default | Range | Description |
|----------------|---------|---------------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `hsv_h` | `float` | `0.015` | `0.0 - 1.0` | Adjusts the hue of the image by a fraction of the color wheel, introducing color variability. Helps the model generalize across different lighting conditions. |
| `hsv_s` | `float` | `0.7` | `0.0 - 1.0` | Alters the saturation of the image by a fraction, affecting the intensity of colors. Useful for simulating different environmental conditions. |
| `hsv_v` | `float` | `0.4` | `0.0 - 1.0` | Modifies the value (brightness) of the image by a fraction, helping the model to perform well under various lighting conditions. |
| `degrees` | `float` | `0.0` | `-180 - +180` | Rotates the image randomly within the specified degree range, improving the model's ability to recognize objects at various orientations. |
| `translate` | `float` | `0.1` | `0.0 - 1.0` | Translates the image horizontally and vertically by a fraction of the image size, aiding in learning to detect partially visible objects. |
| `scale` | `float` | `0.5` | `>=0.0` | Scales the image by a gain factor, simulating objects at different distances from the camera. |
| `shear` | `float` | `0.0` | `-180 - +180` | Shears the image by a specified degree, mimicking the effect of objects being viewed from different angles. |
| `perspective` | `float` | `0.0` | `0.0 - 0.001` | Applies a random perspective transformation to the image, enhancing the model's ability to understand objects in 3D space. |
| `flipud` | `float` | `0.0` | `0.0 - 1.0` | Flips the image upside down with the specified probability, increasing the data variability without affecting the object's characteristics. |
| `fliplr` | `float` | `0.5` | `0.0 - 1.0` | Flips the image left to right with the specified probability, useful for learning symmetrical objects and increasing dataset diversity. |
| `bgr` | `float` | `0.0` | `0.0 - 1.0` | Flips the image channels from RGB to BGR with the specified probability, useful for increasing robustness to incorrect channel ordering. |
| `mosaic` | `float` | `1.0` | `0.0 - 1.0` | Combines four training images into one, simulating different scene compositions and object interactions. Highly effective for complex scene understanding. |
| `mixup` | `float` | `0.0` | `0.0 - 1.0` | Blends two images and their labels, creating a composite image. Enhances the model's ability to generalize by introducing label noise and visual variability. |
| `copy_paste` | `float` | `0.0` | `0.0 - 1.0` | Copies objects from one image and pastes them onto another, useful for increasing object instances and learning object occlusion. |
| `auto_augment` | `str` | `randaugment` | - | Automatically applies a predefined augmentation policy (`randaugment`, `autoaugment`, `augmix`), optimizing for classification tasks by diversifying the visual features. |
| `erasing` | `float` | `0.4` | `0.0 - 1.0` | Randomly erases a portion of the image during classification training, encouraging the model to focus on less obvious features for recognition. |
| Argument | Type | Default | Range | Description |
|-----------------|---------|---------------|---------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `hsv_h` | `float` | `0.015` | `0.0 - 1.0` | Adjusts the hue of the image by a fraction of the color wheel, introducing color variability. Helps the model generalize across different lighting conditions. |
| `hsv_s` | `float` | `0.7` | `0.0 - 1.0` | Alters the saturation of the image by a fraction, affecting the intensity of colors. Useful for simulating different environmental conditions. |
| `hsv_v` | `float` | `0.4` | `0.0 - 1.0` | Modifies the value (brightness) of the image by a fraction, helping the model to perform well under various lighting conditions. |
| `degrees` | `float` | `0.0` | `-180 - +180` | Rotates the image randomly within the specified degree range, improving the model's ability to recognize objects at various orientations. |
| `translate` | `float` | `0.1` | `0.0 - 1.0` | Translates the image horizontally and vertically by a fraction of the image size, aiding in learning to detect partially visible objects. |
| `scale` | `float` | `0.5` | `>=0.0` | Scales the image by a gain factor, simulating objects at different distances from the camera. |
| `shear` | `float` | `0.0` | `-180 - +180` | Shears the image by a specified degree, mimicking the effect of objects being viewed from different angles. |
| `perspective` | `float` | `0.0` | `0.0 - 0.001` | Applies a random perspective transformation to the image, enhancing the model's ability to understand objects in 3D space. |
| `flipud` | `float` | `0.0` | `0.0 - 1.0` | Flips the image upside down with the specified probability, increasing the data variability without affecting the object's characteristics. |
| `fliplr` | `float` | `0.5` | `0.0 - 1.0` | Flips the image left to right with the specified probability, useful for learning symmetrical objects and increasing dataset diversity. |
| `bgr` | `float` | `0.0` | `0.0 - 1.0` | Flips the image channels from RGB to BGR with the specified probability, useful for increasing robustness to incorrect channel ordering. |
| `mosaic` | `float` | `1.0` | `0.0 - 1.0` | Combines four training images into one, simulating different scene compositions and object interactions. Highly effective for complex scene understanding. |
| `mixup` | `float` | `0.0` | `0.0 - 1.0` | Blends two images and their labels, creating a composite image. Enhances the model's ability to generalize by introducing label noise and visual variability. |
| `copy_paste` | `float` | `0.0` | `0.0 - 1.0` | Copies objects from one image and pastes them onto another, useful for increasing object instances and learning object occlusion. |
| `auto_augment` | `str` | `randaugment` | - | Automatically applies a predefined augmentation policy (`randaugment`, `autoaugment`, `augmix`), optimizing for classification tasks by diversifying the visual features. |
| `erasing` | `float` | `0.4` | `0.0 - 0.9` | Randomly erases a portion of the image during classification training, encouraging the model to focus on less obvious features for recognition. |
| `crop_fraction` | `float` | `1.0` | `0.1 - 1.0` | Crops the classification image to a fraction of its size to emphasize central features and adapt to object scales, reducing background distractions. |
These settings can be adjusted to meet the specific requirements of the dataset and task at hand. Experimenting with different values can help find the optimal augmentation strategy that leads to the best model performance.
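For context, these arguments can be passed straight to `train()`; a minimal sketch with illustrative values:

```python
from ultralytics import YOLO

# Minimal sketch: override a few augmentation arguments at train time (values are illustrative)
model = YOLO("yolov8n.pt")
model.train(
    data="coco8.yaml",  # small sample dataset
    epochs=10,
    hsv_h=0.015,  # hue jitter as a fraction of the color wheel
    degrees=10.0,  # random rotation range in degrees
    fliplr=0.5,  # probability of a left-right flip
    mosaic=1.0,  # probability of mosaic augmentation
)
```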

@ -59,6 +59,28 @@ convert_coco(#(1)!
For additional information about the `convert_coco` function, [visit the reference page](../reference/data/converter.md#ultralytics.data.converter.convert_coco)
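For the newly added LVIS support, the same function accepts an `lvis` flag; a minimal sketch (the annotations path is a placeholder):

```python
from ultralytics.data.converter import convert_coco

# Sketch: convert LVIS annotations to YOLO format (path is a placeholder)
convert_coco("../datasets/lvis/annotations/", use_segments=True, cls91to80=False, lvis=True)
```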
### Get Bounding Box Dimensions
```{.py .annotate }
from ultralytics.utils.plotting import Annotator
from ultralytics import YOLO
import cv2
model = YOLO('yolov8n.pt')  # Load a pretrained or fine-tuned model
# Process the image
source = cv2.imread('path/to/image.jpg')
results = model(source)
# Extract results
annotator = Annotator(source, example=model.names)
for box in results[0].boxes.xyxy.cpu():
width, height, area = annotator.get_bbox_dimension(box)
print("Bounding Box Width {}, Height {}, Area {}".format(
width.item(), height.item(), area.item()))
```
### Convert Bounding Boxes to Segments
With existing `x y w h` bounding box data, convert to segments using the `yolo_bbox2segment` function. The files for images and annotations need to be organized like this:
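A minimal call, assuming the files are organized as described, might look like this sketch (the image directory and SAM checkpoint are placeholders):

```python
from ultralytics.data.converter import yolo_bbox2segment

# Sketch: generate segmentation labels from existing bounding boxes with a SAM model
yolo_bbox2segment(im_dir="path/to/images", save_dir=None, sam_model="sam_b.pt")
```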
@ -211,7 +233,8 @@ boxes.bboxes
See the [`Bboxes` reference section](../reference/utils/instance.md#ultralytics.utils.instance.Bboxes) for more attributes and methods available.
!!! tip
Many of the following functions (and more) can be accessed using the [`Bboxes` class](#bounding-box-horizontal-instances) but if you prefer to work with the functions directly, see the next subsections on how to import these independently.
### Scaling Boxes
@ -258,7 +281,7 @@ new_boxes#(1)!
1. Bounding boxes scaled for the new image size
### Bounding Box Format Conversions
#### XYXY → XYWH
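A minimal sketch of this conversion (the helper also accepts torch tensors):

```python
import numpy as np

from ultralytics.utils.ops import xyxy2xywh

# Sketch: convert corner coordinates (x1, y1, x2, y2) to center format (x, y, w, h)
xyxy = np.array([[100.0, 50.0, 200.0, 150.0]])
print(xyxy2xywh(xyxy))  # [[150. 100. 100. 100.]]
```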
@ -351,6 +374,7 @@ image_with_bboxes = ann.result()
1. Names can be used from `model.names` when [working with detection results](../modes/predict.md#working-with-results)
#### Oriented Bounding Boxes (OBB)
```python
import cv2 as cv
import numpy as np
@ -387,7 +411,7 @@ image_with_obb = ann.result()
See the [`Annotator` Reference Page](../reference/utils/plotting.md#ultralytics.utils.plotting.Annotator) for additional insight.
## Miscellaneous
### Code Profiling
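A minimal sketch using the `Profile` context manager from `ultralytics.utils.ops` (the timed body is a placeholder):

```python
import time

from ultralytics.utils.ops import Profile

# Sketch: time an arbitrary block of code
with Profile() as dt:
    time.sleep(0.1)  # placeholder for real work such as preprocessing or inference

print(dt)  # elapsed time recorded by the context manager
```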

@ -16,8 +16,10 @@ abirami.vina@gmail.com: abirami-vina
ayush.chaurarsia@gmail.com: AyushExel
chr043416@gmail.com: RizwanMunawar
glenn.jocher@ultralytics.com: glenn-jocher
lakshanthad@yahoo.com: lakshanthad
muhammadrizwanmunawar123@gmail.com: RizwanMunawar
not.committed.yet: null
plashchynski@gmail.com: plashchynski
priytosh.revolution@live.com: priytosh-tripathi
shuizhuyuanluo@126.com: null
xinwang614@gmail.com: GreatV

@ -301,12 +301,24 @@ char* YOLO_V8::TensorProcess(clock_t& starttime_1, cv::Mat& iImg, N& blob, std::
break;
}
case YOLO_CLS:
case YOLO_CLS_HALF:
{
cv::Mat rawData;
if (modelType == YOLO_CLS) {
// FP32
rawData = cv::Mat(1, this->classes.size(), CV_32F, output);
} else {
// FP16
rawData = cv::Mat(1, this->classes.size(), CV_16F, output);
rawData.convertTo(rawData, CV_32F);
}
float *data = (float *) rawData.data;
DL_RESULT result;
for (int i = 0; i < this->classes.size(); i++)
{
result.classId = i;
result.confidence = output[i];
result.confidence = data[i];
oResult.push_back(result);
}
break;

@ -29,6 +29,7 @@ enum MODEL_TYPE
//FLOAT16 MODEL
YOLO_DETECT_V8_HALF = 4,
YOLO_POSE_V8_HALF = 5,
YOLO_CLS_HALF = 6
};

@ -240,6 +240,7 @@ nav:
- datasets/detect/index.md
- Argoverse: datasets/detect/argoverse.md
- COCO: datasets/detect/coco.md
- LVIS: datasets/detect/lvis.md
- COCO8: datasets/detect/coco8.md
- GlobalWheat2020: datasets/detect/globalwheat2020.md
- Objects365: datasets/detect/objects365.md
@ -299,6 +300,7 @@ nav:
- Conda Quickstart: guides/conda-quickstart.md
- Docker Quickstart: guides/docker-quickstart.md
- Raspberry Pi: guides/raspberry-pi.md
- NVIDIA Jetson: guides/nvidia-jetson.md
- Triton Inference Server: guides/triton-inference-server.md
- Isolating Segmentation Objects: guides/isolating-segmentation-objects.md
- Edge TPU on Raspberry Pi: guides/coral-edge-tpu-on-raspberry-pi.md
@ -316,6 +318,7 @@ nav:
- VisionEye Mapping: guides/vision-eye.md
- Speed Estimation: guides/speed-estimation.md
- Distance Calculation: guides/distance-calculation.md
- Queue Management: guides/queue-management.md
- YOLOv5:
- yolov5/index.md
- Quickstart: yolov5/quickstart_tutorial.md
@ -352,6 +355,7 @@ nav:
- TF GraphDef: integrations/tf-graphdef.md
- TFLite: integrations/tflite.md
- TFLite Edge TPU: integrations/edge-tpu.md
- TF.js: integrations/tfjs.md
- PaddlePaddle: integrations/paddlepaddle.md
- NCNN: integrations/ncnn.md
- Comet ML: integrations/comet.md
@ -492,6 +496,9 @@ nav:
- predict: reference/models/yolo/segment/predict.md
- train: reference/models/yolo/segment/train.md
- val: reference/models/yolo/segment/val.md
- world:
- train: reference/models/yolo/world/train.md
- train_world: reference/models/yolo/world/train_world.md
- nn:
- autobackend: reference/nn/autobackend.md
- modules:
@ -506,6 +513,7 @@ nav:
- distance_calculation: reference/solutions/distance_calculation.md
- heatmap: reference/solutions/heatmap.md
- object_counter: reference/solutions/object_counter.md
- queue_management: reference/solutions/queue_management.md
- speed_estimation: reference/solutions/speed_estimation.md
- trackers:
- basetrack: reference/trackers/basetrack.md

@ -19,7 +19,7 @@
# For comprehensive documentation and usage instructions, visit: https://docs.ultralytics.com
[build-system]
requires = ["setuptools>=43.0.0", "wheel"]
requires = ["setuptools>=57.0.0", "wheel"]
build-backend = "setuptools.build_meta"
# Project settings -----------------------------------------------------------------------------------------------------
@ -101,6 +101,7 @@ export = [
"openvino>=2024.0.0", # OpenVINO export
"tensorflow<=2.13.1; python_version <= '3.11'", # TF bug https://github.com/ultralytics/ultralytics/issues/5161
"tensorflowjs>=3.9.0; python_version <= '3.11'", # TF.js export, automatically installs tensorflow
"numpy==1.23.5; platform_machine == 'aarch64'", # Fix error: `np.bool` was a deprecated alias for the builtin `bool` when using TensorRT models on NVIDIA Jetson
]
explorer = [
"lancedb", # vector search

@ -2,6 +2,7 @@
import sys
from unittest import mock
from ultralytics import YOLO
from ultralytics.cfg import get_cfg
from ultralytics.engine.exporter import Exporter
@ -52,7 +53,7 @@ def test_detect():
pred.add_callback("on_predict_start", test_func)
assert test_func in pred.callbacks["on_predict_start"], "callback test failed"
# Confirm there is no issue with sys.argv being empty.
with mock.patch.object(sys, 'argv', []):
with mock.patch.object(sys, "argv", []):
result = pred(source=ASSETS, model=f"{MODEL}.pt")
assert len(result), "predictor test failed"

@ -34,6 +34,7 @@ def test_mlflow():
@pytest.mark.skipif(not check_requirements("mlflow", install=False), reason="mlflow not installed")
def test_mlflow_keep_run_active():
import os
import mlflow
"""Test training with MLflow tracking enabled."""

@ -10,7 +10,6 @@ import pytest
import torch
import yaml
from PIL import Image
from torchvision.transforms import ToTensor
from ultralytics import RTDETR, YOLO
from ultralytics.cfg import TASK2DATA
@ -108,20 +107,17 @@ def test_predict_img():
assert len(model(batch, imgsz=32)) == len(batch) # multiple sources in a batch
# Test tensor inference
im = cv2.imread(str(SOURCE)) # OpenCV
t = cv2.resize(im, (32, 32))
t = ToTensor()(t)
t = torch.stack([t, t, t, t])
results = model(t, imgsz=32)
assert len(results) == t.shape[0]
results = seg_model(t, imgsz=32)
assert len(results) == t.shape[0]
results = cls_model(t, imgsz=32)
assert len(results) == t.shape[0]
results = pose_model(t, imgsz=32)
assert len(results) == t.shape[0]
results = obb_model(t, imgsz=32)
assert len(results) == t.shape[0]
im = torch.rand((4, 3, 32, 32)) # batch-size 4, FP32 0.0-1.0 RGB order
results = model(im, imgsz=32)
assert len(results) == im.shape[0]
results = seg_model(im, imgsz=32)
assert len(results) == im.shape[0]
results = cls_model(im, imgsz=32)
assert len(results) == im.shape[0]
results = pose_model(im, imgsz=32)
assert len(results) == im.shape[0]
results = obb_model(im, imgsz=32)
assert len(results) == im.shape[0]
def test_predict_grey_and_4ch():
@ -351,7 +347,7 @@ def test_labels_and_crops():
crop_dirs = [p for p in (save_path / "crops").iterdir()]
crop_files = [f for p in crop_dirs for f in p.glob("*")]
# Crop directories match detections
assert all([r.names.get(c) in [d.name for d in crop_dirs] for c in cls_idxs])
assert all([r.names.get(c) in {d.name for d in crop_dirs} for c in cls_idxs])
# Same number of crops as detections
assert len([f for f in crop_files if im_name in f.name]) == len(r.boxes.data)
@ -518,7 +514,8 @@ def test_utils_files():
@pytest.mark.slow
def test_utils_patches_torch_save():
"""Test torch_save backoff when _torch_save throws RuntimeError."""
from unittest.mock import patch, MagicMock
from unittest.mock import MagicMock, patch
from ultralytics.utils.patches import torch_save
mock = MagicMock(side_effect=RuntimeError)
@ -592,8 +589,6 @@ def image():
)
def test_classify_transforms_train(image, auto_augment, erasing, force_color_jitter):
"""Tests classification transforms during training with various augmentation settings."""
import torchvision.transforms as T
from ultralytics.data.augment import classify_augmentations
transform = classify_augmentations(
@ -610,7 +605,6 @@ def test_classify_transforms_train(image, auto_augment, erasing, force_color_jit
hsv_v=0.4,
force_color_jitter=force_color_jitter,
erasing=erasing,
interpolation=T.InterpolationMode.BILINEAR,
)
transformed_image = transform(Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)))
@ -643,3 +637,28 @@ def test_yolo_world():
model = YOLO("yolov8s-world.pt") # no YOLOv8n-world model yet
model.set_classes(["tree", "window"])
model(ASSETS / "bus.jpg", conf=0.01)
# Training from yaml
model = YOLO("yolov8s-worldv2.yaml") # no YOLOv8n-world model yet
model.train(data="coco8.yaml", epochs=2, imgsz=32, cache="disk", batch=-1, close_mosaic=1, name="yolo-world")
model = YOLO("yolov8s-worldv2.pt") # no YOLOv8n-world model yet
# val
model.val(data="coco8.yaml", imgsz=32, save_txt=True, save_json=True)
# Training from pretrain
model.train(data="coco8.yaml", epochs=2, imgsz=32, cache="disk", batch=-1, close_mosaic=1, name="yolo-world")
# Test WorldTrainerFromScratch
from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
model = YOLO("yolov8s-worldv2.yaml") # no YOLOv8n-world model yet
model.train(
data={"train": {"yolo_data": ["coco8.yaml"]}, "val": {"yolo_data": ["coco8.yaml"]}},
epochs=2,
imgsz=32,
cache="disk",
batch=-1,
close_mosaic=1,
name="yolo-world",
trainer=WorldTrainerFromScratch,
)

@ -1,15 +1,16 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
__version__ = "8.1.37"
__version__ = "8.1.45"
from ultralytics.data.explorer.explorer import Explorer
from ultralytics.models import RTDETR, SAM, YOLO, YOLOWorld
from ultralytics.models.fastsam import FastSAM
from ultralytics.models.nas import NAS
from ultralytics.utils import ASSETS, SETTINGS as settings
from ultralytics.utils import ASSETS, SETTINGS
from ultralytics.utils.checks import check_yolo as checks
from ultralytics.utils.downloads import download
settings = SETTINGS
__all__ = (
"__version__",
"ASSETS",

@ -272,7 +272,7 @@ def get_save_dir(args, name=None):
project = args.project or (ROOT.parent / "tests/tmp/runs" if TESTS_RUNNING else RUNS_DIR) / args.task
name = name or args.name or f"{args.mode}"
save_dir = increment_path(Path(project) / name, exist_ok=args.exist_ok if RANK in (-1, 0) else True)
save_dir = increment_path(Path(project) / name, exist_ok=args.exist_ok if RANK in {-1, 0} else True)
return Path(save_dir)
@ -566,10 +566,10 @@ def entrypoint(debug=""):
task = model.task
# Mode
if mode in ("predict", "track") and "source" not in overrides:
if mode in {"predict", "track"} and "source" not in overrides:
overrides["source"] = DEFAULT_CFG.source or ASSETS
LOGGER.warning(f"WARNING ⚠ 'source' argument is missing. Using default 'source={overrides['source']}'.")
elif mode in ("train", "val"):
elif mode in {"train", "val"}:
if "data" not in overrides and "resume" not in overrides:
overrides["data"] = DEFAULT_CFG.data or TASK2DATA.get(task or DEFAULT_CFG.task, DEFAULT_CFG.data)
LOGGER.warning(f"WARNING ⚠ 'data' argument is missing. Using default 'data={overrides['data']}'.")

File diff suppressed because it is too large

@ -116,8 +116,8 @@ mosaic: 1.0 # (float) image mosaic (probability)
mixup: 0.0 # (float) image mixup (probability)
copy_paste: 0.0 # (float) segment copy-paste (probability)
auto_augment: randaugment # (str) auto augmentation policy for classification (randaugment, autoaugment, augmix)
erasing: 0.4 # (float) probability of random erasing during classification training (0-1)
crop_fraction: 1.0 # (float) image crop fraction for classification evaluation/inference (0-1)
erasing: 0.4 # (float) probability of random erasing during classification training (0-0.9), 0 means no erasing, must be less than 1.0.
crop_fraction: 1.0 # (float) image crop fraction for classification (0.1-1), 1.0 means no crop, must be greater than 0.
# Custom config.yaml ---------------------------------------------------------------------------------------------------
cfg: # (str, optional) for overriding defaults.yaml

@ -0,0 +1,38 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv9c-seg
# 654 layers, 27897120 parameters, 159.4 GFLOPs
# parameters
nc: 80 # number of classes
# gelan backbone
backbone:
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 1, RepNCSPELAN4, [256, 128, 64, 1]] # 2
- [-1, 1, ADown, [256]] # 3-P3/8
- [-1, 1, RepNCSPELAN4, [512, 256, 128, 1]] # 4
- [-1, 1, ADown, [512]] # 5-P4/16
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]] # 6
- [-1, 1, ADown, [512]] # 7-P5/32
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]] # 8
- [-1, 1, SPPELAN, [512, 256]] # 9
head:
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]] # 12
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 1, RepNCSPELAN4, [256, 256, 128, 1]] # 15 (P3/8-small)
- [-1, 1, ADown, [256]]
- [[-1, 12], 1, Concat, [1]] # cat head P4
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]] # 18 (P4/16-medium)
- [-1, 1, ADown, [512]]
- [[-1, 9], 1, Concat, [1]] # cat head P5
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]] # 21 (P5/32-large)
- [[15, 18, 21], 1, Segment, [nc, 32, 256]] # Segment(P3, P4, P5)
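This config can be loaded directly by the `YOLO` class; a minimal sketch (training arguments are placeholders):

```python
from ultralytics import YOLO

# Sketch: build a YOLOv9c-seg model from the YAML above and run a short training job
model = YOLO("yolov9c-seg.yaml")
model.train(data="coco8-seg.yaml", epochs=1, imgsz=640)
```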

@ -1,4 +1,6 @@
# YOLOv9
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv9c
# 618 layers, 25590912 parameters, 104.0 GFLOPs
# parameters
nc: 80 # number of classes
@ -33,4 +35,4 @@ head:
- [[-1, 9], 1, Concat, [1]] # cat head P5
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 1]] # 21 (P5/32-large)
- [[15, 18, 21], 1, Detect, [nc]] # DDetect(P3, P4, P5)
- [[15, 18, 21], 1, Detect, [nc]] # Detect(P3, P4, P5)

@ -0,0 +1,61 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv9e-seg
# 1261 layers, 60512800 parameters, 248.4 GFLOPs
# parameters
nc: 80 # number of classes
# gelan backbone
backbone:
- [-1, 1, Silence, []]
- [-1, 1, Conv, [64, 3, 2]] # 1-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 2-P2/4
- [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]] # 3
- [-1, 1, ADown, [256]] # 4-P3/8
- [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]] # 5
- [-1, 1, ADown, [512]] # 6-P4/16
- [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]] # 7
- [-1, 1, ADown, [1024]] # 8-P5/32
- [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]] # 9
- [1, 1, CBLinear, [[64]]] # 10
- [3, 1, CBLinear, [[64, 128]]] # 11
- [5, 1, CBLinear, [[64, 128, 256]]] # 12
- [7, 1, CBLinear, [[64, 128, 256, 512]]] # 13
- [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]] # 14
- [0, 1, Conv, [64, 3, 2]] # 15-P1/2
- [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]] # 16
- [-1, 1, Conv, [128, 3, 2]] # 17-P2/4
- [[11, 12, 13, 14, -1], 1, CBFuse, [[1, 1, 1, 1]]] # 18
- [-1, 1, RepNCSPELAN4, [256, 128, 64, 2]] # 19
- [-1, 1, ADown, [256]] # 20-P3/8
- [[12, 13, 14, -1], 1, CBFuse, [[2, 2, 2]]] # 21
- [-1, 1, RepNCSPELAN4, [512, 256, 128, 2]] # 22
- [-1, 1, ADown, [512]] # 23-P4/16
- [[13, 14, -1], 1, CBFuse, [[3, 3]]] # 24
- [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]] # 25
- [-1, 1, ADown, [1024]] # 26-P5/32
- [[14, -1], 1, CBFuse, [[4]]] # 27
- [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]] # 28
- [-1, 1, SPPELAN, [512, 256]] # 29
# gelan head
head:
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 25], 1, Concat, [1]] # cat backbone P4
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]] # 32
- [-1, 1, nn.Upsample, [None, 2, 'nearest']]
- [[-1, 22], 1, Concat, [1]] # cat backbone P3
- [-1, 1, RepNCSPELAN4, [256, 256, 128, 2]] # 35 (P3/8-small)
- [-1, 1, ADown, [256]]
- [[-1, 32], 1, Concat, [1]] # cat head P4
- [-1, 1, RepNCSPELAN4, [512, 512, 256, 2]] # 38 (P4/16-medium)
- [-1, 1, ADown, [512]]
- [[-1, 29], 1, Concat, [1]] # cat head P5
- [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]] # 41 (P5/32-large)
- [[35, 38, 41], 1, Segment, [nc, 32, 256]] # Segment (P3, P4, P5)

@ -1,4 +1,6 @@
# YOLOv9
# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv9e
# 1225 layers, 58206592 parameters, 193.0 GFLOPs
# parameters
nc: 80 # number of classes
@ -15,13 +17,13 @@ backbone:
- [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]] # 7
- [-1, 1, ADown, [1024]] # 8-P5/32
- [-1, 1, RepNCSPELAN4, [1024, 512, 256, 2]] # 9
- [1, 1, CBLinear, [[64]]] # 10
- [3, 1, CBLinear, [[64, 128]]] # 11
- [5, 1, CBLinear, [[64, 128, 256]]] # 12
- [7, 1, CBLinear, [[64, 128, 256, 512]]] # 13
- [9, 1, CBLinear, [[64, 128, 256, 512, 1024]]] # 14
- [0, 1, Conv, [64, 3, 2]] # 15-P1/2
- [[10, 11, 12, 13, 14, -1], 1, CBFuse, [[0, 0, 0, 0, 0]]] # 16
- [-1, 1, Conv, [128, 3, 2]] # 17-P2/4
@ -56,5 +58,4 @@ head:
- [[-1, 29], 1, Concat, [1]] # cat head P5
- [-1, 1, RepNCSPELAN4, [512, 1024, 512, 2]] # 41 (P5/32-large)
# detect
- [[35, 38, 41], 1, Detect, [nc]] # Detect(P3, P4, P5)

@ -1,15 +1,26 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .base import BaseDataset
from .build import build_dataloader, build_yolo_dataset, load_inference_source
from .dataset import ClassificationDataset, SemanticDataset, YOLODataset
from .build import build_dataloader, build_grounding, build_yolo_dataset, load_inference_source
from .dataset import (
ClassificationDataset,
GroundingDataset,
SemanticDataset,
YOLOConcatDataset,
YOLODataset,
YOLOMultiModalDataset,
)
__all__ = (
"BaseDataset",
"ClassificationDataset",
"SemanticDataset",
"YOLODataset",
"YOLOMultiModalDataset",
"YOLOConcatDataset",
"GroundingDataset",
"build_yolo_dataset",
"build_grounding",
"build_dataloader",
"load_inference_source",
)

@ -3,11 +3,12 @@
import math
import random
from copy import deepcopy
from typing import Tuple, Union
import cv2
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from ultralytics.utils import LOGGER, colorstr
from ultralytics.utils.checks import check_version
@ -19,7 +20,7 @@ from .utils import polygons2masks, polygons2masks_overlap
DEFAULT_MEAN = (0.0, 0.0, 0.0)
DEFAULT_STD = (1.0, 1.0, 1.0)
DEFAULT_CROP_FTACTION = 1.0
DEFAULT_CROP_FRACTION = 1.0
# TODO: we might need a BaseTransform to make all these augments be compatible with both classification and semantic
@ -66,7 +67,7 @@ class Compose:
def __init__(self, transforms):
"""Initializes the Compose object with a list of transforms."""
self.transforms = transforms
self.transforms = transforms if isinstance(transforms, list) else [transforms]
def __call__(self, data):
"""Applies a series of transformations to input data."""
@ -78,6 +79,29 @@ class Compose:
"""Appends a new transform to the existing list of transforms."""
self.transforms.append(transform)
def insert(self, index, transform):
"""Inserts a new transform to the existing list of transforms."""
self.transforms.insert(index, transform)
def __getitem__(self, index: Union[list, int]) -> "Compose":
"""Retrieve a specific transform or a set of transforms using indexing."""
assert isinstance(index, (int, list)), f"The indices should be either list or int type but got {type(index)}"
index = [index] if isinstance(index, int) else index
return Compose([self.transforms[i] for i in index])
def __setitem__(self, index: Union[list, int], value: Union[list, int]) -> None:
"""Retrieve a specific transform or a set of transforms using indexing."""
assert isinstance(index, (int, list)), f"The indices should be either list or int type but got {type(index)}"
if isinstance(index, list):
assert isinstance(
value, list
), f"The indices should be the same type as values, but got {type(index)} and {type(value)}"
if isinstance(index, int):
index, value = [index], [value]
for i, v in zip(index, value):
assert i < len(self.transforms), f"list index {i} out of range {len(self.transforms)}."
self.transforms[i] = v
def tolist(self):
"""Converts the list of transforms to a standard Python list."""
return self.transforms
@ -118,6 +142,8 @@ class BaseMixTransform:
mix_labels[i] = self.pre_transform(data)
labels["mix_labels"] = mix_labels
# Update cls and texts
labels = self._update_label_text(labels)
# Mosaic or MixUp
labels = self._mix_transform(labels)
labels.pop("mix_labels", None)
@ -131,6 +157,22 @@ class BaseMixTransform:
"""Gets a list of shuffled indexes for mosaic augmentation."""
raise NotImplementedError
def _update_label_text(self, labels):
"""Update label text."""
if "texts" not in labels:
return labels
mix_texts = sum([labels["texts"]] + [x["texts"] for x in labels["mix_labels"]], [])
mix_texts = list({tuple(x) for x in mix_texts})
text2id = {text: i for i, text in enumerate(mix_texts)}
for label in [labels] + labels["mix_labels"]:
for i, cls in enumerate(label["cls"].squeeze(-1).tolist()):
text = label["texts"][int(cls)]
label["cls"][i] = text2id[tuple(text)]
label["texts"] = mix_texts
return labels
class Mosaic(BaseMixTransform):
"""
@ -149,7 +191,7 @@ class Mosaic(BaseMixTransform):
def __init__(self, dataset, imgsz=640, p=1.0, n=4):
"""Initializes the object with a dataset, image size, probability, and border."""
assert 0 <= p <= 1.0, f"The probability should be in range [0, 1], but got {p}."
assert n in (4, 9), "grid must be equal to 4 or 9."
assert n in {4, 9}, "grid must be equal to 4 or 9."
super().__init__(dataset=dataset, p=p)
self.dataset = dataset
self.imgsz = imgsz
@ -320,6 +362,8 @@ class Mosaic(BaseMixTransform):
final_labels["instances"].clip(imgsz, imgsz)
good = final_labels["instances"].remove_zero_area_boxes()
final_labels["cls"] = final_labels["cls"][good]
if "texts" in mosaic_labels[0]:
final_labels["texts"] = mosaic_labels[0]["texts"]
return final_labels
@ -641,7 +685,7 @@ class RandomFlip:
Default is 'horizontal'.
flip_idx (array-like, optional): Index mapping for flipping keypoints, if any.
"""
assert direction in ["horizontal", "vertical"], f"Support direction `horizontal` or `vertical`, got {direction}"
assert direction in {"horizontal", "vertical"}, f"Support direction `horizontal` or `vertical`, got {direction}"
assert 0 <= p <= 1.0
self.p = p
@ -970,6 +1014,83 @@ class Format:
return masks, instances, cls
class RandomLoadText:
"""
Randomly sample positive and negative texts and update the class indices according to the number of samples.
Attributes:
prompt_format (str): Format for prompt. Default is '{}'.
neg_samples (tuple[int]): A range from which to randomly sample the number of negative texts. Default is (80, 80).
max_samples (int): The maximum number of different text samples in one image. Default is 80.
padding (bool): Whether to pad texts to max_samples. Default is False.
padding_value (str): The padding text. Default is "".
"""
def __init__(
self,
prompt_format: str = "{}",
neg_samples: Tuple[int, int] = (80, 80),
max_samples: int = 80,
padding: bool = False,
padding_value: str = "",
) -> None:
"""Initializes the RandomLoadText class with given parameters."""
self.prompt_format = prompt_format
self.neg_samples = neg_samples
self.max_samples = max_samples
self.padding = padding
self.padding_value = padding_value
def __call__(self, labels: dict) -> dict:
"""Return updated classes and texts."""
assert "texts" in labels, "No texts found in labels."
class_texts = labels["texts"]
num_classes = len(class_texts)
cls = np.asarray(labels.pop("cls"), dtype=int)
pos_labels = np.unique(cls).tolist()
if len(pos_labels) > self.max_samples:
pos_labels = set(random.sample(pos_labels, k=self.max_samples))
neg_samples = min(min(num_classes, self.max_samples) - len(pos_labels), random.randint(*self.neg_samples))
neg_labels = []
for i in range(num_classes):
if i not in pos_labels:
neg_labels.append(i)
neg_labels = random.sample(neg_labels, k=neg_samples)
sampled_labels = list(pos_labels) + neg_labels  # list() handles the set created when positives exceed max_samples
random.shuffle(sampled_labels)
label2ids = {label: i for i, label in enumerate(sampled_labels)}
valid_idx = np.zeros(len(labels["instances"]), dtype=bool)
new_cls = []
for i, label in enumerate(cls.squeeze(-1).tolist()):
if label not in label2ids:
continue
valid_idx[i] = True
new_cls.append([label2ids[label]])
labels["instances"] = labels["instances"][valid_idx]
labels["cls"] = np.array(new_cls)
# Randomly select one prompt when there are multiple prompts
texts = []
for label in sampled_labels:
prompts = class_texts[label]
assert len(prompts) > 0
prompt = self.prompt_format.format(prompts[random.randrange(len(prompts))])
texts.append(prompt)
if self.padding:
valid_labels = len(pos_labels) + len(neg_labels)
num_padding = self.max_samples - valid_labels
if num_padding > 0:
texts += [self.padding_value] * num_padding
labels["texts"] = texts
return labels
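# Usage sketch (illustrative): the multi-modal datasets in dataset.py insert this transform near the
# end of their training pipelines, e.g. transforms.insert(-1, RandomLoadText(max_samples=80, padding=True)),
# so sampled class texts travel with each batch alongside boxes and class indices.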
def v8_transforms(dataset, imgsz, hyp, stretch=False):
"""Convert images to a size suitable for YOLOv8 training."""
pre_transform = Compose(
@ -1012,8 +1133,8 @@ def classify_transforms(
size=224,
mean=DEFAULT_MEAN,
std=DEFAULT_STD,
interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,
crop_fraction: float = DEFAULT_CROP_FTACTION,
interpolation=Image.BILINEAR,
crop_fraction: float = DEFAULT_CROP_FRACTION,
):
"""
Classification transforms for evaluation/inference. Inspired by timm/data/transforms_factory.py.
@ -1028,6 +1149,7 @@ def classify_transforms(
Returns:
(T.Compose): torchvision transforms
"""
import torchvision.transforms as T # scope for faster 'import ultralytics'
if isinstance(size, (tuple, list)):
assert len(size) == 2
@ -1036,12 +1158,12 @@ def classify_transforms(
scale_size = math.floor(size / crop_fraction)
scale_size = (scale_size, scale_size)
# aspect ratio is preserved, crops center within image, no borders are added, image is lost
# Aspect ratio is preserved, crops center within image, no borders are added, image is lost
if scale_size[0] == scale_size[1]:
# simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg)
# Simple case, use torchvision built-in Resize with the shortest edge mode (scalar size arg)
tfl = [T.Resize(scale_size[0], interpolation=interpolation)]
else:
# resize shortest edge to matching target dim for non-square target
# Resize the shortest edge to matching target dim for non-square target
tfl = [T.Resize(scale_size)]
tfl += [T.CenterCrop(size)]
@ -1056,7 +1178,7 @@ def classify_transforms(
return T.Compose(tfl)
# Classification augmentations train ---------------------------------------------------------------------------------------
# Classification training augmentations --------------------------------------------------------------------------------
def classify_augmentations(
size=224,
mean=DEFAULT_MEAN,
@ -1071,7 +1193,7 @@ def classify_augmentations(
hsv_v=0.4, # image HSV-Value augmentation (fraction)
force_color_jitter=False,
erasing=0.0,
interpolation: T.InterpolationMode = T.InterpolationMode.BILINEAR,
interpolation=Image.BILINEAR,
):
"""
Classification transforms with augmentation for training. Inspired by timm/data/transforms_factory.py.
@ -1095,7 +1217,9 @@ def classify_augmentations(
Returns:
(T.Compose): torchvision transforms
"""
# Transforms to apply if albumentations not installed
# Transforms to apply if Albumentations not installed
import torchvision.transforms as T # scope for faster 'import ultralytics'
if not isinstance(size, int):
raise TypeError(f"classify_transforms() size {size} must be integer, not (list, tuple)")
scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range

@ -15,7 +15,7 @@ import psutil
from torch.utils.data import Dataset
from ultralytics.utils import DEFAULT_CFG, LOCAL_RANK, LOGGER, NUM_THREADS, TQDM
from .utils import HELP_URL, IMG_FORMATS
from .utils import FORMATS_HELP_MSG, HELP_URL, IMG_FORMATS
class BaseDataset(Dataset):
@ -86,13 +86,12 @@ class BaseDataset(Dataset):
self.buffer = [] # buffer size = batch size
self.max_buffer_length = min((self.ni, self.batch_size * 8, 1000)) if self.augment else 0
# Cache images
if cache == "ram" and not self.check_cache_ram():
cache = False
# Cache images (options are cache = True, False, None, "ram", "disk")
self.ims, self.im_hw0, self.im_hw = [None] * self.ni, [None] * self.ni, [None] * self.ni
self.npy_files = [Path(f).with_suffix(".npy") for f in self.im_files]
if cache:
self.cache_images(cache)
self.cache = cache.lower() if isinstance(cache, str) else "ram" if cache is True else None
if (self.cache == "ram" and self.check_cache_ram()) or self.cache == "disk":
self.cache_images()
# Transforms
self.transforms = self.build_transforms(hyp=hyp)
@ -116,13 +115,11 @@ class BaseDataset(Dataset):
raise FileNotFoundError(f"{self.prefix}{p} does not exist")
im_files = sorted(x.replace("/", os.sep) for x in f if x.split(".")[-1].lower() in IMG_FORMATS)
# self.img_files = sorted([x for x in f if x.suffix[1:].lower() in IMG_FORMATS]) # pathlib
assert im_files, f"{self.prefix}No images found in {img_path}"
assert im_files, f"{self.prefix}No images found in {img_path}. {FORMATS_HELP_MSG}"
except Exception as e:
raise FileNotFoundError(f"{self.prefix}Error loading data from {img_path}\n{HELP_URL}") from e
if self.fraction < 1:
# im_files = im_files[: round(len(im_files) * self.fraction)]
num_elements_to_select = round(len(im_files) * self.fraction)
im_files = random.sample(im_files, num_elements_to_select)
im_files = im_files[: round(len(im_files) * self.fraction)] # retain a fraction of the dataset
return im_files
def update_labels(self, include_class: Optional[list]):
@ -175,26 +172,27 @@ class BaseDataset(Dataset):
self.buffer.append(i)
if len(self.buffer) >= self.max_buffer_length:
j = self.buffer.pop(0)
self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
if self.cache != "ram":
self.ims[j], self.im_hw0[j], self.im_hw[j] = None, None, None
return im, (h0, w0), im.shape[:2]
return self.ims[i], self.im_hw0[i], self.im_hw[i]
def cache_images(self, cache):
def cache_images(self):
"""Cache images to memory or disk."""
b, gb = 0, 1 << 30 # bytes of cached images, bytes per gigabytes
fcn = self.cache_images_to_disk if cache == "disk" else self.load_image
fcn, storage = (self.cache_images_to_disk, "Disk") if self.cache == "disk" else (self.load_image, "RAM")
with ThreadPool(NUM_THREADS) as pool:
results = pool.imap(fcn, range(self.ni))
pbar = TQDM(enumerate(results), total=self.ni, disable=LOCAL_RANK > 0)
for i, x in pbar:
if cache == "disk":
if self.cache == "disk":
b += self.npy_files[i].stat().st_size
else: # 'ram'
self.ims[i], self.im_hw0[i], self.im_hw[i] = x # im, hw_orig, hw_resized = load_image(self, i)
b += self.ims[i].nbytes
pbar.desc = f"{self.prefix}Caching images ({b / gb:.1f}GB {cache})"
pbar.desc = f"{self.prefix}Caching images ({b / gb:.1f}GB {storage})"
pbar.close()
def cache_images_to_disk(self, i):
@ -213,15 +211,15 @@ class BaseDataset(Dataset):
b += im.nbytes * ratio**2
mem_required = b * self.ni / n * (1 + safety_margin) # GB required to cache dataset into RAM
mem = psutil.virtual_memory()
cache = mem_required < mem.available # to cache or not to cache, that is the question
if not cache:
success = mem_required < mem.available # to cache or not to cache, that is the question
if not success:
self.cache = None
LOGGER.info(
f'{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images '
f'with {int(safety_margin * 100)}% safety margin but only '
f'{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, '
f"{'caching images ✅' if cache else 'not caching images ⚠'}"
f"{self.prefix}{mem_required / gb:.1f}GB RAM required to cache images "
f"with {int(safety_margin * 100)}% safety margin but only "
f"{mem.available / gb:.1f}/{mem.total / gb:.1f}GB available, not caching images ⚠"
)
return cache
return success
def set_rectangle(self):
"""Sets the shape of bounding boxes for YOLO detections as rectangles."""

@ -22,7 +22,7 @@ from ultralytics.data.loaders import (
from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
from ultralytics.utils import RANK, colorstr
from ultralytics.utils.checks import check_file
from .dataset import YOLODataset
from .dataset import GroundingDataset, YOLODataset, YOLOMultiModalDataset
from .utils import PIN_MEMORY
@ -82,9 +82,10 @@ def seed_worker(worker_id): # noqa
random.seed(worker_seed)
def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32):
def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, stride=32, multi_modal=False):
"""Build YOLO Dataset."""
return YOLODataset(
dataset = YOLOMultiModalDataset if multi_modal else YOLODataset
return dataset(
img_path=img_path,
imgsz=cfg.imgsz,
batch_size=batch,
@ -103,6 +104,27 @@ def build_yolo_dataset(cfg, img_path, batch, data, mode="train", rect=False, str
)
def build_grounding(cfg, img_path, json_file, batch, mode="train", rect=False, stride=32):
"""Build YOLO Dataset."""
return GroundingDataset(
img_path=img_path,
json_file=json_file,
imgsz=cfg.imgsz,
batch_size=batch,
augment=mode == "train", # augmentation
hyp=cfg, # TODO: probably add a get_hyps_from_cfg function
rect=cfg.rect or rect, # rectangular batches
cache=cfg.cache or None,
single_cls=cfg.single_cls or False,
stride=int(stride),
pad=0.0 if mode == "train" else 0.5,
prefix=colorstr(f"{mode}: "),
task=cfg.task,
classes=cfg.classes,
fraction=cfg.fraction if mode == "train" else 1.0,
)
def build_dataloader(dataset, batch, workers, shuffle=True, rank=-1):
"""Return an InfiniteDataLoader or DataLoader for training or validation set."""
batch = min(batch, len(dataset))

@ -219,6 +219,7 @@ def convert_coco(
use_segments=False,
use_keypoints=False,
cls91to80=True,
lvis=False,
):
"""
Converts COCO dataset annotations to a YOLO annotation format suitable for training YOLO models.
@ -229,12 +230,14 @@ def convert_coco(
use_segments (bool, optional): Whether to include segmentation masks in the output.
use_keypoints (bool, optional): Whether to include keypoint annotations in the output.
cls91to80 (bool, optional): Whether to map 91 COCO class IDs to the corresponding 80 COCO class IDs.
lvis (bool, optional): Whether to convert the data using the LVIS dataset layout.
Example:
```python
from ultralytics.data.converter import convert_coco
convert_coco('../datasets/coco/annotations/', use_segments=True, use_keypoints=False, cls91to80=True)
convert_coco('../datasets/lvis/annotations/', use_segments=True, use_keypoints=False, cls91to80=False, lvis=True)
```
Output:
@ -251,8 +254,14 @@ def convert_coco(
# Import json
for json_file in sorted(Path(labels_dir).resolve().glob("*.json")):
fn = Path(save_dir) / "labels" / json_file.stem.replace("instances_", "") # folder name
lname = "" if lvis else json_file.stem.replace("instances_", "")
fn = Path(save_dir) / "labels" / lname # folder name
fn.mkdir(parents=True, exist_ok=True)
if lvis:
# NOTE: create folders for both train and val in advance,
# since LVIS val set contains images from COCO 2017 train in addition to the COCO 2017 val split.
(fn / "train2017").mkdir(parents=True, exist_ok=True)
(fn / "val2017").mkdir(parents=True, exist_ok=True)
with open(json_file) as f:
data = json.load(f)
@ -263,16 +272,20 @@ def convert_coco(
for ann in data["annotations"]:
imgToAnns[ann["image_id"]].append(ann)
image_txt = []
# Write labels file
for img_id, anns in TQDM(imgToAnns.items(), desc=f"Annotations {json_file}"):
img = images[f"{img_id:d}"]
h, w, f = img["height"], img["width"], img["file_name"]
h, w = img["height"], img["width"]
f = str(Path(img["coco_url"]).relative_to("http://images.cocodataset.org")) if lvis else img["file_name"]
if lvis:
image_txt.append(str(Path("./images") / f))
bboxes = []
segments = []
keypoints = []
for ann in anns:
if ann["iscrowd"]:
if ann.get("iscrowd", False):
continue
# The COCO box format is [top left x, top left y, width, height]
box = np.array(ann["bbox"], dtype=np.float64)
@ -314,7 +327,12 @@ def convert_coco(
) # cls, box or segments
file.write(("%g " * len(line)).rstrip() % line + "\n")
LOGGER.info(f"COCO data converted successfully.\nResults saved to {save_dir.resolve()}")
if lvis:
with open((Path(save_dir) / json_file.name.replace("lvis_v1_", "").replace(".json", ".txt")), "a") as f:
for l in image_txt:
f.write(f"{l}\n")
LOGGER.info(f"{'LVIS' if lvis else 'COCO'} data converted successfully.\nResults saved to {save_dir.resolve()}")
def convert_dota_to_yolo_obb(dota_root_path: str):
@ -463,7 +481,7 @@ def merge_multi_segment(segments):
segments[i] = np.roll(segments[i], -idx[0], axis=0)
segments[i] = np.concatenate([segments[i], segments[i][:1]])
# Deal with the first segment and the last one
if i in [0, len(idx_list) - 1]:
if i in {0, len(idx_list) - 1}:
s.append(segments[i])
else:
idx = [0, idx[1] - idx[0]]
@ -471,7 +489,7 @@ def merge_multi_segment(segments):
else:
for i in range(len(idx_list) - 1, -1, -1):
if i not in [0, len(idx_list) - 1]:
if i not in {0, len(idx_list) - 1}:
idx = idx_list[i]
nidx = abs(idx[1] - idx[0])
s.append(segments[i][nidx:])
@ -501,11 +519,12 @@ def yolo_bbox2segment(im_dir, save_dir=None, sam_model="sam_b.pt"):
..
NNN.txt
"""
from tqdm import tqdm
from ultralytics import SAM
from ultralytics.data import YOLODataset
from ultralytics.utils.ops import xywh2xyxy
from ultralytics.utils import LOGGER
from ultralytics import SAM
from tqdm import tqdm
from ultralytics.utils.ops import xywh2xyxy
# NOTE: add placeholder to pass class index check
dataset = YOLODataset(im_dir, data=dict(names=list(range(1000))))

@ -1,5 +1,8 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import contextlib
import json
from collections import defaultdict
from itertools import repeat
from multiprocessing.pool import ThreadPool
from pathlib import Path
@ -7,14 +10,32 @@ from pathlib import Path
import cv2
import numpy as np
import torch
import torchvision
from PIL import Image
from torch.utils.data import ConcatDataset
from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr, is_dir_writeable
from ultralytics.utils import LOCAL_RANK, NUM_THREADS, TQDM, colorstr
from ultralytics.utils.ops import resample_segments
from .augment import Compose, Format, Instances, LetterBox, classify_augmentations, classify_transforms, v8_transforms
from .augment import (
Compose,
Format,
Instances,
LetterBox,
RandomLoadText,
classify_augmentations,
classify_transforms,
v8_transforms,
)
from .base import BaseDataset
from .utils import HELP_URL, LOGGER, get_hash, img2label_paths, verify_image, verify_image_label
from .utils import (
HELP_URL,
LOGGER,
get_hash,
img2label_paths,
load_dataset_cache_file,
save_dataset_cache_file,
verify_image,
verify_image_label,
)
# Ultralytics dataset *.cache version, >= 1.0.0 for YOLOv8
DATASET_CACHE_VERSION = "1.0.3"
@ -56,7 +77,7 @@ class YOLODataset(BaseDataset):
desc = f"{self.prefix}Scanning {path.parent / path.stem}..."
total = len(self.im_files)
nkpt, ndim = self.data.get("kpt_shape", (0, 0))
if self.use_keypoints and (nkpt <= 0 or ndim not in (2, 3)):
if self.use_keypoints and (nkpt <= 0 or ndim not in {2, 3}):
raise ValueError(
"'kpt_shape' in data.yaml missing or incorrect. Should be a list with [number of "
"keypoints, number of dims (2 for x,y or 3 for x,y,visible)], i.e. 'kpt_shape: [17, 3]'"
@ -82,16 +103,16 @@ class YOLODataset(BaseDataset):
nc += nc_f
if im_file:
x["labels"].append(
dict(
im_file=im_file,
shape=shape,
cls=lb[:, 0:1], # n, 1
bboxes=lb[:, 1:], # n, 4
segments=segments,
keypoints=keypoint,
normalized=True,
bbox_format="xywh",
)
{
"im_file": im_file,
"shape": shape,
"cls": lb[:, 0:1], # n, 1
"bboxes": lb[:, 1:], # n, 4
"segments": segments,
"keypoints": keypoint,
"normalized": True,
"bbox_format": "xywh",
}
)
if msg:
msgs.append(msg)
@ -105,7 +126,7 @@ class YOLODataset(BaseDataset):
x["hash"] = get_hash(self.label_files + self.im_files)
x["results"] = nf, nm, ne, nc, len(self.im_files)
x["msgs"] = msgs # warnings
save_dataset_cache_file(self.prefix, path, x)
save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION)
return x
def get_labels(self):
@ -121,7 +142,7 @@ class YOLODataset(BaseDataset):
# Display cache
nf, nm, ne, nc, n = cache.pop("results") # found, missing, empty, corrupt, total
if exists and LOCAL_RANK in (-1, 0):
if exists and LOCAL_RANK in {-1, 0}:
d = f"Scanning {cache_path}... {nf} images, {nm + ne} backgrounds, {nc} corrupt"
TQDM(None, desc=self.prefix + d, total=n, initial=n) # display results
if cache["msgs"]:
@ -214,7 +235,7 @@ class YOLODataset(BaseDataset):
value = values[i]
if k == "img":
value = torch.stack(value, 0)
if k in ["masks", "keypoints", "bboxes", "cls", "segments", "obb"]:
if k in {"masks", "keypoints", "bboxes", "cls", "segments", "obb"}:
value = torch.cat(value, 0)
new_batch[k] = value
new_batch["batch_idx"] = list(new_batch["batch_idx"])
@ -224,8 +245,142 @@ class YOLODataset(BaseDataset):
return new_batch
# Classification dataloaders -------------------------------------------------------------------------------------------
class ClassificationDataset(torchvision.datasets.ImageFolder):
class YOLOMultiModalDataset(YOLODataset):
"""
Dataset class for loading object detection and/or segmentation labels in YOLO format, with class text descriptions attached for multi-modal training.
Args:
data (dict, optional): A dataset YAML dictionary. Defaults to None.
task (str): An explicit arg to specify the current task. Defaults to 'detect'.
Returns:
(torch.utils.data.Dataset): A PyTorch dataset object that can be used for training an object detection model.
"""
def __init__(self, *args, data=None, task="detect", **kwargs):
"""Initializes a dataset object for object detection tasks with optional specifications."""
super().__init__(*args, data=data, task=task, **kwargs)
def update_labels_info(self, label):
"""Add texts information for multi modal model training."""
labels = super().update_labels_info(label)
# NOTE: some categories are concatenated with their synonyms by `/`.
labels["texts"] = [v.split("/") for _, v in self.data["names"].items()]
return labels
def build_transforms(self, hyp=None):
"""Enhances data transformations with optional text augmentation for multi-modal training."""
transforms = super().build_transforms(hyp)
if self.augment:
# NOTE: hard-coded the args for now.
transforms.insert(-1, RandomLoadText(max_samples=min(self.data["nc"], 80), padding=True))
return transforms
class GroundingDataset(YOLODataset):
def __init__(self, *args, task="detect", json_file, **kwargs):
"""Initializes a GroundingDataset for object detection, loading annotations from a specified JSON file."""
assert task == "detect", "`GroundingDataset` only support `detect` task for now!"
self.json_file = json_file
super().__init__(*args, task=task, data={}, **kwargs)
def get_img_files(self, img_path):
"""The image files would be read in `get_labels` function, return empty list here."""
return []
def get_labels(self):
"""Loads annotations from a JSON file, filters, and normalizes bounding boxes for each image."""
labels = []
LOGGER.info("Loading annotation file...")
with open(self.json_file, "r") as f:
annotations = json.load(f)
images = {f'{x["id"]:d}': x for x in annotations["images"]}
imgToAnns = defaultdict(list)
for ann in annotations["annotations"]:
imgToAnns[ann["image_id"]].append(ann)
for img_id, anns in TQDM(imgToAnns.items(), desc=f"Reading annotations {self.json_file}"):
img = images[f"{img_id:d}"]
h, w, f = img["height"], img["width"], img["file_name"]
im_file = Path(self.img_path) / f
if not im_file.exists():
continue
self.im_files.append(str(im_file))
bboxes = []
cat2id = {}
texts = []
for ann in anns:
if ann["iscrowd"]:
continue
box = np.array(ann["bbox"], dtype=np.float32)
box[:2] += box[2:] / 2
box[[0, 2]] /= float(w)
box[[1, 3]] /= float(h)
if box[2] <= 0 or box[3] <= 0:
continue
cat_name = " ".join([img["caption"][t[0] : t[1]] for t in ann["tokens_positive"]])
if cat_name not in cat2id:
cat2id[cat_name] = len(cat2id)
texts.append([cat_name])
cls = cat2id[cat_name] # class
box = [cls] + box.tolist()
if box not in bboxes:
bboxes.append(box)
lb = np.array(bboxes, dtype=np.float32) if len(bboxes) else np.zeros((0, 5), dtype=np.float32)
labels.append(
{
"im_file": im_file,
"shape": (h, w),
"cls": lb[:, 0:1], # n, 1
"bboxes": lb[:, 1:], # n, 4
"normalized": True,
"bbox_format": "xywh",
"texts": texts,
}
)
return labels
def build_transforms(self, hyp=None):
"""Configures augmentations for training with optional text loading; `hyp` adjusts augmentation intensity."""
transforms = super().build_transforms(hyp)
if self.augment:
# NOTE: hard-coded the args for now.
transforms.insert(-1, RandomLoadText(max_samples=80, padding=True))
return transforms
class YOLOConcatDataset(ConcatDataset):
"""
Dataset as a concatenation of multiple datasets.
This class is useful for assembling different existing datasets.
"""
@staticmethod
def collate_fn(batch):
"""Collates data samples into batches."""
return YOLODataset.collate_fn(batch)
# TODO: support semantic segmentation
class SemanticDataset(BaseDataset):
"""
Semantic Segmentation Dataset.
This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
from the BaseDataset class.
Note:
This class is currently a placeholder and needs to be populated with methods and attributes for supporting
semantic segmentation tasks.
"""
def __init__(self):
"""Initialize a SemanticDataset object."""
super().__init__()
class ClassificationDataset:
"""
Wraps a torchvision ImageFolder to support YOLO classification tasks, offering functionalities like image
augmentation, caching, and verification. It's designed to efficiently handle large datasets for training deep
@ -257,12 +412,19 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
prefix (str, optional): Prefix for logging and cache filenames, aiding in dataset identification and
debugging. Default is an empty string.
"""
super().__init__(root=root)
import torchvision # scope for faster 'import ultralytics'
# Base class assigned as attribute rather than used as base class to allow for scoping slow torchvision import
self.base = torchvision.datasets.ImageFolder(root=root)
self.samples = self.base.samples
self.root = self.base.root
# Initialize attributes
if augment and args.fraction < 1.0: # reduce training fraction
self.samples = self.samples[: round(len(self.samples) * args.fraction)]
self.prefix = colorstr(f"{prefix}: ") if prefix else ""
self.cache_ram = args.cache is True or args.cache == "ram" # cache images into RAM
self.cache_disk = args.cache == "disk" # cache images on hard drive as uncompressed *.npy files
self.cache_ram = args.cache is True or str(args.cache).lower() == "ram" # cache images into RAM
self.cache_disk = str(args.cache).lower() == "disk" # cache images on hard drive as uncompressed *.npy files
self.samples = self.verify_images() # filter out bad images
self.samples = [list(x) + [Path(x[0]).with_suffix(".npy"), None] for x in self.samples] # file, index, npy, im
scale = (1.0 - args.scale, 1.0) # (0.08, 1.0)
@ -285,8 +447,9 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
def __getitem__(self, i):
"""Returns subset of data and targets corresponding to given indices."""
f, j, fn, im = self.samples[i] # filename, index, filename.with_suffix('.npy'), image
if self.cache_ram and im is None:
im = self.samples[i][3] = cv2.imread(f)
if self.cache_ram:
if im is None: # Warning: two separate if statements required here, do not combine this with previous line
im = self.samples[i][3] = cv2.imread(f)
elif self.cache_disk:
if not fn.exists(): # load npy
np.save(fn.as_posix(), cv2.imread(f), allow_pickle=False)
@ -312,7 +475,7 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
assert cache["version"] == DATASET_CACHE_VERSION # matches current version
assert cache["hash"] == get_hash([x[0] for x in self.samples]) # identical hash
nf, nc, n, samples = cache.pop("results") # found, missing, empty, corrupt, total
if LOCAL_RANK in (-1, 0):
if LOCAL_RANK in {-1, 0}:
d = f"{desc} {nf} images, {nc} corrupt"
TQDM(None, desc=d, total=n, initial=n)
if cache["msgs"]:
@ -338,46 +501,5 @@ class ClassificationDataset(torchvision.datasets.ImageFolder):
x["hash"] = get_hash([x[0] for x in self.samples])
x["results"] = nf, nc, len(samples), samples
x["msgs"] = msgs # warnings
save_dataset_cache_file(self.prefix, path, x)
save_dataset_cache_file(self.prefix, path, x, DATASET_CACHE_VERSION)
return samples
def load_dataset_cache_file(path):
"""Load an Ultralytics *.cache dictionary from path."""
import gc
gc.disable() # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
cache = np.load(str(path), allow_pickle=True).item() # load dict
gc.enable()
return cache
def save_dataset_cache_file(prefix, path, x):
"""Save an Ultralytics dataset *.cache dictionary x to path."""
x["version"] = DATASET_CACHE_VERSION # add cache version
if is_dir_writeable(path.parent):
if path.exists():
path.unlink() # remove *.cache file if exists
np.save(str(path), x) # save cache for next time
path.with_suffix(".cache.npy").rename(path) # remove .npy suffix
LOGGER.info(f"{prefix}New cache created: {path}")
else:
LOGGER.warning(f"{prefix}WARNING ⚠ Cache directory {path.parent} is not writeable, cache not saved.")
# TODO: support semantic segmentation
class SemanticDataset(BaseDataset):
"""
Semantic Segmentation Dataset.
This class is responsible for handling datasets used for semantic segmentation tasks. It inherits functionalities
from the BaseDataset class.
Note:
This class is currently a placeholder and needs to be populated with methods and attributes for supporting
semantic segmentation tasks.
"""
def __init__(self):
"""Initialize a SemanticDataset object."""
super().__init__()

@ -9,14 +9,13 @@ import numpy as np
import torch
from PIL import Image
from matplotlib import pyplot as plt
from pandas import DataFrame
from tqdm import tqdm
from ultralytics.data.augment import Format
from ultralytics.data.dataset import YOLODataset
from ultralytics.data.utils import check_det_dataset
from ultralytics.models.yolo.model import YOLO
from ultralytics.utils import LOGGER, IterableSimpleNamespace, checks, USER_CONFIG_DIR
from ultralytics.utils import LOGGER, USER_CONFIG_DIR, IterableSimpleNamespace, checks
from .utils import get_sim_index_schema, get_table_schema, plot_query_result, prompt_sql_query, sanitize_batch
@ -172,7 +171,7 @@ class Explorer:
def sql_query(
self, query: str, return_type: str = "pandas"
) -> Union[DataFrame, Any, None]: # pandas.dataframe or pyarrow.Table
) -> Union[Any, None]: # pandas.DataFrame or pyarrow.Table
"""
Run a SQL-Like query on the table. Utilizes LanceDB predicate pushdown.
@ -204,7 +203,8 @@ class Explorer:
table = self.table.to_arrow() # noqa NOTE: Don't comment this. This line is used by DuckDB
if not query.startswith("SELECT") and not query.startswith("WHERE"):
raise ValueError(
f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE clause. found {query}"
f"Query must start with SELECT or WHERE. You can either pass the entire query or just the WHERE "
f"clause. found {query}"
)
if query.startswith("WHERE"):
query = f"SELECT * FROM 'table' {query}"
@ -247,7 +247,7 @@ class Explorer:
idx: Union[int, List[int]] = None,
limit: int = 25,
return_type: str = "pandas",
) -> Union[DataFrame, Any]: # pandas.dataframe or pyarrow.Table
) -> Any: # pandas.DataFrame or pyarrow.Table
"""
Query the table for similar images. Accepts a single image or a list of images.
@ -312,20 +312,20 @@ class Explorer:
img = plot_query_result(similar, plot_labels=labels)
return Image.fromarray(img)
def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> DataFrame:
def similarity_index(self, max_dist: float = 0.2, top_k: float = None, force: bool = False) -> Any: # pd.DataFrame
"""
Calculate the similarity index of all the images in the table. Here, the index will contain the data points that
are max_dist or closer to the image in the embedding space at a given index.
Args:
max_dist (float): maximum L2 distance between the embeddings to consider. Defaults to 0.2.
top_k (float): Percentage of the closest data points to consider when counting. Used to apply limit when running
top_k (float): Percentage of the closest data points to consider when counting. Used to apply limit.
vector search. Defaults: None.
force (bool): Whether to overwrite the existing similarity index or not. Defaults to False.
Returns:
(pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image, and columns
include indices of similar images and their respective distances.
(pandas.DataFrame): A dataframe containing the similarity index. Each row corresponds to an image,
and columns include indices of similar images and their respective distances.
Example:
```python
@ -447,12 +447,11 @@ class Explorer:
"""
result = prompt_sql_query(query)
try:
df = self.sql_query(result)
return self.sql_query(result)
except Exception as e:
LOGGER.error("AI generated query is not valid. Please try again with a different prompt")
LOGGER.error(e)
return None
return df
def visualize(self, result):
"""

@ -3,8 +3,6 @@
import time
from threading import Thread
import pandas as pd
from ultralytics import Explorer
from ultralytics.utils import ROOT, SETTINGS
from ultralytics.utils.checks import check_requirements
@ -148,12 +146,14 @@ def run_ai_query():
'OpenAI API key not found in settings. Please run yolo settings openai_api_key="..."'
)
return
import pandas # scope for faster 'import ultralytics'
st.session_state["error"] = None
query = st.session_state.get("ai_query")
if query.rstrip().lstrip():
exp = st.session_state["explorer"]
res = exp.ask_ai(query)
if not isinstance(res, pd.DataFrame) or res.empty:
if not isinstance(res, pandas.DataFrame) or res.empty:
st.session_state["error"] = "No results found using AI generated query. Try another query or rerun it."
return
st.session_state["imgs"] = res["im_file"].to_list()

@ -5,7 +5,6 @@ from typing import List
import cv2
import numpy as np
import pandas as pd
from ultralytics.data.augment import LetterBox
from ultralytics.utils import LOGGER as logger
@ -64,8 +63,10 @@ def plot_query_result(similar_set, plot_labels=True):
similar_set (list): Pyarrow or pandas object containing the similar data points
plot_labels (bool): Whether to plot labels or not
"""
import pandas # scope for faster 'import ultralytics'
similar_set = (
similar_set.to_dict(orient="list") if isinstance(similar_set, pd.DataFrame) else similar_set.to_pydict()
similar_set.to_dict(orient="list") if isinstance(similar_set, pandas.DataFrame) else similar_set.to_pydict()
)
empty_masks = [[[]]]
empty_boxes = [[]]

@ -15,8 +15,8 @@ import requests
import torch
from PIL import Image
from ultralytics.data.utils import IMG_FORMATS, VID_FORMATS
from ultralytics.utils import LOGGER, is_colab, is_kaggle, ops
from ultralytics.data.utils import FORMATS_HELP_MSG, IMG_FORMATS, VID_FORMATS
from ultralytics.utils import IS_COLAB, IS_KAGGLE, LOGGER, ops
from ultralytics.utils.checks import check_requirements
@ -83,11 +83,11 @@ class LoadStreams:
for i, s in enumerate(sources): # index, source
# Start thread to read frames from video stream
st = f"{i + 1}/{n}: {s}... "
if urlparse(s).hostname in ("www.youtube.com", "youtube.com", "youtu.be"): # if source is YouTube video
if urlparse(s).hostname in {"www.youtube.com", "youtube.com", "youtu.be"}: # if source is YouTube video
# YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/LNwODJXcvt4'
s = get_best_youtube_url(s)
s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam
if s == 0 and (is_colab() or is_kaggle()):
if s == 0 and (IS_COLAB or IS_KAGGLE):
raise NotImplementedError(
"'source=0' webcam not supported in Colab and Kaggle notebooks. "
"Try running 'source=0' in a local environment."
@ -291,8 +291,14 @@ class LoadImagesAndVideos:
else:
raise FileNotFoundError(f"{p} does not exist")
images = [x for x in files if x.split(".")[-1].lower() in IMG_FORMATS]
videos = [x for x in files if x.split(".")[-1].lower() in VID_FORMATS]
# Define files as images or videos
images, videos = [], []
for f in files:
suffix = f.split(".")[-1].lower() # Get file extension without the dot and lowercase
if suffix in IMG_FORMATS:
images.append(f)
elif suffix in VID_FORMATS:
videos.append(f)
ni, nv = len(images), len(videos)
self.files = images + videos
@ -307,10 +313,7 @@ class LoadImagesAndVideos:
else:
self.cap = None
if self.nf == 0:
raise FileNotFoundError(
f"No images or videos found in {p}. "
f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
)
raise FileNotFoundError(f"No images or videos found in {p}. {FORMATS_HELP_MSG}")
def __iter__(self):
"""Returns an iterator object for VideoStream or ImageFolder."""

@ -71,7 +71,7 @@ def load_yolo_dota(data_root, split="train"):
- train
- val
"""
assert split in ["train", "val"]
assert split in {"train", "val"}, f"Split must be 'train' or 'val', not {split}."
im_dir = Path(data_root) / "images" / split
assert im_dir.exists(), f"Can't find {im_dir}, please check your data root."
im_files = glob(str(Path(data_root) / "images" / split / "*"))

@ -27,6 +27,7 @@ from ultralytics.utils import (
clean_url,
colorstr,
emojis,
is_dir_writeable,
yaml_load,
yaml_save,
)
@ -38,6 +39,7 @@ HELP_URL = "See https://docs.ultralytics.com/datasets/detect for dataset formatt
IMG_FORMATS = {"bmp", "dng", "jpeg", "jpg", "mpo", "png", "tif", "tiff", "webp", "pfm"} # image suffixes
VID_FORMATS = {"asf", "avi", "gif", "m4v", "mkv", "mov", "mp4", "mpeg", "mpg", "ts", "wmv", "webm"} # video suffixes
PIN_MEMORY = str(os.getenv("PIN_MEMORY", True)).lower() == "true" # global pin_memory for dataloaders
FORMATS_HELP_MSG = f"Supported formats are:\nimages: {IMG_FORMATS}\nvideos: {VID_FORMATS}"
def img2label_paths(img_paths):
@ -62,7 +64,7 @@ def exif_size(img: Image.Image):
exif = img.getexif()
if exif:
rotation = exif.get(274, None) # the EXIF key for the orientation tag is 274
if rotation in [6, 8]: # rotation 270 or 90
if rotation in {6, 8}: # rotation 270 or 90
s = s[1], s[0]
return s
@ -78,8 +80,8 @@ def verify_image(args):
shape = exif_size(im) # image size
shape = (shape[1], shape[0]) # hw
assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}"
if im.format.lower() in ("jpg", "jpeg"):
assert im.format.lower() in IMG_FORMATS, f"Invalid image format {im.format}. {FORMATS_HELP_MSG}"
if im.format.lower() in {"jpg", "jpeg"}:
with open(im_file, "rb") as f:
f.seek(-2, 2)
if f.read() != b"\xff\xd9": # corrupt JPEG
@ -104,8 +106,8 @@ def verify_image_label(args):
shape = exif_size(im) # image size
shape = (shape[1], shape[0]) # hw
assert (shape[0] > 9) & (shape[1] > 9), f"image size {shape} <10 pixels"
assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}"
if im.format.lower() in ("jpg", "jpeg"):
assert im.format.lower() in IMG_FORMATS, f"invalid image format {im.format}. {FORMATS_HELP_MSG}"
if im.format.lower() in {"jpg", "jpeg"}:
with open(im_file, "rb") as f:
f.seek(-2, 2)
if f.read() != b"\xff\xd9": # corrupt JPEG
@ -303,7 +305,7 @@ def check_det_dataset(dataset, autodownload=True):
# Set paths
data["path"] = path # download scripts
for k in "train", "val", "test":
for k in "train", "val", "test", "minival":
if data.get(k): # prepend path
if isinstance(data[k], str):
x = (path / data[k]).resolve()
@ -335,7 +337,7 @@ def check_det_dataset(dataset, autodownload=True):
else: # python script
exec(s, {"yaml": data})
dt = f"({round(time.time() - t, 1)}s)"
s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in (0, None) else f"failure {dt}"
s = f"success ✅ {dt}, saved to {colorstr('bold', DATASETS_DIR)}" if r in {0, None} else f"failure {dt}"
LOGGER.info(f"Dataset download {s}\n")
check_font("Arial.ttf" if is_ascii(data["names"]) else "Arial.Unicode.ttf") # download fonts
@ -365,7 +367,7 @@ def check_cls_dataset(dataset, split=""):
# Download (optional if dataset=https://file.zip is passed directly)
if str(dataset).startswith(("http:/", "https:/")):
dataset = safe_download(dataset, dir=DATASETS_DIR, unzip=True, delete=False)
elif Path(dataset).suffix in (".zip", ".tar", ".gz"):
elif Path(dataset).suffix in {".zip", ".tar", ".gz"}:
file = check_file(dataset)
dataset = safe_download(file, dir=DATASETS_DIR, unzip=True, delete=False)
@ -649,3 +651,26 @@ def autosplit(path=DATASETS_DIR / "coco8/images", weights=(0.9, 0.1, 0.0), annot
if not annotated_only or Path(img2label_paths([str(img)])[0]).exists(): # check label
with open(path.parent / txt[i], "a") as f:
f.write(f"./{img.relative_to(path.parent).as_posix()}" + "\n") # add image to txt file
def load_dataset_cache_file(path):
"""Load an Ultralytics *.cache dictionary from path."""
import gc
gc.disable() # reduce pickle load time https://github.com/ultralytics/ultralytics/pull/1585
cache = np.load(str(path), allow_pickle=True).item() # load dict
gc.enable()
return cache
def save_dataset_cache_file(prefix, path, x, version):
"""Save an Ultralytics dataset *.cache dictionary x to path."""
x["version"] = version # add cache version
if is_dir_writeable(path.parent):
if path.exists():
path.unlink() # remove *.cache file if exists
np.save(str(path), x) # save cache for next time
path.with_suffix(".cache.npy").rename(path) # remove .npy suffix
LOGGER.info(f"{prefix}New cache created: {path}")
else:
LOGGER.warning(f"{prefix}WARNING ⚠ Cache directory {path.parent} is not writeable, cache not saved.")

@ -75,6 +75,7 @@ from ultralytics.utils import (
LINUX,
LOGGER,
MACOS,
PYTHON_VERSION,
ROOT,
WINDOWS,
__version__,
@ -83,7 +84,7 @@ from ultralytics.utils import (
get_default_args,
yaml_save,
)
from ultralytics.utils.checks import PYTHON_VERSION, check_imgsz, check_is_path_safe, check_requirements, check_version
from ultralytics.utils.checks import check_imgsz, check_is_path_safe, check_requirements, check_version
from ultralytics.utils.downloads import attempt_download_asset, get_github_assets
from ultralytics.utils.files import file_size, spaces_in_path
from ultralytics.utils.ops import Profile
@ -92,7 +93,7 @@ from ultralytics.utils.torch_utils import TORCH_1_13, get_latest_opset, select_d
def export_formats():
"""YOLOv8 export formats."""
import pandas
import pandas # scope for faster 'import ultralytics'
x = [
["PyTorch", "-", ".pt", True, True],
@ -159,7 +160,7 @@ class Exporter:
_callbacks (dict, optional): Dictionary of callback functions. Defaults to None.
"""
self.args = get_cfg(cfg, overrides)
if self.args.format.lower() in ("coreml", "mlmodel"): # fix attempt for protobuf<3.20.x errors
if self.args.format.lower() in {"coreml", "mlmodel"}: # fix attempt for protobuf<3.20.x errors
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" # must run before TensorBoard callback
self.callbacks = _callbacks or callbacks.get_default_callbacks()
@ -171,9 +172,9 @@ class Exporter:
self.run_callbacks("on_export_start")
t = time.time()
fmt = self.args.format.lower() # to lowercase
if fmt in ("tensorrt", "trt"): # 'engine' aliases
if fmt in {"tensorrt", "trt"}: # 'engine' aliases
fmt = "engine"
if fmt in ("mlmodel", "mlpackage", "mlprogram", "apple", "ios", "coreml"): # 'coreml' aliases
if fmt in {"mlmodel", "mlpackage", "mlprogram", "apple", "ios", "coreml"}: # 'coreml' aliases
fmt = "coreml"
fmts = tuple(export_formats()["Argument"][1:]) # available export formats
flags = [x == fmt for x in fmts]
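A quick illustration of the alias handling above (pure Python, no export performed): the TensorRT spellings collapse to `engine` and the CoreML spellings collapse to `coreml` before the format flags are computed.

```python
def normalize_format(fmt: str) -> str:
    """Sketch of the alias handling in Exporter.__call__ (illustrative only)."""
    fmt = fmt.lower()
    if fmt in {"tensorrt", "trt"}:  # 'engine' aliases
        return "engine"
    if fmt in {"mlmodel", "mlpackage", "mlprogram", "apple", "ios", "coreml"}:  # 'coreml' aliases
        return "coreml"
    return fmt

print(normalize_format("TRT"), normalize_format("mlpackage"))  # -> engine coreml
```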

@ -145,7 +145,7 @@ class Model(nn.Module):
return
# Load or create new YOLO model
if Path(model).suffix in (".yaml", ".yml"):
if Path(model).suffix in {".yaml", ".yml"}:
self._new(model, task=task, verbose=verbose)
else:
self._load(model, task=task)
@ -321,9 +321,10 @@ class Model(nn.Module):
AssertionError: If the model is not a PyTorch model.
"""
self._check_is_pytorch_model()
from ultralytics import __version__
from datetime import datetime
from ultralytics import __version__
updates = {
"date": datetime.now().isoformat(),
"version": __version__,
@ -666,7 +667,7 @@ class Model(nn.Module):
self.trainer.hub_session = self.session # attach optional HUB session
self.trainer.train()
# Update model and cfg after training
if RANK in (-1, 0):
if RANK in {-1, 0}:
ckpt = self.trainer.best if self.trainer.best.exists() else self.trainer.last
self.model, _ = attempt_load_one_weight(ckpt)
self.overrides = self.model.args
@ -733,7 +734,13 @@ class Model(nn.Module):
"""
from ultralytics.nn.autobackend import check_class_names
return check_class_names(self.model.names) if hasattr(self.model, "names") else None
if hasattr(self.model, "names"):
return check_class_names(self.model.names)
else:
if not self.predictor: # export formats will not have predictor defined until predict() is called
self.predictor = self._smart_load("predictor")(overrides=self.overrides, _callbacks=self.callbacks)
self.predictor.setup_model(model=self.model, verbose=False)
return self.predictor.model.names
@property
def device(self) -> torch.device:
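From the caller's point of view nothing changes; a hedged usage example (assumes `yolov8n.pt` is available or auto-downloads) showing that `model.names` now also resolves for exported formats by lazily building a predictor:

```python
from ultralytics import YOLO

model = YOLO("yolov8n.pt")
print(model.names[0])  # -> 'person' for COCO-pretrained weights
```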

@ -470,7 +470,7 @@ class Boxes(BaseTensor):
if boxes.ndim == 1:
boxes = boxes[None, :]
n = boxes.shape[-1]
assert n in (6, 7), f"expected 6 or 7 values but got {n}" # xyxy, track_id, conf, cls
assert n in {6, 7}, f"expected 6 or 7 values but got {n}" # xyxy, track_id, conf, cls
super().__init__(boxes, orig_shape)
self.is_track = n == 7
self.orig_shape = orig_shape
@ -687,7 +687,7 @@ class OBB(BaseTensor):
if boxes.ndim == 1:
boxes = boxes[None, :]
n = boxes.shape[-1]
assert n in (7, 8), f"expected 7 or 8 values but got {n}" # xywh, rotation, track_id, conf, cls
assert n in {7, 8}, f"expected 7 or 8 values but got {n}" # xywh, rotation, track_id, conf, cls
super().__init__(boxes, orig_shape)
self.is_track = n == 8
self.orig_shape = orig_shape

@ -42,7 +42,7 @@ from ultralytics.utils.files import get_latest_run
from ultralytics.utils.torch_utils import (
EarlyStopping,
ModelEMA,
de_parallel,
convert_optimizer_state_dict_to_fp16,
init_seeds,
one_cycle,
select_device,
@ -107,7 +107,7 @@ class BaseTrainer:
self.save_dir = get_save_dir(self.args)
self.args.name = self.save_dir.name # update name for loggers
self.wdir = self.save_dir / "weights" # weights dir
if RANK in (-1, 0):
if RANK in {-1, 0}:
self.wdir.mkdir(parents=True, exist_ok=True) # make dir
self.args.save_dir = str(self.save_dir)
yaml_save(self.save_dir / "args.yaml", vars(self.args)) # save run args
@ -122,27 +122,12 @@ class BaseTrainer:
print_args(vars(self.args))
# Device
if self.device.type in ("cpu", "mps"):
if self.device.type in {"cpu", "mps"}:
self.args.workers = 0 # faster CPU training as time dominated by inference, not dataloading
# Model and Dataset
self.model = check_model_file_from_stem(self.args.model) # add suffix, i.e. yolov8n -> yolov8n.pt
try:
if self.args.task == "classify":
self.data = check_cls_dataset(self.args.data)
elif self.args.data.split(".")[-1] in ("yaml", "yml") or self.args.task in (
"detect",
"segment",
"pose",
"obb",
):
self.data = check_det_dataset(self.args.data)
if "yaml_file" in self.data:
self.args.data = self.data["yaml_file"] # for validating 'yolo train data=url.zip' usage
except Exception as e:
raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
self.trainset, self.testset = self.get_dataset(self.data)
self.trainset, self.testset = self.get_dataset()
self.ema = None
# Optimization utils init
@ -160,7 +145,7 @@ class BaseTrainer:
# Callbacks
self.callbacks = _callbacks or callbacks.get_default_callbacks()
if RANK in (-1, 0):
if RANK in {-1, 0}:
callbacks.add_integration_callbacks(self)
def add_callback(self, event: str, callback):
@ -226,9 +211,9 @@ class BaseTrainer:
torch.cuda.set_device(RANK)
self.device = torch.device("cuda", RANK)
# LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
os.environ["NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1" # set to enforce timeout
dist.init_process_group(
"nccl" if dist.is_nccl_available() else "gloo",
backend="nccl" if dist.is_nccl_available() else "gloo",
timeout=timedelta(seconds=10800), # 3 hours
rank=RANK,
world_size=world_size,
@ -267,7 +252,7 @@ class BaseTrainer:
# Check AMP
self.amp = torch.tensor(self.args.amp).to(self.device) # True or False
if self.amp and RANK in (-1, 0): # Single-GPU and DDP
if self.amp and RANK in {-1, 0}: # Single-GPU and DDP
callbacks_backup = callbacks.default_callbacks.copy() # backup callbacks as check_amp() resets them
self.amp = torch.tensor(check_amp(self.model), device=self.device)
callbacks.default_callbacks = callbacks_backup # restore callbacks
@ -290,7 +275,7 @@ class BaseTrainer:
# Dataloaders
batch_size = self.batch_size // max(world_size, 1)
self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
if RANK in (-1, 0):
if RANK in {-1, 0}:
# Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
self.test_loader = self.get_dataloader(
self.testset, batch_size=batch_size if self.args.task == "obb" else batch_size * 2, rank=-1, mode="val"
@ -347,6 +332,10 @@ class BaseTrainer:
while True:
self.epoch = epoch
self.run_callbacks("on_train_epoch_start")
with warnings.catch_warnings():
warnings.simplefilter("ignore") # suppress 'Detected lr_scheduler.step() before optimizer.step()'
self.scheduler.step()
self.model.train()
if RANK != -1:
self.train_loader.sampler.set_epoch(epoch)
@ -356,7 +345,7 @@ class BaseTrainer:
self._close_dataloader_mosaic()
self.train_loader.reset()
if RANK in (-1, 0):
if RANK in {-1, 0}:
LOGGER.info(self.progress_string())
pbar = TQDM(enumerate(self.train_loader), total=nb)
self.tloss = None
@ -408,7 +397,7 @@ class BaseTrainer:
mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G" # (GB)
loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
if RANK in (-1, 0):
if RANK in {-1, 0}:
pbar.set_description(
("%11s" * 2 + "%11.4g" * (2 + loss_len))
% (f"{epoch + 1}/{self.epochs}", mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1])
@ -421,7 +410,7 @@ class BaseTrainer:
self.lr = {f"lr/pg{ir}": x["lr"] for ir, x in enumerate(self.optimizer.param_groups)} # for loggers
self.run_callbacks("on_train_epoch_end")
if RANK in (-1, 0):
if RANK in {-1, 0}:
final_epoch = epoch + 1 >= self.epochs
self.ema.update_attr(self.model, include=["yaml", "nc", "args", "names", "stride", "class_weights"])
@ -442,15 +431,12 @@ class BaseTrainer:
t = time.time()
self.epoch_time = t - self.epoch_time_start
self.epoch_time_start = t
with warnings.catch_warnings():
warnings.simplefilter("ignore") # suppress 'Detected lr_scheduler.step() before optimizer.step()'
if self.args.time:
mean_epoch_time = (t - self.train_time_start) / (epoch - self.start_epoch + 1)
self.epochs = self.args.epochs = math.ceil(self.args.time * 3600 / mean_epoch_time)
self._setup_scheduler()
self.scheduler.last_epoch = self.epoch # do not move
self.stop |= epoch >= self.epochs # stop if exceeded epochs
self.scheduler.step()
if self.args.time:
mean_epoch_time = (t - self.train_time_start) / (epoch - self.start_epoch + 1)
self.epochs = self.args.epochs = math.ceil(self.args.time * 3600 / mean_epoch_time)
self._setup_scheduler()
self.scheduler.last_epoch = self.epoch # do not move
self.stop |= epoch >= self.epochs # stop if exceeded epochs
self.run_callbacks("on_fit_epoch_end")
torch.cuda.empty_cache() # clear GPU memory at end of epoch, may help reduce CUDA out of memory errors
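A worked example of the time-budget arithmetic above: with `time=1` (hours) and a measured mean epoch time of 90 seconds, the run is capped at `ceil(3600 / 90) = 40` epochs (values illustrative).

```python
import math

time_hours, mean_epoch_time = 1.0, 90.0  # illustrative values
print(math.ceil(time_hours * 3600 / mean_epoch_time))  # -> 40
```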
@ -463,7 +449,7 @@ class BaseTrainer:
break # must break all DDP ranks
epoch += 1
if RANK in (-1, 0):
if RANK in {-1, 0}:
# Do final val with best.pt
LOGGER.info(
f"\n{epoch - self.start_epoch + 1} epochs completed in "
@ -478,43 +464,60 @@ class BaseTrainer:
def save_model(self):
"""Save model training checkpoints with additional metadata."""
import pandas as pd # scope for faster startup
metrics = {**self.metrics, **{"fitness": self.fitness}}
results = {k.strip(): v for k, v in pd.read_csv(self.csv).to_dict(orient="list").items()}
ckpt = {
"epoch": self.epoch,
"best_fitness": self.best_fitness,
"model": deepcopy(de_parallel(self.model)).half(),
"ema": deepcopy(self.ema.ema).half(),
"updates": self.ema.updates,
"optimizer": self.optimizer.state_dict(),
"train_args": vars(self.args), # save as dict
"train_metrics": metrics,
"train_results": results,
"date": datetime.now().isoformat(),
"version": __version__,
"license": "AGPL-3.0 (https://ultralytics.com/license)",
"docs": "https://docs.ultralytics.com",
}
if self.args.close_mosaic and self.epoch == (self.epochs - self.args.close_mosaic - 1):
torch.save(ckpt, self.last_mosaic)
# Save last and best
torch.save(ckpt, self.last)
import io
import pandas as pd # scope for faster 'import ultralytics'
# Serialize ckpt to a byte buffer once (faster than repeated torch.save() calls)
buffer = io.BytesIO()
torch.save(
{
"epoch": self.epoch,
"best_fitness": self.best_fitness,
"model": None, # resume and final checkpoints derive from EMA
"ema": deepcopy(self.ema.ema).half(),
"updates": self.ema.updates,
"optimizer": convert_optimizer_state_dict_to_fp16(deepcopy(self.optimizer.state_dict())),
"train_args": vars(self.args), # save as dict
"train_metrics": {**self.metrics, **{"fitness": self.fitness}},
"train_results": {k.strip(): v for k, v in pd.read_csv(self.csv).to_dict(orient="list").items()},
"date": datetime.now().isoformat(),
"version": __version__,
"license": "AGPL-3.0 (https://ultralytics.com/license)",
"docs": "https://docs.ultralytics.com",
},
buffer,
)
serialized_ckpt = buffer.getvalue() # get the serialized content to save
# Save checkpoints
self.last.write_bytes(serialized_ckpt) # save last.pt
if self.best_fitness == self.fitness:
torch.save(ckpt, self.best)
self.best.write_bytes(serialized_ckpt) # save best.pt
if (self.save_period > 0) and (self.epoch > 0) and (self.epoch % self.save_period == 0):
torch.save(ckpt, self.wdir / f"epoch{self.epoch}.pt")
(self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt) # save epoch, i.e. 'epoch3.pt'
@staticmethod
def get_dataset(data):
def get_dataset(self):
"""
Get train and validation paths from the data dict after running dataset checks.
Raises RuntimeError if the dataset format is not recognized.
"""
try:
if self.args.task == "classify":
data = check_cls_dataset(self.args.data)
elif self.args.data.split(".")[-1] in {"yaml", "yml"} or self.args.task in {
"detect",
"segment",
"pose",
"obb",
}:
data = check_det_dataset(self.args.data)
if "yaml_file" in data:
self.args.data = data["yaml_file"] # for validating 'yolo train data=url.zip' usage
except Exception as e:
raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
self.data = data
return data["train"], data.get("val") or data.get("test")
def setup_model(self):
@ -526,7 +529,7 @@ class BaseTrainer:
ckpt = None
if str(model).endswith(".pt"):
weights, ckpt = attempt_load_one_weight(model)
cfg = ckpt["model"].yaml
cfg = weights.yaml
else:
cfg = model
self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1) # calls Model(cfg, weights)
@ -648,8 +651,8 @@ class BaseTrainer:
resume = True
self.args = get_cfg(ckpt_args)
self.args.model = str(last) # reinstate model
for k in "imgsz", "batch": # allow arg updates to reduce memory on resume if crashed due to CUDA OOM
self.args.model = self.args.resume = str(last) # reinstate model
for k in "imgsz", "batch", "device": # allow arg updates to reduce memory or update device on resume
if k in overrides:
setattr(self.args, k, overrides[k])
@ -662,24 +665,21 @@ class BaseTrainer:
def resume_training(self, ckpt):
"""Resume YOLO training from given epoch and best fitness."""
if ckpt is None:
if ckpt is None or not self.resume:
return
best_fitness = 0.0
start_epoch = ckpt["epoch"] + 1
if ckpt["optimizer"] is not None:
start_epoch = ckpt.get("epoch", -1) + 1
if ckpt.get("optimizer", None) is not None:
self.optimizer.load_state_dict(ckpt["optimizer"]) # optimizer
best_fitness = ckpt["best_fitness"]
if self.ema and ckpt.get("ema"):
self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict()) # EMA
self.ema.updates = ckpt["updates"]
if self.resume:
assert start_epoch > 0, (
f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
)
LOGGER.info(
f"Resuming training from {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs"
)
assert start_epoch > 0, (
f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
)
LOGGER.info(f"Resuming training {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs")
if self.epochs < start_epoch:
LOGGER.info(
f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs."
@ -740,7 +740,7 @@ class BaseTrainer:
else: # weight (with decay)
g[0].append(param)
if name in ("Adam", "Adamax", "AdamW", "NAdam", "RAdam"):
if name in {"Adam", "Adamax", "AdamW", "NAdam", "RAdam"}:
optimizer = getattr(optim, name, optim.Adam)(g[2], lr=lr, betas=(momentum, 0.999), weight_decay=0.0)
elif name == "RMSProp":
optimizer = optim.RMSprop(g[2], lr=lr, momentum=momentum)

@ -218,7 +218,7 @@ class Tuner:
for ckpt in weights_dir.glob("*.pt"):
shutil.copy2(ckpt, self.tune_dir / "weights")
elif cleanup:
shutil.rmtree(ckpt_file.parent) # remove iteration weights/ dir to reduce storage space
shutil.rmtree(weights_dir, ignore_errors=True) # remove iteration weights/ dir to reduce storage space
# Plot tune results
plot_tune_results(self.tune_csv)

@ -139,14 +139,14 @@ class BaseValidator:
self.args.batch = 1 # export.py models default to batch-size 1
LOGGER.info(f"Forcing batch=1 square inference (1,3,{imgsz},{imgsz}) for non-PyTorch models")
if str(self.args.data).split(".")[-1] in ("yaml", "yml"):
if str(self.args.data).split(".")[-1] in {"yaml", "yml"}:
self.data = check_det_dataset(self.args.data)
elif self.args.task == "classify":
self.data = check_cls_dataset(self.args.data, split=self.args.split)
else:
raise FileNotFoundError(emojis(f"Dataset '{self.args.data}' for task={self.args.task} not found ❌"))
if self.device.type in ("cpu", "mps"):
if self.device.type in {"cpu", "mps"}:
self.args.workers = 0 # faster CPU val as time dominated by inference, not dataloading
if not pt:
self.args.rect = False

@ -3,7 +3,7 @@
import requests
from ultralytics.hub.utils import HUB_API_ROOT, HUB_WEB_ROOT, PREFIX, request_with_credentials
from ultralytics.utils import LOGGER, SETTINGS, emojis, is_colab
from ultralytics.utils import IS_COLAB, LOGGER, SETTINGS, emojis
API_KEY_URL = f"{HUB_WEB_ROOT}/settings?tab=api+keys"
@ -50,7 +50,7 @@ class Auth:
# Attempt to authenticate with the provided API key
success = self.authenticate()
# If the API key is not provided and the environment is a Google Colab notebook
elif is_colab():
elif IS_COLAB:
# Attempt to authenticate using browser cookies
success = self.auth_with_cookies()
else:
@ -109,7 +109,7 @@ class Auth:
Returns:
(bool): True if authentication is successful, False otherwise.
"""
if not is_colab():
if not IS_COLAB:
return False # Currently only works with Colab
try:
authn = request_with_credentials(f"{HUB_API_ROOT}/v1/auth/auto")

@ -7,11 +7,11 @@ from pathlib import Path
import requests
from ultralytics.hub.utils import HUB_WEB_ROOT, HELP_MSG, PREFIX, TQDM
from ultralytics.utils import LOGGER, SETTINGS, __version__, checks, emojis, is_colab
from ultralytics.hub.utils import HELP_MSG, HUB_WEB_ROOT, PREFIX, TQDM
from ultralytics.utils import IS_COLAB, LOGGER, SETTINGS, __version__, checks, emojis
from ultralytics.utils.errors import HUBModelError
AGENT_NAME = f"python-{__version__}-colab" if is_colab() else f"python-{__version__}-local"
AGENT_NAME = f"python-{__version__}-colab" if IS_COLAB else f"python-{__version__}-local"
class HUBTrainingSession:

@ -12,6 +12,9 @@ import requests
from ultralytics.utils import (
ARGV,
ENVIRONMENT,
IS_COLAB,
IS_GIT_DIR,
IS_PIP_PACKAGE,
LOGGER,
ONLINE,
RANK,
@ -22,9 +25,6 @@ from ultralytics.utils import (
__version__,
colorstr,
get_git_origin_url,
is_colab,
is_git_dir,
is_pip_package,
)
from ultralytics.utils.downloads import GITHUB_ASSETS_NAMES
@ -48,7 +48,7 @@ def request_with_credentials(url: str) -> any:
Raises:
OSError: If the function is not run in a Google Colab environment.
"""
if not is_colab():
if not IS_COLAB:
raise OSError("request_with_credentials() must run in a Colab environment")
from google.colab import output # noqa
from IPython import display # noqa
@ -189,7 +189,7 @@ class Events:
self.t = 0.0 # rate limit timer (seconds)
self.metadata = {
"cli": Path(ARGV[0]).name == "yolo",
"install": "git" if is_git_dir() else "pip" if is_pip_package() else "other",
"install": "git" if IS_GIT_DIR else "pip" if IS_PIP_PACKAGE else "other",
"python": ".".join(platform.python_version_tuple()[:2]), # i.e. 3.10
"version": __version__,
"env": ENVIRONMENT,
@ -198,10 +198,10 @@ class Events:
}
self.enabled = (
SETTINGS["sync"]
and RANK in (-1, 0)
and RANK in {-1, 0}
and not TESTS_RUNNING
and ONLINE
and (is_pip_package() or get_git_origin_url() == "https://github.com/ultralytics/ultralytics.git")
and (IS_PIP_PACKAGE or get_git_origin_url() == "https://github.com/ultralytics/ultralytics.git")
)
def __call__(self, cfg):

@ -24,7 +24,7 @@ class FastSAM(Model):
"""Call the __init__ method of the parent class (YOLO) with the updated default model."""
if str(model) == "FastSAM.pt":
model = "FastSAM-x.pt"
assert Path(model).suffix not in (".yaml", ".yml"), "FastSAM models only support pre-trained models."
assert Path(model).suffix not in {".yaml", ".yml"}, "FastSAM models only support pre-trained models."
super().__init__(model=model, task="segment")
@property

@ -4,12 +4,11 @@ import os
from pathlib import Path
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image
from ultralytics.utils import TQDM
from ultralytics.utils import TQDM, checks
class FastSAMPrompt:
@ -33,9 +32,7 @@ class FastSAMPrompt:
try:
import clip
except ImportError:
from ultralytics.utils.checks import check_requirements
check_requirements("git+https://github.com/openai/CLIP.git")
checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
import clip
self.clip = clip
@ -115,10 +112,13 @@ class FastSAMPrompt:
points (list, optional): Points to be plotted. Defaults to None.
point_label (list, optional): Labels for the points. Defaults to None.
mask_random_color (bool, optional): Whether to use random color for masks. Defaults to True.
better_quality (bool, optional): Whether to apply morphological transformations for better mask quality. Defaults to True.
better_quality (bool, optional): Whether to apply morphological transformations for better mask quality.
Defaults to True.
retina (bool, optional): Whether to use retina mask. Defaults to False.
with_contours (bool, optional): Whether to plot contours. Defaults to True.
"""
import matplotlib.pyplot as plt
pbar = TQDM(annotations, total=len(annotations))
for ann in pbar:
result_name = os.path.basename(ann.path)
@ -203,6 +203,8 @@ class FastSAMPrompt:
target_height (int, optional): Target height for resizing. Defaults to 960.
target_width (int, optional): Target width for resizing. Defaults to 960.
"""
import matplotlib.pyplot as plt
n, h, w = annotation.shape # batch, height, width
areas = np.sum(annotation, axis=(1, 2))

@ -45,7 +45,7 @@ class NAS(Model):
def __init__(self, model="yolo_nas_s.pt") -> None:
"""Initializes the NAS model with the provided or default 'yolo_nas_s.pt' model."""
assert Path(model).suffix not in (".yaml", ".yml"), "YOLO-NAS models only support pre-trained models."
assert Path(model).suffix not in {".yaml", ".yml"}, "YOLO-NAS models only support pre-trained models."
super().__init__(model, task="detect")
@smart_inference_mode()

@ -125,7 +125,7 @@ class RTDETRValidator(DetectionValidator):
bbox = ops.xywh2xyxy(bbox) # target boxes
bbox[..., [0, 2]] *= ori_shape[1] # native-space pred
bbox[..., [1, 3]] *= ori_shape[0] # native-space pred
return dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
return {"cls": cls, "bbox": bbox, "ori_shape": ori_shape, "imgsz": imgsz, "ratio_pad": ratio_pad}
def _prepare_pred(self, pred, pbatch):
"""Prepares and returns a batch with transformed bounding boxes and class labels."""

@ -41,7 +41,7 @@ class SAM(Model):
Raises:
NotImplementedError: If the model file extension is not .pt or .pth.
"""
if model and Path(model).suffix not in (".pt", ".pth"):
if model and Path(model).suffix not in {".pt", ".pth"}:
raise NotImplementedError("SAM prediction requires pre-trained *.pt or *.pth model.")
super().__init__(model=model, task="segment")

@ -112,7 +112,7 @@ class PatchMerging(nn.Module):
self.out_dim = out_dim
self.act = activation()
self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0)
stride_c = 1 if out_dim in [320, 448, 576] else 2
stride_c = 1 if out_dim in {320, 448, 576} else 2
self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim)
self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0)
@ -584,9 +584,9 @@ class TinyViT(nn.Module):
img_size (int, optional): The input image size. Defaults to 224.
in_chans (int, optional): Number of input channels. Defaults to 3.
num_classes (int, optional): Number of classification classes. Defaults to 1000.
embed_dims (List[int], optional): List of embedding dimensions for each layer. Defaults to [96, 192, 384, 768].
embed_dims (List[int], optional): List of embedding dimensions per layer. Defaults to [96, 192, 384, 768].
depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
num_heads (List[int], optional): List of number of attention heads for each layer. Defaults to [3, 6, 12, 24].
num_heads (List[int], optional): List of number of attention heads per layer. Defaults to [3, 6, 12, 24].
window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
drop_rate (float, optional): Dropout rate. Defaults to 0.

@ -222,7 +222,7 @@ class Attention(nn.Module):
downsample_rate (int, optional): The factor by which the internal dimensions are downsampled. Defaults to 1.
Raises:
AssertionError: If 'num_heads' does not evenly divide the internal dimension (embedding_dim / downsample_rate).
AssertionError: If 'num_heads' does not evenly divide the internal dim (embedding_dim / downsample_rate).
"""
super().__init__()
self.embedding_dim = embedding_dim

@ -11,7 +11,6 @@ segmentation tasks.
import numpy as np
import torch
import torch.nn.functional as F
import torchvision
from ultralytics.data.augment import LetterBox
from ultralytics.engine.predictor import BasePredictor
@ -128,10 +127,10 @@ class Predictor(BasePredictor):
Args:
im (torch.Tensor): The preprocessed input image in tensor format, with shape (N, C, H, W).
bboxes (np.ndarray | List, optional): Bounding boxes with shape (N, 4), in XYXY format.
points (np.ndarray | List, optional): Points indicating object locations with shape (N, 2), in pixel coordinates.
labels (np.ndarray | List, optional): Labels for point prompts, shape (N, ). 1 for foreground and 0 for background.
masks (np.ndarray, optional): Low-resolution masks from previous predictions. Shape should be (N, H, W). For SAM, H=W=256.
multimask_output (bool, optional): Flag to return multiple masks. Helpful for ambiguous prompts. Defaults to False.
points (np.ndarray | List, optional): Points indicating object locations with shape (N, 2), in pixels.
labels (np.ndarray | List, optional): Labels for point prompts, shape (N, ). 1 = foreground, 0 = background.
masks (np.ndarray, optional): Low-resolution masks from previous predictions shape (N,H,W). For SAM H=W=256.
multimask_output (bool, optional): Flag to return multiple masks. Helpful for ambiguous prompts.
Returns:
(tuple): Contains the following three elements.
@ -157,10 +156,10 @@ class Predictor(BasePredictor):
Args:
im (torch.Tensor): The preprocessed input image in tensor format, with shape (N, C, H, W).
bboxes (np.ndarray | List, optional): Bounding boxes with shape (N, 4), in XYXY format.
points (np.ndarray | List, optional): Points indicating object locations with shape (N, 2), in pixel coordinates.
labels (np.ndarray | List, optional): Labels for point prompts, shape (N, ). 1 for foreground and 0 for background.
masks (np.ndarray, optional): Low-resolution masks from previous predictions. Shape should be (N, H, W). For SAM, H=W=256.
multimask_output (bool, optional): Flag to return multiple masks. Helpful for ambiguous prompts. Defaults to False.
points (np.ndarray | List, optional): Points indicating object locations with shape (N, 2), in pixels.
labels (np.ndarray | List, optional): Labels for point prompts, shape (N, ). 1 = foreground, 0 = background.
masks (np.ndarray, optional): Low-resolution masks from previous predictions shape (N,H,W). For SAM H=W=256.
multimask_output (bool, optional): Flag to return multiple masks. Helpful for ambiguous prompts.
Returns:
(tuple): Contains the following three elements.
@ -231,7 +230,7 @@ class Predictor(BasePredictor):
im (torch.Tensor): Input tensor representing the preprocessed image with dimensions (N, C, H, W).
crop_n_layers (int): Specifies the number of layers for additional mask predictions on image crops.
Each layer produces 2**i_layer number of image crops.
crop_overlap_ratio (float): Determines the extent of overlap between crops. Scaled down in subsequent layers.
crop_overlap_ratio (float): Determines the overlap between crops. Scaled down in subsequent layers.
crop_downscale_factor (int): Scaling factor for the number of sampled points-per-side in each layer.
point_grids (list[np.ndarray], optional): Custom grids for point sampling normalized to [0,1].
Used in the nth crop layer.
@ -241,11 +240,13 @@ class Predictor(BasePredictor):
conf_thres (float): Confidence threshold [0,1] for filtering based on the model's mask quality prediction.
stability_score_thresh (float): Stability threshold [0,1] for mask filtering based on mask stability.
stability_score_offset (float): Offset value for calculating stability score.
crop_nms_thresh (float): IoU cutoff for Non-Maximum Suppression (NMS) to remove duplicate masks between crops.
crop_nms_thresh (float): IoU cutoff for NMS to remove duplicate masks between crops.
Returns:
(tuple): A tuple containing segmented masks, confidence scores, and bounding boxes.
"""
import torchvision # scope for faster 'import ultralytics'
self.segment_all = True
ih, iw = im.shape[2:]
crop_regions, layer_idxs = generate_crop_boxes((ih, iw), crop_n_layers, crop_overlap_ratio)
@ -350,8 +351,8 @@ class Predictor(BasePredictor):
"""
Post-processes SAM's inference outputs to generate object detection masks and bounding boxes.
The method scales masks and boxes to the original image size and applies a threshold to the mask predictions. The
SAM model uses advanced architecture and promptable segmentation tasks to achieve real-time performance.
The method scales masks and boxes to the original image size and applies a threshold to the mask predictions.
The SAM model uses advanced architecture and promptable segmentation tasks to achieve real-time performance.
Args:
preds (tuple): The output from SAM model inference, containing masks, scores, and optional bounding boxes.
@ -449,6 +450,8 @@ class Predictor(BasePredictor):
- new_masks (torch.Tensor): The processed masks with small regions removed. Shape is (N, H, W).
- keep (List[int]): The indices of the remaining masks post-NMS, which can be used to filter the boxes.
"""
import torchvision # scope for faster 'import ultralytics'
if len(masks) == 0:
return masks

@ -1,7 +1,7 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from ultralytics.models.yolo import classify, detect, obb, pose, segment
from ultralytics.models.yolo import classify, detect, obb, pose, segment, world
from .model import YOLO, YOLOWorld
__all__ = "classify", "segment", "detect", "pose", "obb", "YOLO", "YOLOWorld"
__all__ = "classify", "segment", "detect", "pose", "obb", "world", "YOLO", "YOLOWorld"

@ -1,7 +1,6 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import torch
import torchvision
from ultralytics.data import ClassificationDataset, build_dataloader
from ultralytics.engine.trainer import BaseTrainer
@ -59,6 +58,8 @@ class ClassificationTrainer(BaseTrainer):
def setup_model(self):
"""Load, create or download model for any task."""
import torchvision # scope for faster 'import ultralytics'
if isinstance(self.model, torch.nn.Module): # if model is loaded beforehand. No setup needed
return
@ -68,7 +69,7 @@ class ClassificationTrainer(BaseTrainer):
self.model, ckpt = attempt_load_one_weight(model, device="cpu")
for p in self.model.parameters():
p.requires_grad = True # for training
elif model.split(".")[-1] in ("yaml", "yml"):
elif model.split(".")[-1] in {"yaml", "yml"}:
self.model = self.get_model(cfg=model)
elif model in torchvision.models.__dict__:
self.model = torchvision.models.__dict__[model](weights="IMAGENET1K_V1" if self.args.pretrained else None)

@ -44,7 +44,7 @@ class DetectionTrainer(BaseTrainer):
def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode="train"):
"""Construct and return dataloader."""
assert mode in ["train", "val"]
assert mode in {"train", "val"}, f"Mode must be 'train' or 'val', not {mode}."
with torch_distributed_zero_first(rank): # init dataset *.cache only once if DDP
dataset = self.build_dataset(dataset_path, mode, batch_size)
shuffle = mode == "train"

@ -33,6 +33,7 @@ class DetectionValidator(BaseValidator):
super().__init__(dataloader, save_dir, pbar, args, _callbacks)
self.nt_per_class = None
self.is_coco = False
self.is_lvis = False
self.class_map = None
self.args.task = "detect"
self.metrics = DetMetrics(save_dir=self.save_dir, on_plot=self.on_plot)
@ -66,8 +67,9 @@ class DetectionValidator(BaseValidator):
"""Initialize evaluation metrics for YOLO."""
val = self.data.get(self.args.split, "") # validation path
self.is_coco = isinstance(val, str) and "coco" in val and val.endswith(f"{os.sep}val2017.txt") # is COCO
self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(1000))
self.args.save_json |= self.is_coco and not self.training # run on final val if training COCO
self.is_lvis = isinstance(val, str) and "lvis" in val and not self.is_coco # is LVIS
self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(len(model.names)))
self.args.save_json |= (self.is_coco or self.is_lvis) and not self.training # run on final val if training COCO
self.names = model.names
self.nc = len(model.names)
self.metrics.names = self.names
@ -104,7 +106,7 @@ class DetectionValidator(BaseValidator):
if len(cls):
bbox = ops.xywh2xyxy(bbox) * torch.tensor(imgsz, device=self.device)[[1, 0, 1, 0]] # target boxes
ops.scale_boxes(imgsz, bbox, ori_shape, ratio_pad=ratio_pad) # native-space labels
return dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
return {"cls": cls, "bbox": bbox, "ori_shape": ori_shape, "imgsz": imgsz, "ratio_pad": ratio_pad}
def _prepare_pred(self, pred, pbatch):
"""Prepares a batch of images and annotations for validation."""
@ -266,7 +268,8 @@ class DetectionValidator(BaseValidator):
self.jdict.append(
{
"image_id": image_id,
"category_id": self.class_map[int(p[5])],
"category_id": self.class_map[int(p[5])]
+ (1 if self.is_lvis else 0), # index starts from 1 if it's lvis
"bbox": [round(x, 3) for x in b],
"score": round(p[4], 5),
}
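A small sketch of the `category_id` mapping above: COCO predictions go through the 80-to-91 remap, while LVIS category ids simply start at 1, hence the `+1` offset (the maps below are illustrative stand-ins):

```python
def to_json_category(cls_idx: int, class_map: list, is_lvis: bool) -> int:
    """Mirror pred_to_json(): map a model class index to the dataset category id."""
    return class_map[cls_idx] + (1 if is_lvis else 0)

coco91 = [1, 2, 3, 4, 5]      # stand-in for converter.coco80_to_coco91_class()
lvis_ids = list(range(1203))  # identity map over LVIS's 1203 classes
print(to_json_category(0, coco91, is_lvis=False))  # COCO: class 0 -> category 1
print(to_json_category(0, lvis_ids, is_lvis=True))  # LVIS: class 0 -> category 1
```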
@ -274,26 +277,42 @@ class DetectionValidator(BaseValidator):
def eval_json(self, stats):
"""Evaluates YOLO output in JSON format and returns performance statistics."""
if self.args.save_json and self.is_coco and len(self.jdict):
anno_json = self.data["path"] / "annotations/instances_val2017.json" # annotations
if self.args.save_json and (self.is_coco or self.is_lvis) and len(self.jdict):
pred_json = self.save_dir / "predictions.json" # predictions
LOGGER.info(f"\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...")
anno_json = (
self.data["path"]
/ "annotations"
/ ("instances_val2017.json" if self.is_coco else f"lvis_v1_{self.args.split}.json")
) # annotations
pkg = "pycocotools" if self.is_coco else "lvis"
LOGGER.info(f"\nEvaluating {pkg} mAP using {pred_json} and {anno_json}...")
try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
check_requirements("pycocotools>=2.0.6")
from pycocotools.coco import COCO # noqa
from pycocotools.cocoeval import COCOeval # noqa
for x in anno_json, pred_json:
for x in pred_json, anno_json:
assert x.is_file(), f"{x} file not found"
anno = COCO(str(anno_json)) # init annotations api
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
eval = COCOeval(anno, pred, "bbox")
check_requirements("pycocotools>=2.0.6" if self.is_coco else "lvis>=0.5.3")
if self.is_coco:
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # images to eval
from pycocotools.coco import COCO # noqa
from pycocotools.cocoeval import COCOeval # noqa
anno = COCO(str(anno_json)) # init annotations api
pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path)
eval = COCOeval(anno, pred, "bbox")
else:
from lvis import LVIS, LVISEval
anno = LVIS(str(anno_json)) # init annotations api
pred = anno._load_json(str(pred_json)) # init predictions api (must pass string, not Path)
eval = LVISEval(anno, pred, "bbox")
eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # images to eval
eval.evaluate()
eval.accumulate()
eval.summarize()
stats[self.metrics.keys[-1]], stats[self.metrics.keys[-2]] = eval.stats[:2] # update mAP50-95 and mAP50
if self.is_lvis:
eval.print_results() # explicitly call print_results
# update mAP50-95 and mAP50
stats[self.metrics.keys[-1]], stats[self.metrics.keys[-2]] = (
eval.stats[:2] if self.is_coco else [eval.results["AP50"], eval.results["AP"]]
)
except Exception as e:
LOGGER.warning(f"pycocotools unable to run: {e}")
LOGGER.warning(f"{pkg} unable to run: {e}")
return stats

@ -5,7 +5,7 @@ from pathlib import Path
from ultralytics.engine.model import Model
from ultralytics.models import yolo
from ultralytics.nn.tasks import ClassificationModel, DetectionModel, OBBModel, PoseModel, SegmentationModel, WorldModel
from ultralytics.utils import yaml_load, ROOT
from ultralytics.utils import ROOT, yaml_load
class YOLO(Model):
@ -83,6 +83,7 @@ class YOLOWorld(Model):
"model": WorldModel,
"validator": yolo.detect.DetectionValidator,
"predictor": yolo.detect.DetectionPredictor,
"trainer": yolo.world.WorldTrainer,
}
}

@ -78,7 +78,7 @@ class OBBValidator(DetectionValidator):
if len(cls):
bbox[..., :4].mul_(torch.tensor(imgsz, device=self.device)[[1, 0, 1, 0]]) # target boxes
ops.scale_boxes(imgsz, bbox, ori_shape, ratio_pad=ratio_pad, xywh=True) # native-space labels
return dict(cls=cls, bbox=bbox, ori_shape=ori_shape, imgsz=imgsz, ratio_pad=ratio_pad)
return {"cls": cls, "bbox": bbox, "ori_shape": ori_shape, "imgsz": imgsz, "ratio_pad": ratio_pad}
def _prepare_pred(self, pred, pbatch):
"""Prepares and returns a batch for OBB validation with scaled and padded bounding boxes."""

@ -0,0 +1,5 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from .train import WorldTrainer
__all__ = ["WorldTrainer"]

@ -0,0 +1,92 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
import itertools
from ultralytics.data import build_yolo_dataset
from ultralytics.models import yolo
from ultralytics.nn.tasks import WorldModel
from ultralytics.utils import DEFAULT_CFG, RANK, checks
from ultralytics.utils.torch_utils import de_parallel
def on_pretrain_routine_end(trainer):
"""Callback."""
if RANK in {-1, 0}:
# NOTE: for evaluation
names = [name.split("/")[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
de_parallel(trainer.ema.ema).set_classes(names, cache_clip_model=False)
device = next(trainer.model.parameters()).device
trainer.text_model, _ = trainer.clip.load("ViT-B/32", device=device)
for p in trainer.text_model.parameters():
p.requires_grad_(False)
class WorldTrainer(yolo.detect.DetectionTrainer):
"""
A class to fine-tune a world model on a closed-set dataset.
Example:
```python
from ultralytics.models.yolo.world import WorldModel
args = dict(model='yolov8s-world.pt', data='coco8.yaml', epochs=3)
trainer = WorldTrainer(overrides=args)
trainer.train()
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initialize a WorldTrainer object with given arguments."""
if overrides is None:
overrides = {}
super().__init__(cfg, overrides, _callbacks)
# Import and assign clip
try:
import clip
except ImportError:
checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
import clip
self.clip = clip
def get_model(self, cfg=None, weights=None, verbose=True):
"""Return WorldModel initialized with specified config and weights."""
# NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
# NOTE: Following the official config, nc hard-coded to 80 for now.
model = WorldModel(
cfg["yaml_file"] if isinstance(cfg, dict) else cfg,
ch=3,
nc=min(self.data["nc"], 80),
verbose=verbose and RANK == -1,
)
if weights:
model.load(weights)
self.add_callback("on_pretrain_routine_end", on_pretrain_routine_end)
return model
def build_dataset(self, img_path, mode="train", batch=None):
"""
Build YOLO Dataset.
Args:
img_path (str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
"""
gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
return build_yolo_dataset(
self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
)
def preprocess_batch(self, batch):
"""Preprocesses a batch of images for YOLOWorld training, adjusting formatting and dimensions as needed."""
batch = super().preprocess_batch(batch)
# NOTE: add text features
texts = list(itertools.chain(*batch["texts"]))
text_token = self.clip.tokenize(texts).to(batch["img"].device)
txt_feats = self.text_model.encode_text(text_token).to(dtype=batch["img"].dtype) # torch.float32
txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
return batch
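The text-feature step above can be illustrated without CLIP; random vectors stand in for the encoded prompts, and the reshape groups them back per image (shapes are illustrative):

```python
import torch

num_images, prompts_per_image, dim = 2, 3, 512  # illustrative sizes
txt_feats = torch.randn(num_images * prompts_per_image, dim)       # stand-in for CLIP text features
txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)  # L2-normalize each row
txt_feats = txt_feats.reshape(num_images, prompts_per_image, dim)  # group prompts per image
print(txt_feats.shape, float(txt_feats[0, 0].norm()))  # torch.Size([2, 3, 512]) ~1.0
```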

@ -0,0 +1,108 @@
from ultralytics.data import YOLOConcatDataset, build_grounding, build_yolo_dataset
from ultralytics.data.utils import check_det_dataset
from ultralytics.models.yolo.world import WorldTrainer
from ultralytics.utils import DEFAULT_CFG
from ultralytics.utils.torch_utils import de_parallel
class WorldTrainerFromScratch(WorldTrainer):
"""
A class extending the WorldTrainer class for training a world model from scratch on open-set datasets.
Example:
```python
from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
from ultralytics import YOLOWorld
data = dict(
train=dict(
yolo_data=["Objects365.yaml"],
grounding_data=[
dict(
img_path="../datasets/flickr30k/images",
json_file="../datasets/flickr30k/final_flickr_separateGT_train.json",
),
dict(
img_path="../datasets/GQA/images",
json_file="../datasets/GQA/final_mixed_train_no_coco.json",
),
],
),
val=dict(yolo_data=["lvis.yaml"]),
)
model = YOLOWorld("yolov8s-worldv2.yaml")
model.train(data=data, trainer=WorldTrainerFromScratch)
```
"""
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
"""Initialize a WorldTrainer object with given arguments."""
if overrides is None:
overrides = {}
super().__init__(cfg, overrides, _callbacks)
def build_dataset(self, img_path, mode="train", batch=None):
"""
Build YOLO Dataset.
Args:
img_path (List[str] | str): Path to the folder containing images.
mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
"""
gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
if mode == "train":
dataset = [
build_yolo_dataset(self.args, im_path, batch, self.data, stride=gs, multi_modal=True)
if isinstance(im_path, str)
else build_grounding(self.args, im_path["img_path"], im_path["json_file"], batch, stride=gs)
for im_path in img_path
]
return YOLOConcatDataset(dataset) if len(dataset) > 1 else dataset[0]
else:
return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs)
def get_dataset(self):
"""
Get train and validation paths from the data dict.
Asserts that the data dict provides the required train and val entries.
"""
final_data = dict()
data_yaml = self.args.data
assert data_yaml.get("train", False) # object365.yaml
assert data_yaml.get("val", False) # lvis.yaml
data = {k: [check_det_dataset(d) for d in v.get("yolo_data", [])] for k, v in data_yaml.items()}
assert len(data["val"]) == 1, f"Only support validating on 1 dataset for now, but got {len(data['val'])}."
val_split = "minival" if "lvis" in data["val"][0]["val"] else "val"
for d in data["val"]:
if d.get("minival") is None: # for lvis dataset
continue
d["minival"] = str(d["path"] / d["minival"])
for s in ["train", "val"]:
final_data[s] = [d["train" if s == "train" else val_split] for d in data[s]]
# save grounding data if there's one
grounding_data = data_yaml[s].get("grounding_data")
if grounding_data is None:
continue
grounding_data = [grounding_data] if not isinstance(grounding_data, list) else grounding_data
for g in grounding_data:
assert isinstance(g, dict), f"Grounding data should be provided in dict format, but got {type(g)}"
final_data[s] += grounding_data
# NOTE: to make training work properly, set `nc` and `names`
final_data["nc"] = data["val"][0]["nc"]
final_data["names"] = data["val"][0]["names"]
self.data = final_data
return final_data["train"], final_data["val"][0]
def plot_training_labels(self):
"""DO NOT plot labels."""
pass
def final_eval(self):
"""Performs final evaluation and validation for object detection YOLO-World model."""
val = self.args.data["val"]["yolo_data"][0]
self.validator.args.data = val
self.validator.args.split = "minival" if isinstance(val, str) and "lvis" in val else "val"
return super().final_eval()

@ -374,9 +374,9 @@ class AutoBackend(nn.Module):
metadata = yaml_load(metadata)
if metadata:
for k, v in metadata.items():
if k in ("stride", "batch"):
if k in {"stride", "batch"}:
metadata[k] = int(v)
elif k in ("imgsz", "names", "kpt_shape") and isinstance(v, str):
elif k in {"imgsz", "names", "kpt_shape"} and isinstance(v, str):
metadata[k] = eval(v)
stride = metadata["stride"]
task = metadata["task"]
@ -531,8 +531,8 @@ class AutoBackend(nn.Module):
self.names = {i: f"class{i}" for i in range(nc)}
else: # Lite or Edge TPU
details = self.input_details[0]
integer = details["dtype"] in (np.int8, np.int16) # is TFLite quantized int8 or int16 model
if integer:
is_int = details["dtype"] in {np.int8, np.int16} # is TFLite quantized int8 or int16 model
if is_int:
scale, zero_point = details["quantization"]
im = (im / scale + zero_point).astype(details["dtype"]) # de-scale
self.interpreter.set_tensor(details["index"], im)
@ -540,10 +540,10 @@ class AutoBackend(nn.Module):
y = []
for output in self.output_details:
x = self.interpreter.get_tensor(output["index"])
if integer:
if is_int:
scale, zero_point = output["quantization"]
x = (x.astype(np.float32) - zero_point) * scale # re-scale
if x.ndim > 2: # if task is not classification
if x.ndim == 3: # if task is not classification, excluding masks (ndim=4) as well
# Denormalize xywh by image size. See https://github.com/ultralytics/ultralytics/pull/1695
# xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models
x[:, [0, 2]] *= w
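The int8 round trip above follows the standard affine quantization scheme; a toy NumPy sketch with illustrative `scale`/`zero_point` values:

```python
import numpy as np

scale, zero_point = 1 / 255, -128                   # illustrative quantization parameters
im = np.random.rand(1, 3, 4, 4).astype(np.float32)  # float input in [0, 1)
q = (im / scale + zero_point).astype(np.int8)       # de-scale: float -> int8 for the interpreter
x = (q.astype(np.float32) - zero_point) * scale     # re-scale: int8 output -> float
print(float(np.abs(x - im).max()) < scale)          # True: error bounded by one quantization step
```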

@ -24,27 +24,27 @@ from .block import (
C3TR,
DFL,
SPP,
SPPELAN,
SPPF,
ADown,
BNContrastiveHead,
Bottleneck,
BottleneckCSP,
C2f,
C2fAttn,
ImagePoolingAttn,
C3Ghost,
C3x,
CBFuse,
CBLinear,
ContrastiveHead,
GhostBottleneck,
HGBlock,
HGStem,
ImagePoolingAttn,
Proto,
RepC3,
ResNetLayer,
ContrastiveHead,
BNContrastiveHead,
RepNCSPELAN4,
ADown,
SPPELAN,
CBFuse,
CBLinear,
ResNetLayer,
Silence,
C2f2,
C3k2,

@ -571,7 +571,8 @@ class ContrastiveHead(nn.Module):
def __init__(self):
"""Initializes ContrastiveHead with specified region-text similarity parameters."""
super().__init__()
self.bias = nn.Parameter(torch.zeros([]))
# NOTE: use -10.0 to keep the init cls loss consistency with other losses
self.bias = nn.Parameter(torch.tensor([-10.0]))
self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
def forward(self, x, w):
@ -594,7 +595,8 @@ class BNContrastiveHead(nn.Module):
"""Initialize ContrastiveHead with region-text similarity parameters."""
super().__init__()
self.norm = nn.BatchNorm2d(embed_dims)
self.bias = nn.Parameter(torch.zeros([]))
# NOTE: use -10.0 to keep the init cls loss consistency with other losses
self.bias = nn.Parameter(torch.tensor([-10.0]))
# use -1.0 is more stable
self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))

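A quick numerical check of the NOTE above: with a logit bias of -10.0 the initial class probability after the sigmoid is roughly 4.5e-5, matching the low-prior initialization used by the other classification losses, whereas the previous zero bias started every class at 0.5.

import torch

# Initial classification probability implied by the two bias initializations above.
for bias in (0.0, -10.0):
    p = torch.sigmoid(torch.tensor(bias)).item()
    print(f"bias={bias:+.1f} -> initial probability = {p:.2e}")
# bias=+0.0 -> 5.00e-01, bias=-10.0 -> 4.54e-05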
@ -296,7 +296,7 @@ class SpatialAttention(nn.Module):
def __init__(self, kernel_size=7):
"""Initialize Spatial-attention module with kernel size argument."""
super().__init__()
assert kernel_size in (3, 7), "kernel size must be 3 or 7"
assert kernel_size in {3, 7}, "kernel size must be 3 or 7"
padding = 3 if kernel_size == 7 else 1
self.cv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
self.act = nn.Sigmoid()

@ -8,7 +8,7 @@ import torch.nn as nn
from torch.nn.init import constant_, xavier_uniform_
from ultralytics.utils.tal import TORCH_1_10, dist2bbox, dist2rbox, make_anchors
from .block import DFL, Proto, ContrastiveHead, BNContrastiveHead
from .block import DFL, BNContrastiveHead, ContrastiveHead, Proto
from .conv import Conv
from .transformer import MLP, DeformableTransformerDecoder, DeformableTransformerDecoderLayer
from .utils import bias_init_with_prob, linear_init
@ -54,13 +54,13 @@ class Detect(nn.Module):
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
self.shape = shape
if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"): # avoid TF FlexSplitV ops
if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}: # avoid TF FlexSplitV ops
box = x_cat[:, : self.reg_max * 4]
cls = x_cat[:, self.reg_max * 4 :]
else:
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
if self.export and self.format in ("tflite", "edgetpu"):
if self.export and self.format in {"tflite", "edgetpu"}:
# Precompute normalization factor to increase numerical stability
# See https://github.com/ultralytics/ultralytics/issues/7371
grid_h = shape[2]
@ -230,13 +230,13 @@ class WorldDetect(Detect):
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
self.shape = shape
if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"): # avoid TF FlexSplitV ops
if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}: # avoid TF FlexSplitV ops
box = x_cat[:, : self.reg_max * 4]
cls = x_cat[:, self.reg_max * 4 :]
else:
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
if self.export and self.format in ("tflite", "edgetpu"):
if self.export and self.format in {"tflite", "edgetpu"}:
# Precompute normalization factor to increase numerical stability
# See https://github.com/ultralytics/ultralytics/issues/7371
grid_h = shape[2]
@ -250,6 +250,15 @@ class WorldDetect(Detect):
y = torch.cat((dbox, cls.sigmoid()), 1)
return y if self.export else (y, x)
def bias_init(self):
"""Initialize Detect() biases, WARNING: requires stride availability."""
m = self # self.model[-1] # Detect() module
# cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
# ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
a[-1].bias.data[:] = 1.0 # box
# b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
class RTDETRDecoder(nn.Module):
"""

@ -15,7 +15,9 @@ from ultralytics.nn.modules import (
C3TR,
OBB,
SPP,
SPPELAN,
SPPF,
ADown,
Bottleneck,
BottleneckCSP,
C2f,
@ -23,9 +25,10 @@ from ultralytics.nn.modules import (
C3k2,
C3k3,
C2fAttn,
ImagePoolingAttn,
C3Ghost,
C3x,
CBFuse,
CBLinear,
Classify,
Concat,
Conv,
@ -39,19 +42,16 @@ from ultralytics.nn.modules import (
GhostConv,
HGBlock,
HGStem,
ImagePoolingAttn,
Pose,
RepC3,
RepConv,
RepNCSPELAN4,
ResNetLayer,
RTDETRDecoder,
Segment,
WorldDetect,
RepNCSPELAN4,
ADown,
SPPELAN,
CBFuse,
CBLinear,
Silence,
WorldDetect,
)
from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
@ -567,28 +567,28 @@ class WorldModel(DetectionModel):
self.clip_model = None # CLIP model placeholder
super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
def set_classes(self, text):
"""Perform a forward pass with optional profiling, visualization, and embedding extraction."""
def set_classes(self, text, batch=80, cache_clip_model=True):
"""Set classes in advance so that model could do offline-inference without clip model."""
try:
import clip
except ImportError:
check_requirements("git+https://github.com/openai/CLIP.git")
check_requirements("git+https://github.com/ultralytics/CLIP.git")
import clip
if not getattr(self, "clip_model", None): # for backwards compatibility of models lacking clip_model attribute
if (
not getattr(self, "clip_model", None) and cache_clip_model
): # for backwards compatibility of models lacking clip_model attribute
self.clip_model = clip.load("ViT-B/32")[0]
device = next(self.clip_model.parameters()).device
model = self.clip_model if cache_clip_model else clip.load("ViT-B/32")[0]
device = next(model.parameters()).device
text_token = clip.tokenize(text).to(device)
txt_feats = self.clip_model.encode_text(text_token).to(dtype=torch.float32)
txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1]).detach()
self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
self.model[-1].nc = len(text)
def init_criterion(self):
"""Initialize the loss criterion for the model."""
raise NotImplementedError
def predict(self, x, profile=False, visualize=False, augment=False, embed=None):
def predict(self, x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None):
"""
Perform a forward pass through the model.
@ -596,13 +596,14 @@ class WorldModel(DetectionModel):
x (torch.Tensor): The input tensor.
profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
txt_feats (torch.Tensor, optional): Text features to use instead of self.txt_feats if provided. Defaults to None.
augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
embed (list, optional): A list of feature vectors/embeddings to return.
Returns:
(torch.Tensor): Model's output tensor.
"""
txt_feats = self.txt_feats.to(device=x.device, dtype=x.dtype)
txt_feats = (self.txt_feats if txt_feats is None else txt_feats).to(device=x.device, dtype=x.dtype)
if len(txt_feats) != len(x):
txt_feats = txt_feats.repeat(len(x), 1, 1)
ori_txt_feats = txt_feats.clone()
@ -630,6 +631,21 @@ class WorldModel(DetectionModel):
return torch.unbind(torch.cat(embeddings, 1), dim=0)
return x
def loss(self, batch, preds=None):
"""
Compute loss.
Args:
batch (dict): Batch to compute loss on.
preds (torch.Tensor | List[torch.Tensor]): Predictions.
"""
if not hasattr(self, "criterion"):
self.criterion = self.init_criterion()
if preds is None:
preds = self.forward(batch["img"], txt_feats=batch["txt_feats"])
return self.criterion(preds, batch)
class Ensemble(nn.ModuleList):
"""Ensemble of models."""

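A hedged usage sketch of the updated set_classes() path: the vocabulary is encoded once through CLIP, after which prediction runs offline against the cached text features. The weights file and image path are placeholders.

from ultralytics import YOLOWorld

# Illustrative offline-vocabulary workflow; "yolov8s-world.pt" and "bus.jpg" are placeholders.
model = YOLOWorld("yolov8s-world.pt")
model.set_classes(["person", "bus", "backpack"])  # text is tokenized and encoded once via CLIP
results = model.predict("bus.jpg")  # inference now uses the stored text features, no CLIP needed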
@ -81,7 +81,7 @@ class AIGym:
self.annotator = Annotator(im0, line_width=2)
for ind, k in enumerate(reversed(self.keypoints)):
if self.pose_type in ["pushup", "pullup"]:
if self.pose_type in {"pushup", "pullup"}:
self.angle[ind] = self.annotator.estimate_pose_angle(
k[int(self.kpts_to_check[0])].cpu(),
k[int(self.kpts_to_check[1])].cpu(),

@ -24,10 +24,13 @@ class Heatmap:
self.view_img = False
self.shape = "circle"
self.names = None # Classes names
# Image information
self.imw = None
self.imh = None
self.im0 = None
self.tf = 2
self.view_in_counts = True
self.view_out_counts = True
@ -52,10 +55,11 @@ class Heatmap:
# Object Counting Information
self.in_counts = 0
self.out_counts = 0
self.counting_list = []
self.count_txt_thickness = 0
self.count_ids = []
self.class_wise_count = {}
self.count_txt_color = (0, 0, 0)
self.count_color = (255, 255, 255)
self.count_bg_color = (255, 255, 255)
self.cls_txtdisplay_gap = 50
# Decay factor
self.decay_factor = 0.99
@ -67,18 +71,19 @@ class Heatmap:
self,
imw,
imh,
classes_names=None,
colormap=cv2.COLORMAP_JET,
heatmap_alpha=0.5,
view_img=False,
view_in_counts=True,
view_out_counts=True,
count_reg_pts=None,
count_txt_thickness=2,
count_txt_color=(0, 0, 0),
count_color=(255, 255, 255),
count_bg_color=(255, 255, 255),
count_reg_color=(255, 0, 255),
region_thickness=5,
line_dist_thresh=15,
line_thickness=2,
decay_factor=0.99,
shape="circle",
):
@ -89,20 +94,23 @@ class Heatmap:
colormap (cv2.COLORMAP): The colormap to be set.
imw (int): The width of the frame.
imh (int): The height of the frame.
classes_names (dict): Classes names
line_thickness (int): Line thickness for bounding boxes.
heatmap_alpha (float): Alpha value for heatmap display.
view_img (bool): Flag indicating whether to display the frame.
view_in_counts (bool): Flag to control whether to display the in counts on the video stream.
view_out_counts (bool): Flag to control whether to display the out counts on the video stream.
count_reg_pts (list): Object counting region points
count_txt_thickness (int): Text thickness for object counting display
count_txt_color (RGB color): count text color value
count_color (RGB color): count text background color value
count_bg_color (RGB color): Background color for the count text display.
count_reg_color (RGB color): Color of object counting region
region_thickness (int): Object counting Region thickness
line_dist_thresh (int): Euclidean Distance threshold for line counter
decay_factor (float): Value for decaying the heatmap area after an object has passed.
shape (str): Heatmap shape, 'rect' or 'circle' supported.
"""
self.tf = line_thickness
self.names = classes_names
self.imw = imw
self.imh = imh
self.heatmap_alpha = heatmap_alpha
@ -116,24 +124,21 @@ class Heatmap:
if len(count_reg_pts) == 2:
print("Line Counter Initiated.")
self.count_reg_pts = count_reg_pts
self.counting_region = LineString(count_reg_pts)
elif len(count_reg_pts) == 4:
print("Region Counter Initiated.")
self.counting_region = LineString(self.count_reg_pts)
elif len(count_reg_pts) >= 3:
print("Polygon Counter Initiated.")
self.count_reg_pts = count_reg_pts
self.counting_region = Polygon(self.count_reg_pts)
else:
print("Region or line points Invalid, 2 or 4 points supported")
print("Invalid Region points provided, region_points must be 2 for lines or >= 3 for polygons.")
print("Using Line Counter Now")
self.counting_region = Polygon([(20, 400), (1260, 400)]) # dummy points
self.counting_region = LineString(self.count_reg_pts)
# Heatmap new frame
self.heatmap = np.zeros((int(self.imh), int(self.imw)), dtype=np.float32)
self.count_txt_thickness = count_txt_thickness
self.count_txt_color = count_txt_color
self.count_color = count_color
self.count_bg_color = count_bg_color
self.region_color = count_reg_color
self.region_thickness = region_thickness
self.decay_factor = decay_factor
@ -141,7 +146,7 @@ class Heatmap:
self.shape = shape
# shape of heatmap, if not selected
if self.shape not in ["circle", "rect"]:
if self.shape not in {"circle", "rect"}:
print("Unknown shape value provided, 'circle' & 'rect' supported")
print("Using Circular shape now")
self.shape = "circle"
@ -173,7 +178,7 @@ class Heatmap:
return im0
self.heatmap *= self.decay_factor # decay factor
self.extract_results(tracks)
self.annotator = Annotator(self.im0, self.count_txt_thickness, None)
self.annotator = Annotator(self.im0, self.tf, None)
if self.count_reg_pts is not None:
# Draw counting region
@ -183,6 +188,12 @@ class Heatmap:
)
for box, cls, track_id in zip(self.boxes, self.clss, self.track_ids):
# Store class info
if self.names[cls] not in self.class_wise_count:
if len(self.names[cls]) > 5:
self.names[cls] = self.names[cls][:5]
self.class_wise_count[self.names[cls]] = {"in": 0, "out": 0}
if self.shape == "circle":
center = (int((box[0] + box[2]) // 2), int((box[1] + box[3]) // 2))
radius = min(int(box[2]) - int(box[0]), int(box[3]) - int(box[1])) // 2
@ -203,23 +214,36 @@ class Heatmap:
if len(track_line) > 30:
track_line.pop(0)
# Count objects
if len(self.count_reg_pts) == 4:
if self.counting_region.contains(Point(track_line[-1])) and track_id not in self.counting_list:
self.counting_list.append(track_id)
if box[0] < self.counting_region.centroid.x:
self.out_counts += 1
else:
prev_position = self.track_history[track_id][-2] if len(self.track_history[track_id]) > 1 else None
# Count objects in any polygon
if len(self.count_reg_pts) >= 3:
is_inside = self.counting_region.contains(Point(track_line[-1]))
if prev_position is not None and is_inside and track_id not in self.count_ids:
self.count_ids.append(track_id)
if (box[0] - prev_position[0]) * (self.counting_region.centroid.x - prev_position[0]) > 0:
self.in_counts += 1
self.class_wise_count[self.names[cls]]["in"] += 1
else:
self.out_counts += 1
self.class_wise_count[self.names[cls]]["out"] += 1
# Count objects using line
elif len(self.count_reg_pts) == 2:
distance = Point(track_line[-1]).distance(self.counting_region)
if distance < self.line_dist_thresh and track_id not in self.counting_list:
self.counting_list.append(track_id)
if box[0] < self.counting_region.centroid.x:
self.out_counts += 1
else:
self.in_counts += 1
if prev_position is not None and track_id not in self.count_ids:
distance = Point(track_line[-1]).distance(self.counting_region)
if distance < self.line_dist_thresh and track_id not in self.count_ids:
self.count_ids.append(track_id)
if (box[0] - prev_position[0]) * (self.counting_region.centroid.x - prev_position[0]) > 0:
self.in_counts += 1
self.class_wise_count[self.names[cls]]["in"] += 1
else:
self.out_counts += 1
self.class_wise_count[self.names[cls]]["out"] += 1
else:
for box, cls in zip(self.boxes, self.clss):
if self.shape == "circle":
@ -240,26 +264,27 @@ class Heatmap:
heatmap_normalized = cv2.normalize(self.heatmap, None, 0, 255, cv2.NORM_MINMAX)
heatmap_colored = cv2.applyColorMap(heatmap_normalized.astype(np.uint8), self.colormap)
incount_label = f"In Count : {self.in_counts}"
outcount_label = f"OutCount : {self.out_counts}"
# Display counts based on user choice
counts_label = None
if not self.view_in_counts and not self.view_out_counts:
counts_label = None
elif not self.view_in_counts:
counts_label = outcount_label
elif not self.view_out_counts:
counts_label = incount_label
else:
counts_label = f"{incount_label} {outcount_label}"
if self.count_reg_pts is not None and counts_label is not None:
self.annotator.count_labels(
counts=counts_label,
count_txt_size=self.count_txt_thickness,
txt_color=self.count_txt_color,
color=self.count_color,
label = "Ultralytics Analytics \t"
for key, value in self.class_wise_count.items():
if value["in"] != 0 or value["out"] != 0:
if not self.view_in_counts and not self.view_out_counts:
label = None
elif not self.view_in_counts:
label += f"{str.capitalize(key)}: IN {value['in']} \t"
elif not self.view_out_counts:
label += f"{str.capitalize(key)}: OUT {value['out']} \t"
else:
label += f"{str.capitalize(key)}: IN {value['in']} OUT {value['out']} \t"
label = label.rstrip()
label = label.split("\t")
if self.count_reg_pts is not None and label is not None:
self.annotator.display_counts(
counts=label,
count_txt_color=self.count_txt_color,
count_bg_color=self.count_bg_color,
)
self.im0 = cv2.addWeighted(self.im0, 1 - self.heatmap_alpha, heatmap_colored, self.heatmap_alpha, 0)

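The new counting condition above is a direction test: an object is counted "in" when its current x position and the region centroid lie on the same side of its previous position, i.e. it moved toward the region; otherwise it is counted "out". A small sketch with made-up coordinates:

from shapely.geometry import Point, Polygon

# Made-up coordinates illustrating the polygon-count branch above.
region = Polygon([(400, 200), (800, 200), (800, 600), (400, 600)])
prev_x, current = 350.0, (420.0, 400.0)  # previous x position and current track point
is_inside = region.contains(Point(current))
moved_toward_region = (current[0] - prev_x) * (region.centroid.x - prev_x) > 0
if is_inside:
    print("counted in" if moved_toward_region else "counted out")  # -> "counted in"
else:
    print("not counted")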
@ -43,16 +43,19 @@ class ObjectCounter:
# Object counting Information
self.in_counts = 0
self.out_counts = 0
self.counting_dict = {}
self.count_ids = []
self.class_wise_count = {}
self.count_txt_thickness = 0
self.count_txt_color = (0, 0, 0)
self.count_color = (255, 255, 255)
self.count_txt_color = (255, 255, 255)
self.count_bg_color = (255, 255, 255)
self.cls_txtdisplay_gap = 50
self.fontsize = 0.6
# Tracks info
self.track_history = defaultdict(list)
self.track_thickness = 2
self.draw_tracks = False
self.track_color = (0, 255, 0)
self.track_color = None
# Check if environment support imshow
self.env_check = check_imshow(warn=True)
@ -62,18 +65,18 @@ class ObjectCounter:
classes_names,
reg_pts,
count_reg_color=(255, 0, 255),
count_txt_color=(0, 0, 0),
count_bg_color=(255, 255, 255),
line_thickness=2,
track_thickness=2,
view_img=False,
view_in_counts=True,
view_out_counts=True,
draw_tracks=False,
count_txt_thickness=2,
count_txt_color=(0, 0, 0),
count_color=(255, 255, 255),
track_color=(0, 255, 0),
track_color=None,
region_thickness=5,
line_dist_thresh=15,
cls_txtdisplay_gap=50,
):
"""
Configures the Counter's image, bounding box line thickness, and counting region points.
@ -87,13 +90,13 @@ class ObjectCounter:
classes_names (dict): Classes names
track_thickness (int): Track thickness
draw_tracks (bool): Flag to draw the object tracks.
count_txt_thickness (int): Text thickness for object counting display
count_txt_color (RGB color): count text color value
count_color (RGB color): count text background color value
count_bg_color (RGB color): Background color for the count text display.
count_reg_color (RGB color): Color of object counting region
track_color (RGB color): color for tracks
region_thickness (int): Object counting Region thickness
line_dist_thresh (int): Euclidean Distance threshold for line counter
cls_txtdisplay_gap (int): Display gap between each class count
"""
self.tf = line_thickness
self.view_img = view_img
@ -108,7 +111,7 @@ class ObjectCounter:
self.reg_pts = reg_pts
self.counting_region = LineString(self.reg_pts)
elif len(reg_pts) >= 3:
print("Region Counter Initiated.")
print("Polygon Counter Initiated.")
self.reg_pts = reg_pts
self.counting_region = Polygon(self.reg_pts)
else:
@ -118,12 +121,12 @@ class ObjectCounter:
self.names = classes_names
self.track_color = track_color
self.count_txt_thickness = count_txt_thickness
self.count_txt_color = count_txt_color
self.count_color = count_color
self.count_bg_color = count_bg_color
self.region_color = count_reg_color
self.region_thickness = region_thickness
self.line_dist_thresh = line_dist_thresh
self.cls_txtdisplay_gap = cls_txtdisplay_gap
def mouse_event_for_region(self, event, x, y, flags, params):
"""
@ -163,6 +166,9 @@ class ObjectCounter:
# Annotator Init and region drawing
self.annotator = Annotator(self.im0, self.tf, self.names)
# Draw region or line
self.annotator.draw_region(reg_pts=self.reg_pts, color=self.region_color, thickness=self.region_thickness)
if tracks[0].boxes.id is not None:
boxes = tracks[0].boxes.xyxy.cpu()
clss = tracks[0].boxes.cls.cpu().tolist()
@ -171,7 +177,13 @@ class ObjectCounter:
# Extract tracks
for box, track_id, cls in zip(boxes, track_ids, clss):
# Draw bounding box
self.annotator.box_label(box, label=f"{track_id}:{self.names[cls]}", color=colors(int(track_id), True))
self.annotator.box_label(box, label=f"{self.names[cls]}#{track_id}", color=colors(int(track_id), True))
# Store class info
if self.names[cls] not in self.class_wise_count:
if len(self.names[cls]) > 5:
self.names[cls] = self.names[cls][:5]
self.class_wise_count[self.names[cls]] = {"in": 0, "out": 0}
# Draw Tracks
track_line = self.track_history[track_id]
@ -182,74 +194,67 @@ class ObjectCounter:
# Draw track trails
if self.draw_tracks:
self.annotator.draw_centroid_and_tracks(
track_line, color=self.track_color, track_thickness=self.track_thickness
track_line,
color=self.track_color if self.track_color else colors(int(track_id), True),
track_thickness=self.track_thickness,
)
prev_position = self.track_history[track_id][-2] if len(self.track_history[track_id]) > 1 else None
centroid = Point((box[:2] + box[2:]) / 2)
# Count objects
if len(self.reg_pts) >= 3: # any polygon
is_inside = self.counting_region.contains(centroid)
current_position = "in" if is_inside else "out"
# Count objects in any polygon
if len(self.reg_pts) >= 3:
is_inside = self.counting_region.contains(Point(track_line[-1]))
if prev_position is not None:
if self.counting_dict[track_id] != current_position and is_inside:
if prev_position is not None and is_inside and track_id not in self.count_ids:
self.count_ids.append(track_id)
if (box[0] - prev_position[0]) * (self.counting_region.centroid.x - prev_position[0]) > 0:
self.in_counts += 1
self.counting_dict[track_id] = "in"
elif self.counting_dict[track_id] != current_position and not is_inside:
self.out_counts += 1
self.counting_dict[track_id] = "out"
self.class_wise_count[self.names[cls]]["in"] += 1
else:
self.counting_dict[track_id] = current_position
else:
self.counting_dict[track_id] = current_position
self.out_counts += 1
self.class_wise_count[self.names[cls]]["out"] += 1
# Count objects using line
elif len(self.reg_pts) == 2:
if prev_position is not None:
is_inside = (box[0] - prev_position[0]) * (
self.counting_region.centroid.x - prev_position[0]
) > 0
current_position = "in" if is_inside else "out"
if self.counting_dict[track_id] != current_position and is_inside:
self.in_counts += 1
self.counting_dict[track_id] = "in"
elif self.counting_dict[track_id] != current_position and not is_inside:
self.out_counts += 1
self.counting_dict[track_id] = "out"
else:
self.counting_dict[track_id] = current_position
else:
self.counting_dict[track_id] = None
incount_label = f"In Count : {self.in_counts}"
outcount_label = f"OutCount : {self.out_counts}"
# Display counts based on user choice
counts_label = None
if not self.view_in_counts and not self.view_out_counts:
counts_label = None
elif not self.view_in_counts:
counts_label = outcount_label
elif not self.view_out_counts:
counts_label = incount_label
else:
counts_label = f"{incount_label} {outcount_label}"
if counts_label is not None:
self.annotator.count_labels(
counts=counts_label,
count_txt_size=self.count_txt_thickness,
txt_color=self.count_txt_color,
color=self.count_color,
if prev_position is not None and track_id not in self.count_ids:
distance = Point(track_line[-1]).distance(self.counting_region)
if distance < self.line_dist_thresh and track_id not in self.count_ids:
self.count_ids.append(track_id)
if (box[0] - prev_position[0]) * (self.counting_region.centroid.x - prev_position[0]) > 0:
self.in_counts += 1
self.class_wise_count[self.names[cls]]["in"] += 2
else:
self.out_counts += 1
self.class_wise_count[self.names[cls]]["out"] += 1
label = "Ultralytics Analytics \t"
for key, value in self.class_wise_count.items():
if value["in"] != 0 or value["out"] != 0:
if not self.view_in_counts and not self.view_out_counts:
label = None
elif not self.view_in_counts:
label += f"{str.capitalize(key)}: IN {value['in']} \t"
elif not self.view_out_counts:
label += f"{str.capitalize(key)}: OUT {value['out']} \t"
else:
label += f"{str.capitalize(key)}: IN {value['in']} OUT {value['out']} \t"
label = label.rstrip()
label = label.split("\t")
if label is not None:
self.annotator.display_counts(
counts=label,
count_txt_color=self.count_txt_color,
count_bg_color=self.count_bg_color,
)
def display_frames(self):
"""Display frame."""
if self.env_check:
self.annotator.draw_region(reg_pts=self.reg_pts, color=self.region_color, thickness=self.region_thickness)
cv2.namedWindow(self.window_name)
if len(self.reg_pts) == 4: # only add mouse event If user drawn region
cv2.setMouseCallback(self.window_name, self.mouse_event_for_region, {"region_points": self.reg_pts})

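A hedged end-to-end sketch of the reworked ObjectCounter; the weights file, video path and region points are placeholders, and start_counting() is the counter's entry point defined earlier in the class (outside this hunk).

import cv2
from ultralytics import YOLO
from ultralytics.solutions import object_counter

model = YOLO("yolov8n.pt")  # placeholder weights
cap = cv2.VideoCapture("path/to/video.mp4")  # placeholder video

counter = object_counter.ObjectCounter()
counter.set_args(
    classes_names=model.names,
    reg_pts=[(20, 400), (1080, 400), (1080, 360), (20, 360)],  # >= 3 points -> polygon counter
    view_img=True,
    draw_tracks=True,
)

while cap.isOpened():
    success, im0 = cap.read()
    if not success:
        break
    tracks = model.track(im0, persist=True, show=False)
    im0 = counter.start_counting(im0, tracks)
cap.release()
cv2.destroyAllWindows()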
@ -0,0 +1,187 @@
# Ultralytics YOLO 🚀, AGPL-3.0 license
from collections import defaultdict
import cv2
from ultralytics.utils.checks import check_imshow, check_requirements
from ultralytics.utils.plotting import Annotator, colors
check_requirements("shapely>=2.0.0")
from shapely.geometry import Point, Polygon
class QueueManager:
"""A class to manage the queue management in real-time video stream based on their tracks."""
def __init__(self):
"""Initializes the queue manager with default values for various tracking and counting parameters."""
# Mouse events
self.is_drawing = False
self.selected_point = None
# Region & Line Information
self.reg_pts = [(20, 60), (20, 680), (1120, 680), (1120, 60)]
self.counting_region = None
self.region_color = (255, 0, 255)
self.region_thickness = 5
# Image and annotation Information
self.im0 = None
self.tf = None
self.view_img = False
self.view_queue_counts = True
self.fontsize = 0.6
self.names = None # Classes names
self.annotator = None # Annotator
self.window_name = "Ultralytics YOLOv8 Queue Manager"
# Object counting Information
self.counts = 0
self.count_txt_color = (255, 255, 255)
# Tracks info
self.track_history = defaultdict(list)
self.track_thickness = 2
self.draw_tracks = False
self.track_color = None
# Check if environment support imshow
self.env_check = check_imshow(warn=True)
def set_args(
self,
classes_names,
reg_pts,
line_thickness=2,
track_thickness=2,
view_img=False,
region_color=(255, 0, 255),
view_queue_counts=True,
draw_tracks=False,
count_txt_color=(255, 255, 255),
track_color=None,
region_thickness=5,
fontsize=0.7,
):
"""
Configures the QueueManager's image, bounding box line thickness, and queue region points.
Args:
line_thickness (int): Line thickness for bounding boxes.
view_img (bool): Flag to control whether to display the video stream.
view_queue_counts (bool): Flag to control whether to display the counts on video stream.
reg_pts (list): Initial list of points defining the counting region.
classes_names (dict): Classes names
region_color (RGB color): Color of queue region
track_thickness (int): Track thickness
draw_tracks (bool): Flag to draw the object tracks.
count_txt_color (RGB color): Count text color value.
track_color (RGB color): Color for tracks.
region_thickness (int): Queue region thickness.
fontsize (float): Text display font size
"""
self.tf = line_thickness
self.view_img = view_img
self.view_queue_counts = view_queue_counts
self.track_thickness = track_thickness
self.draw_tracks = draw_tracks
self.region_color = region_color
if len(reg_pts) >= 3:
print("Queue region initiated...")
self.reg_pts = reg_pts
self.counting_region = Polygon(self.reg_pts)
else:
print("Invalid region points provided...")
print("Using default region now....")
self.counting_region = Polygon(self.reg_pts)
self.names = classes_names
self.track_color = track_color
self.count_txt_color = count_txt_color
self.region_thickness = region_thickness
self.fontsize = fontsize
def extract_and_process_tracks(self, tracks):
"""Extracts and processes tracks for queue management in a video stream."""
# Annotator Init and queue region drawing
self.annotator = Annotator(self.im0, self.tf, self.names)
if tracks[0].boxes.id is not None:
boxes = tracks[0].boxes.xyxy.cpu()
clss = tracks[0].boxes.cls.cpu().tolist()
track_ids = tracks[0].boxes.id.int().cpu().tolist()
# Extract tracks
for box, track_id, cls in zip(boxes, track_ids, clss):
# Draw bounding box
self.annotator.box_label(box, label=f"{self.names[cls]}#{track_id}", color=colors(int(track_id), True))
# Draw Tracks
track_line = self.track_history[track_id]
track_line.append((float((box[0] + box[2]) / 2), float((box[1] + box[3]) / 2)))
if len(track_line) > 30:
track_line.pop(0)
# Draw track trails
if self.draw_tracks:
self.annotator.draw_centroid_and_tracks(
track_line,
color=self.track_color if self.track_color else colors(int(track_id), True),
track_thickness=self.track_thickness,
)
prev_position = self.track_history[track_id][-2] if len(self.track_history[track_id]) > 1 else None
if len(self.reg_pts) >= 3:
is_inside = self.counting_region.contains(Point(track_line[-1]))
if prev_position is not None and is_inside:
self.counts += 1
label = "Queue Counts : " + str(self.counts)
if label is not None:
self.annotator.queue_counts_display(
label,
points=self.reg_pts,
region_color=self.region_color,
txt_color=self.count_txt_color,
fontsize=self.fontsize,
)
self.counts = 0
self.display_frames()
def display_frames(self):
"""Display frame."""
if self.env_check:
self.annotator.draw_region(reg_pts=self.reg_pts, thickness=self.region_thickness, color=self.region_color)
cv2.namedWindow(self.window_name)
cv2.imshow(self.window_name, self.im0)
# Break Window
if cv2.waitKey(1) & 0xFF == ord("q"):
return
def process_queue(self, im0, tracks):
"""
Main function to start the queue management process.
Args:
im0 (ndarray): Current frame from the video stream.
tracks (list): List of tracks obtained from the object tracking process.
"""
self.im0 = im0 # store image
self.extract_and_process_tracks(tracks) # draw region even if no objects
if self.view_img:
self.display_frames()
return self.im0
if __name__ == "__main__":
QueueManager()

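A hedged usage sketch for the new QueueManager above, assuming the file lives at ultralytics/solutions/queue_management.py; weights, video path and region points are placeholders.

import cv2
from ultralytics import YOLO
from ultralytics.solutions.queue_management import QueueManager

model = YOLO("yolov8n.pt")  # placeholder weights
cap = cv2.VideoCapture("path/to/video.mp4")  # placeholder video

queue = QueueManager()
queue.set_args(
    classes_names=model.names,
    reg_pts=[(20, 60), (20, 680), (1120, 680), (1120, 60)],  # polygon queue region (>= 3 points)
    line_thickness=3,
    view_img=True,
)

while cap.isOpened():
    success, im0 = cap.read()
    if not success:
        break
    tracks = model.track(im0, persist=True, show=False)
    im0 = queue.process_queue(im0, tracks)
cap.release()
cv2.destroyAllWindows()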
Some files were not shown because too many files have changed in this diff.
