diff --git a/docs/models/fast-sam.md b/docs/models/fast-sam.md
index 79d9f49093..d41ca4797b 100644
--- a/docs/models/fast-sam.md
+++ b/docs/models/fast-sam.md
@@ -47,22 +47,17 @@ To perform object detection on an image, use the `predict` method as shown below
         from ultralytics import FastSAM
         from ultralytics.models.fastsam import FastSAMPrompt
 
-        # Define image path and inference device
-        IMAGE_PATH = 'ultralytics/assets/bus.jpg'
-        DEVICE = 'cpu'
+        # Define an inference source
+        source = 'path/to/bus.jpg'
 
         # Create a FastSAM model
         model = FastSAM('FastSAM-s.pt')  # or FastSAM-x.pt
 
         # Run inference on an image
-        everything_results = model(IMAGE_PATH,
-                                   device=DEVICE,
-                                   retina_masks=True,
-                                   imgsz=1024,
-                                   conf=0.4,
-                                   iou=0.9)
+        everything_results = model(source, device='cpu', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9)
 
-        prompt_process = FastSAMPrompt(IMAGE_PATH, everything_results, device=DEVICE)
+        # Prepare a Prompt Process object
+        prompt_process = FastSAMPrompt(source, everything_results, device='cpu')
 
         # Everything prompt
         ann = prompt_process.everything_prompt()
@@ -79,6 +74,12 @@ To perform object detection on an image, use the `predict` method as shown below
         ann = prompt_process.point_prompt(points=[[200, 200]], pointlabel=[1])
         prompt_process.plot(annotations=ann, output='./')
         ```
+
+    === "CLI"
+        ```bash
+        # Load a FastSAM model and segment everything with it
+        yolo segment predict model=FastSAM-s.pt source=path/to/bus.jpg imgsz=640
+        ```
 
 This snippet demonstrates the simplicity of loading a pre-trained model and running a prediction on an image.
 
@@ -89,7 +90,6 @@ Validation of the model on a dataset can be done as follows:
 !!! example ""
 
     === "Python"
-
         ```python
         from ultralytics import FastSAM
 
@@ -100,6 +100,12 @@ Validation of the model on a dataset can be done as follows:
         # Create a FastSAM model
         model = FastSAM('FastSAM-s.pt')  # or FastSAM-x.pt
 
         # Validate the model
         results = model.val(data='coco8-seg.yaml')
         ```
 
+    === "CLI"
+        ```bash
+        # Load a FastSAM model and validate it on the COCO8 example dataset at image size 640
+        yolo segment val model=FastSAM-s.pt data=coco8-seg.yaml imgsz=640
+        ```
+
 Please note that FastSAM only supports detection and segmentation of a single class of object. This means it will recognize and segment all objects as the same class. Therefore, when preparing the dataset, you need to convert all object category IDs to 0.
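That last note is easy to trip over, so here is a minimal sketch of the dataset-preparation step it describes. It is not part of the diff above: `path/to/dataset/labels` is a placeholder, and YOLO-style `.txt` label files with the class ID as the first token on each line are assumed.

```python
from pathlib import Path

labels_dir = Path('path/to/dataset/labels')  # placeholder: your dataset's YOLO-format label folder

for label_file in labels_dir.rglob('*.txt'):
    remapped = []
    for line in label_file.read_text().splitlines():
        parts = line.split()
        if parts:  # skip empty lines
            parts[0] = '0'  # first token is the class ID; FastSAM expects a single class 0
            remapped.append(' '.join(parts))
    label_file.write_text('\n'.join(remapped) + '\n')
```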
### FastSAM official Usage diff --git a/tests/conftest.py b/tests/conftest.py index 020ac7a3ae..153dc86bc7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + import pytest diff --git a/tests/test_cli.py b/tests/test_cli.py index 6cf68d122f..1ee412fc09 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -8,12 +8,16 @@ import pytest from ultralytics.utils import ONLINE, ROOT, SETTINGS WEIGHT_DIR = Path(SETTINGS['weights_dir']) -TASK_ARGS = [ # (task, model, data) - ('detect', 'yolov8n', 'coco8.yaml'), ('segment', 'yolov8n-seg', 'coco8-seg.yaml'), - ('classify', 'yolov8n-cls', 'imagenet10'), ('pose', 'yolov8n-pose', 'coco8-pose.yaml')] -EXPORT_ARGS = [ # (model, format) - ('yolov8n', 'torchscript'), ('yolov8n-seg', 'torchscript'), ('yolov8n-cls', 'torchscript'), - ('yolov8n-pose', 'torchscript')] +TASK_ARGS = [ + ('detect', 'yolov8n', 'coco8.yaml'), + ('segment', 'yolov8n-seg', 'coco8-seg.yaml'), + ('classify', 'yolov8n-cls', 'imagenet10'), + ('pose', 'yolov8n-pose', 'coco8-pose.yaml'), ] # (task, model, data) +EXPORT_ARGS = [ + ('yolov8n', 'torchscript'), + ('yolov8n-seg', 'torchscript'), + ('yolov8n-cls', 'torchscript'), + ('yolov8n-pose', 'torchscript'), ] # (model, format) def run(cmd): @@ -22,9 +26,12 @@ def run(cmd): def test_special_modes(): - run('yolo checks') - run('yolo settings') run('yolo help') + run('yolo checks') + run('yolo version') + run('yolo settings reset') + run('yolo copy-cfg') + run('yolo cfg') @pytest.mark.parametrize('task,model,data', TASK_ARGS) @@ -34,21 +41,82 @@ def test_train(task, model, data): @pytest.mark.parametrize('task,model,data', TASK_ARGS) def test_val(task, model, data): - run(f'yolo val {task} model={model}.pt data={data} imgsz=32') + run(f'yolo val {task} model={WEIGHT_DIR / model}.pt data={data} imgsz=32') @pytest.mark.parametrize('task,model,data', TASK_ARGS) def test_predict(task, model, data): - run(f"yolo predict model={model}.pt source={ROOT / 'assets'} imgsz=32 save save_crop save_txt") - if ONLINE: - run(f'yolo predict model={model}.pt source=https://ultralytics.com/images/bus.jpg imgsz=32') - run(f'yolo predict model={model}.pt source=https://ultralytics.com/assets/decelera_landscape_min.mov imgsz=32') - run(f'yolo predict model={model}.pt source=https://ultralytics.com/assets/decelera_portrait_min.mov imgsz=32') + run(f"yolo predict model={WEIGHT_DIR / model}.pt source={ROOT / 'assets'} imgsz=32 save save_crop save_txt") + + +@pytest.mark.skipif(not ONLINE, reason='environment is offline') +@pytest.mark.parametrize('task,model,data', TASK_ARGS) +def test_predict_online(task, model, data): + mode = 'track' if task in ('detect', 'segment', 'pose') else 'predict' # mode for video inference + run(f'yolo predict model={WEIGHT_DIR / model}.pt source=https://ultralytics.com/images/bus.jpg imgsz=32') + run(f'yolo {mode} model={WEIGHT_DIR / model}.pt source=https://ultralytics.com/assets/decelera_landscape_min.mov imgsz=32' + ) + + # Run Python YouTube tracking because CLI is broken. 
TODO: fix CLI YouTube + # run(f'yolo {mode} model={model}.pt source=https://youtu.be/G17sBkb38XQ imgsz=32 tracker=bytetrack.yaml') @pytest.mark.parametrize('model,format', EXPORT_ARGS) def test_export(model, format): - run(f'yolo export model={model}.pt format={format}') + run(f'yolo export model={WEIGHT_DIR / model}.pt format={format} imgsz=32') + + +# Test SAM, RTDETR Models +def test_rtdetr(task='detect', model='yolov8n-rtdetr.yaml', data='coco8.yaml'): + # Warning: MUST use imgsz=640 + run(f'yolo train {task} model={model} data={data} imgsz=640 epochs=1 cache=disk') + run(f'yolo val {task} model={model} data={data} imgsz=640') + run(f"yolo predict {task} model={model} source={ROOT / 'assets/bus.jpg'} imgsz=640 save save_crop save_txt") + + +def test_fastsam(task='segment', model='FastSAM-s.pt', data='coco8-seg.yaml'): + source = ROOT / 'assets/bus.jpg' + + run(f'yolo segment val {task} model={model} data={data} imgsz=32') + run(f'yolo segment predict model={model} source={source} imgsz=32 save save_crop save_txt') + + from ultralytics import FastSAM + from ultralytics.models.fastsam import FastSAMPrompt + + # Create a FastSAM model + model = FastSAM('FastSAM-s.pt') # or FastSAM-x.pt + + # Run inference on an image + everything_results = model(source, device='cpu', retina_masks=True, imgsz=1024, conf=0.4, iou=0.9) + + # Everything prompt + prompt_process = FastSAMPrompt(source, everything_results, device='cpu') + ann = prompt_process.everything_prompt() + + # Bbox default shape [0,0,0,0] -> [x1,y1,x2,y2] + ann = prompt_process.box_prompt(bbox=[200, 200, 300, 300]) + + # Text prompt + ann = prompt_process.text_prompt(text='a photo of a dog') + + # Point prompt + # points default [[0,0]] [[x1,y1],[x2,y2]] + # point_label default [0] [1,0] 0:background, 1:foreground + ann = prompt_process.point_prompt(points=[[200, 200]], pointlabel=[1]) + prompt_process.plot(annotations=ann, output='./') + + +def test_mobilesam(): + from ultralytics import SAM + + # Load the model + model = SAM('mobile_sam.pt') + + # Predict a segment based on a point prompt + model.predict(ROOT / 'assets/zidane.jpg', points=[900, 370], labels=[1]) + + # Predict a segment based on a box prompt + model.predict(ROOT / 'assets/zidane.jpg', bboxes=[439, 437, 524, 709]) # Slow Tests diff --git a/tests/test_engine.py b/tests/test_engine.py index e98b90eaf8..77342692c5 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -10,13 +10,13 @@ from ultralytics.utils import DEFAULT_CFG, ROOT, SETTINGS CFG_DET = 'yolov8n.yaml' CFG_SEG = 'yolov8n-seg.yaml' -CFG_CLS = 'squeezenet1_0' +CFG_CLS = 'yolov8n-cls.yaml' # or 'squeezenet1_0' CFG = get_cfg(DEFAULT_CFG) MODEL = Path(SETTINGS['weights_dir']) / 'yolov8n' SOURCE = ROOT / 'assets' -def test_func(model=None): +def test_func(*args): # noqa print('callback test passed') @@ -31,6 +31,7 @@ def test_export(): def test_detect(): overrides = {'data': 'coco8.yaml', 'model': CFG_DET, 'imgsz': 32, 'epochs': 1, 'save': False} CFG.data = 'coco8.yaml' + CFG.imgsz = 32 # Trainer trainer = detect.DetectionTrainer(overrides=overrides) @@ -65,6 +66,7 @@ def test_detect(): def test_segment(): overrides = {'data': 'coco8-seg.yaml', 'model': CFG_SEG, 'imgsz': 32, 'epochs': 1, 'save': False} CFG.data = 'coco8-seg.yaml' + CFG.imgsz = 32 # YOLO(CFG_SEG).train(**overrides) # works # trainer @@ -99,7 +101,7 @@ def test_segment(): def test_classify(): - overrides = {'data': 'imagenet10', 'model': 'yolov8n-cls.yaml', 'imgsz': 32, 'epochs': 1, 'save': False} + overrides = {'data': 'imagenet10', 'model': 
CFG_CLS, 'imgsz': 32, 'epochs': 1, 'save': False} CFG.data = 'imagenet10' CFG.imgsz = 32 # YOLO(CFG_SEG).train(**overrides) # works diff --git a/tests/test_python.py b/tests/test_python.py index 4053480e99..37fe782918 100644 --- a/tests/test_python.py +++ b/tests/test_python.py @@ -10,9 +10,11 @@ from torchvision.transforms import ToTensor from ultralytics import RTDETR, YOLO from ultralytics.data.build import load_inference_source -from ultralytics.utils import LINUX, ONLINE, ROOT, SETTINGS +from ultralytics.utils import LINUX, MACOS, ONLINE, ROOT, SETTINGS +from ultralytics.utils.torch_utils import TORCH_1_9 -MODEL = Path(SETTINGS['weights_dir']) / 'path with spaces' / 'yolov8n.pt' # test spaces in path +WEIGHTS_DIR = Path(SETTINGS['weights_dir']) +MODEL = WEIGHTS_DIR / 'path with spaces' / 'yolov8n.pt' # test spaces in path CFG = 'yolov8n.yaml' SOURCE = ROOT / 'assets/bus.jpg' SOURCE_GREYSCALE = Path(f'{SOURCE.parent / SOURCE.stem}_greyscale.jpg') @@ -26,39 +28,35 @@ im.convert('RGBA').save(SOURCE_RGBA) # 4-ch PNG with alpha def test_model_forward(): model = YOLO(CFG) - model(SOURCE) + model(SOURCE, imgsz=32) def test_model_info(): - model = YOLO(CFG) - model.info() model = YOLO(MODEL) model.info(verbose=True) def test_model_fuse(): - model = YOLO(CFG) - model.fuse() model = YOLO(MODEL) model.fuse() def test_predict_dir(): model = YOLO(MODEL) - model(source=ROOT / 'assets') + model(source=ROOT / 'assets', imgsz=32) def test_predict_img(): model = YOLO(MODEL) - seg_model = YOLO('yolov8n-seg.pt') - cls_model = YOLO('yolov8n-cls.pt') - pose_model = YOLO('yolov8n-pose.pt') + seg_model = YOLO(WEIGHTS_DIR / 'yolov8n-seg.pt') + cls_model = YOLO(WEIGHTS_DIR / 'yolov8n-cls.pt') + pose_model = YOLO(WEIGHTS_DIR / 'yolov8n-pose.pt') im = cv2.imread(str(SOURCE)) - assert len(model(source=Image.open(SOURCE), save=True, verbose=True)) == 1 # PIL - assert len(model(source=im, save=True, save_txt=True)) == 1 # ndarray - assert len(model(source=[im, im], save=True, save_txt=True)) == 2 # batch - assert len(list(model(source=[im, im], save=True, stream=True))) == 2 # stream - assert len(model(torch.zeros(320, 640, 3).numpy())) == 1 # tensor to numpy + assert len(model(source=Image.open(SOURCE), save=True, verbose=True, imgsz=32)) == 1 # PIL + assert len(model(source=im, save=True, save_txt=True, imgsz=32)) == 1 # ndarray + assert len(model(source=[im, im], save=True, save_txt=True, imgsz=32)) == 2 # batch + assert len(list(model(source=[im, im], save=True, stream=True, imgsz=32))) == 2 # stream + assert len(model(torch.zeros(320, 640, 3).numpy(), imgsz=32)) == 1 # tensor to numpy batch = [ str(SOURCE), # filename Path(SOURCE), # Path @@ -66,20 +64,20 @@ def test_predict_img(): cv2.imread(str(SOURCE)), # OpenCV Image.open(SOURCE), # PIL np.zeros((320, 640, 3))] # numpy - assert len(model(batch, visualize=True)) == len(batch) # multiple sources in a batch + assert len(model(batch, imgsz=32)) == len(batch) # multiple sources in a batch # Test tensor inference im = cv2.imread(str(SOURCE)) # OpenCV t = cv2.resize(im, (32, 32)) t = ToTensor()(t) t = torch.stack([t, t, t, t]) - results = model(t, visualize=True) + results = model(t, imgsz=32) assert len(results) == t.shape[0] - results = seg_model(t, visualize=True) + results = seg_model(t, imgsz=32) assert len(results) == t.shape[0] - results = cls_model(t, visualize=True) + results = cls_model(t, imgsz=32) assert len(results) == t.shape[0] - results = pose_model(t, visualize=True) + results = pose_model(t, imgsz=32) assert len(results) == t.shape[0] @@ -87,16 
+85,17 @@ def test_predict_grey_and_4ch(): model = YOLO(MODEL) for f in SOURCE_RGBA, SOURCE_GREYSCALE: for source in Image.open(f), cv2.imread(str(f)), f: - model(source, save=True, verbose=True) + model(source, save=True, verbose=True, imgsz=32) -def test_val(): +def test_track_stream(): + # Test YouTube streaming inference (short 10 frame video) with non-default ByteTrack tracker model = YOLO(MODEL) - model.val(data='coco8.yaml', imgsz=32) + model.track('https://youtu.be/G17sBkb38XQ', imgsz=32, tracker='bytetrack.yaml') -def test_val_scratch(): - model = YOLO(CFG) +def test_val(): + model = YOLO(MODEL) model.val(data='coco8.yaml', imgsz=32) @@ -109,7 +108,7 @@ def test_amp(): def test_train_scratch(): model = YOLO(CFG) - model.train(data='coco8.yaml', epochs=1, imgsz=32, cache='disk') # test disk caching + model.train(data='coco8.yaml', epochs=1, imgsz=32, cache='disk', batch=-1) # test disk caching with AutoBatch model(SOURCE) @@ -125,12 +124,6 @@ def test_export_torchscript(): YOLO(f)(SOURCE) # exported model inference -def test_export_torchscript_scratch(): - model = YOLO(CFG) - f = model.export(format='torchscript') - YOLO(f)(SOURCE) # exported model inference - - def test_export_onnx(): model = YOLO(MODEL) f = model.export(format='onnx') @@ -138,14 +131,15 @@ def test_export_onnx(): def test_export_openvino(): - model = YOLO(MODEL) - f = model.export(format='openvino') - YOLO(f)(SOURCE) # exported model inference + if not MACOS: + model = YOLO(MODEL) + f = model.export(format='openvino') + YOLO(f)(SOURCE) # exported model inference def test_export_coreml(): # sourcery skip: move-assign model = YOLO(MODEL) - model.export(format='coreml') + model.export(format='coreml', nms=True) # if MACOS: # YOLO(f)(SOURCE) # model prediction only supported on macOS @@ -174,9 +168,10 @@ def test_export_paddle(enabled=False): def test_all_model_yamls(): - for m in list((ROOT / 'models').rglob('yolo*.yaml')): - if m.name == 'yolov8-rtdetr.yaml': # except the rtdetr model - RTDETR(m.name) + for m in (ROOT / 'cfg' / 'models').rglob('*.yaml'): + if 'rtdetr' in m.name: + if TORCH_1_9: # torch<=1.8 issue - TypeError: __init__() got an unexpected keyword argument 'batch_first' + RTDETR(m.name) else: YOLO(m.name) @@ -190,10 +185,9 @@ def test_workflow(): def test_predict_callback_and_setup(): - # test callback addition for prediction + # Test callback addition for prediction def on_predict_batch_end(predictor): # results -> List[batch_size] path, im0s, _, _ = predictor.batch - # print('on_predict_batch_end', im0s[0].shape) im0s = im0s if isinstance(im0s, list) else [im0s] bs = [predictor.dataset.bs for _ in range(len(path))] predictor.results = zip(predictor.results, im0s, bs) @@ -204,42 +198,26 @@ def test_predict_callback_and_setup(): dataset = load_inference_source(source=SOURCE) bs = dataset.bs # noqa access predictor properties results = model.predict(dataset, stream=True) # source already setup - for _, (result, im0, bs) in enumerate(results): + for r, im0, bs in results: print('test_callback', im0.shape) print('test_callback', bs) - boxes = result.boxes # Boxes object for bbox outputs + boxes = r.boxes # Boxes object for bbox outputs print(boxes) -def _test_results_api(res): - # General apis except plot - res = res.cpu().numpy() - # res = res.cuda() - res = res.to(device='cpu', dtype=torch.float32) - res.save_txt('label.txt', save_conf=False) - res.save_txt('label.txt', save_conf=True) - res.save_crop('crops/') - res.tojson(normalize=False) - res.tojson(normalize=True) - res.plot(pil=True) - 
res.plot(conf=True, boxes=False) - res.plot() - print(res) - print(res.path) - for k in res.keys: - print(getattr(res, k)) - - def test_results(): - for m in ['yolov8n-pose.pt', 'yolov8n-seg.pt', 'yolov8n.pt', 'yolov8n-cls.pt']: - model = YOLO(m) - res = model([SOURCE, SOURCE]) - _test_results_api(res[0]) - - -def test_track(): - im = cv2.imread(str(SOURCE)) - for m in ['yolov8n-pose.pt', 'yolov8n-seg.pt', 'yolov8n.pt']: + for m in 'yolov8n-pose.pt', 'yolov8n-seg.pt', 'yolov8n.pt', 'yolov8n-cls.pt': model = YOLO(m) - res = model.track(source=im) - _test_results_api(res[0]) + results = model([SOURCE, SOURCE]) + for r in results: + r = r.cpu().numpy() + r = r.to(device='cpu', dtype=torch.float32) + r.save_txt(txt_file='label.txt', save_conf=True) + r.save_crop(save_dir='crops/') + r.tojson(normalize=True) + r.plot(pil=True) + r.plot(conf=True, boxes=True) + print(r) + print(r.path) + for k in r.keys: + print(getattr(r, k)) diff --git a/ultralytics/models/fastsam/prompt.py b/ultralytics/models/fastsam/prompt.py index ead6319117..0d42c40951 100644 --- a/ultralytics/models/fastsam/prompt.py +++ b/ultralytics/models/fastsam/prompt.py @@ -1,6 +1,7 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license import os +from pathlib import Path import cv2 import matplotlib.pyplot as plt @@ -8,6 +9,8 @@ import numpy as np import torch from PIL import Image +from ultralytics.utils import LOGGER + class FastSAMPrompt: @@ -15,8 +18,8 @@ class FastSAMPrompt: # self.img_path = img_path self.device = device self.results = results - self.img_path = img_path - self.ori_img = cv2.imread(img_path) + self.img_path = str(img_path) + self.ori_img = cv2.imread(self.img_path) # Import and assign clip try: @@ -111,7 +114,7 @@ class FastSAMPrompt: original_w = image.shape[1] # for macOS only # plt.switch_backend('TkAgg') - plt.figure(figsize=(original_w / 100, original_h / 100)) + fig = plt.figure(figsize=(original_w / 100, original_h / 100)) # Add subplot with no margin. 
plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) plt.margins(0, 0) @@ -174,21 +177,11 @@ class FastSAMPrompt: contour_mask = temp / 255 * color.reshape(1, 1, -1) plt.imshow(contour_mask) - save_path = output - if not os.path.exists(save_path): - os.makedirs(save_path) + save_path = Path(output) / result_name + save_path.parent.mkdir(exist_ok=True, parents=True) plt.axis('off') - fig = plt.gcf() - plt.draw() - - try: - buf = fig.canvas.tostring_rgb() - except AttributeError: - fig.canvas.draw() - buf = fig.canvas.tostring_rgb() - cols, rows = fig.canvas.get_width_height() - img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3) - cv2.imwrite(os.path.join(save_path, result_name), cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)) + fig.savefig(save_path) + LOGGER.info(f'Saved to {save_path.absolute()}') # CPU post process def fast_show_mask( diff --git a/ultralytics/models/fastsam/val.py b/ultralytics/models/fastsam/val.py index 1d8442a10a..fa25e49ff3 100644 --- a/ultralytics/models/fastsam/val.py +++ b/ultralytics/models/fastsam/val.py @@ -1,231 +1,14 @@ # Ultralytics YOLO 🚀, AGPL-3.0 license -from multiprocessing.pool import ThreadPool -from pathlib import Path +from ultralytics.models.yolo.segment import SegmentationValidator +from ultralytics.utils.metrics import SegmentMetrics -import numpy as np -import torch -import torch.nn.functional as F -from ultralytics.models.yolo.detect import DetectionValidator -from ultralytics.utils import LOGGER, NUM_THREADS, ops -from ultralytics.utils.checks import check_requirements -from ultralytics.utils.metrics import SegmentMetrics, box_iou, mask_iou -from ultralytics.utils.plotting import output_to_target, plot_images - - -class FastSAMValidator(DetectionValidator): +class FastSAMValidator(SegmentationValidator): def __init__(self, dataloader=None, save_dir=None, pbar=None, args=None, _callbacks=None): """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics.""" super().__init__(dataloader, save_dir, pbar, args, _callbacks) self.args.task = 'segment' + self.args.plots = False # disable ConfusionMatrix and other plots to avoid errors self.metrics = SegmentMetrics(save_dir=self.save_dir, on_plot=self.on_plot) - - def preprocess(self, batch): - """Preprocesses batch by converting masks to float and sending to device.""" - batch = super().preprocess(batch) - batch['masks'] = batch['masks'].to(self.device).float() - return batch - - def init_metrics(self, model): - """Initialize metrics and select mask processing function based on save_json flag.""" - super().init_metrics(model) - self.plot_masks = [] - if self.args.save_json: - check_requirements('pycocotools>=2.0.6') - self.process = ops.process_mask_upsample # more accurate - else: - self.process = ops.process_mask # faster - - def get_desc(self): - """Return a formatted description of evaluation metrics.""" - return ('%22s' + '%11s' * 10) % ('Class', 'Images', 'Instances', 'Box(P', 'R', 'mAP50', 'mAP50-95)', 'Mask(P', - 'R', 'mAP50', 'mAP50-95)') - - def postprocess(self, preds): - """Post-processes YOLO predictions and returns output detections with proto.""" - p = ops.non_max_suppression(preds[0], - self.args.conf, - self.args.iou, - labels=self.lb, - multi_label=True, - agnostic=self.args.single_cls, - max_det=self.args.max_det, - nc=self.nc) - proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported - return p, proto - - def update_metrics(self, preds, batch): - """Metrics.""" - 
for si, (pred, proto) in enumerate(zip(preds[0], preds[1])): - idx = batch['batch_idx'] == si - cls = batch['cls'][idx] - bbox = batch['bboxes'][idx] - nl, npr = cls.shape[0], pred.shape[0] # number of labels, predictions - shape = batch['ori_shape'][si] - correct_masks = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init - correct_bboxes = torch.zeros(npr, self.niou, dtype=torch.bool, device=self.device) # init - self.seen += 1 - - if npr == 0: - if nl: - self.stats.append((correct_bboxes, correct_masks, *torch.zeros( - (2, 0), device=self.device), cls.squeeze(-1))) - if self.args.plots: - self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1)) - continue - - # Masks - midx = [si] if self.args.overlap_mask else idx - gt_masks = batch['masks'][midx] - pred_masks = self.process(proto, pred[:, 6:], pred[:, :4], shape=batch['img'][si].shape[1:]) - - # Predictions - if self.args.single_cls: - pred[:, 5] = 0 - predn = pred.clone() - ops.scale_boxes(batch['img'][si].shape[1:], predn[:, :4], shape, - ratio_pad=batch['ratio_pad'][si]) # native-space pred - - # Evaluate - if nl: - height, width = batch['img'].shape[2:] - tbox = ops.xywh2xyxy(bbox) * torch.tensor( - (width, height, width, height), device=self.device) # target boxes - ops.scale_boxes(batch['img'][si].shape[1:], tbox, shape, - ratio_pad=batch['ratio_pad'][si]) # native-space labels - labelsn = torch.cat((cls, tbox), 1) # native-space labels - correct_bboxes = self._process_batch(predn, labelsn) - # TODO: maybe remove these `self.` arguments as they already are member variable - correct_masks = self._process_batch(predn, - labelsn, - pred_masks, - gt_masks, - overlap=self.args.overlap_mask, - masks=True) - if self.args.plots: - self.confusion_matrix.process_batch(predn, labelsn) - - # Append correct_masks, correct_boxes, pconf, pcls, tcls - self.stats.append((correct_bboxes, correct_masks, pred[:, 4], pred[:, 5], cls.squeeze(-1))) - - pred_masks = torch.as_tensor(pred_masks, dtype=torch.uint8) - if self.args.plots and self.batch_i < 3: - self.plot_masks.append(pred_masks[:15].cpu()) # filter top 15 to plot - - # Save - if self.args.save_json: - pred_masks = ops.scale_image(pred_masks.permute(1, 2, 0).contiguous().cpu().numpy(), - shape, - ratio_pad=batch['ratio_pad'][si]) - self.pred_to_json(predn, batch['im_file'][si], pred_masks) - # if self.args.save_txt: - # save_one_txt(predn, save_conf, shape, file=save_dir / 'labels' / f'{path.stem}.txt') - - def finalize_metrics(self, *args, **kwargs): - """Sets speed and confusion matrix for evaluation metrics.""" - self.metrics.speed = self.speed - self.metrics.confusion_matrix = self.confusion_matrix - - def _process_batch(self, detections, labels, pred_masks=None, gt_masks=None, overlap=False, masks=False): - """ - Return correct prediction matrix - Arguments: - detections (array[N, 6]), x1, y1, x2, y2, conf, class - labels (array[M, 5]), class, x1, y1, x2, y2 - Returns: - correct (array[N, 10]), for 10 IoU levels - """ - if masks: - if overlap: - nl = len(labels) - index = torch.arange(nl, device=gt_masks.device).view(nl, 1, 1) + 1 - gt_masks = gt_masks.repeat(nl, 1, 1) # shape(1,640,640) -> (n,640,640) - gt_masks = torch.where(gt_masks == index, 1.0, 0.0) - if gt_masks.shape[1:] != pred_masks.shape[1:]: - gt_masks = F.interpolate(gt_masks[None], pred_masks.shape[1:], mode='bilinear', align_corners=False)[0] - gt_masks = gt_masks.gt_(0.5) - iou = mask_iou(gt_masks.view(gt_masks.shape[0], -1), pred_masks.view(pred_masks.shape[0], -1)) - else: # 
boxes - iou = box_iou(labels[:, 1:], detections[:, :4]) - - return self.match_predictions(detections[:, 5], labels[:, 0], iou) - - def plot_val_samples(self, batch, ni): - """Plots validation samples with bounding box labels.""" - plot_images(batch['img'], - batch['batch_idx'], - batch['cls'].squeeze(-1), - batch['bboxes'], - batch['masks'], - paths=batch['im_file'], - fname=self.save_dir / f'val_batch{ni}_labels.jpg', - names=self.names, - on_plot=self.on_plot) - - def plot_predictions(self, batch, preds, ni): - """Plots batch predictions with masks and bounding boxes.""" - plot_images( - batch['img'], - *output_to_target(preds[0], max_det=15), # not set to self.args.max_det due to slow plotting speed - torch.cat(self.plot_masks, dim=0) if len(self.plot_masks) else self.plot_masks, - paths=batch['im_file'], - fname=self.save_dir / f'val_batch{ni}_pred.jpg', - names=self.names, - on_plot=self.on_plot) # pred - self.plot_masks.clear() - - def pred_to_json(self, predn, filename, pred_masks): - """Save one JSON result.""" - # Example result = {"image_id": 42, "category_id": 18, "bbox": [258.15, 41.29, 348.26, 243.78], "score": 0.236} - from pycocotools.mask import encode # noqa - - def single_encode(x): - """Encode predicted masks as RLE and append results to jdict.""" - rle = encode(np.asarray(x[:, :, None], order='F', dtype='uint8'))[0] - rle['counts'] = rle['counts'].decode('utf-8') - return rle - - stem = Path(filename).stem - image_id = int(stem) if stem.isnumeric() else stem - box = ops.xyxy2xywh(predn[:, :4]) # xywh - box[:, :2] -= box[:, 2:] / 2 # xy center to top-left corner - pred_masks = np.transpose(pred_masks, (2, 0, 1)) - with ThreadPool(NUM_THREADS) as pool: - rles = pool.map(single_encode, pred_masks) - for i, (p, b) in enumerate(zip(predn.tolist(), box.tolist())): - self.jdict.append({ - 'image_id': image_id, - 'category_id': self.class_map[int(p[5])], - 'bbox': [round(x, 3) for x in b], - 'score': round(p[4], 5), - 'segmentation': rles[i]}) - - def eval_json(self, stats): - """Return COCO-style object detection evaluation metrics.""" - if self.args.save_json and self.is_coco and len(self.jdict): - anno_json = self.data['path'] / 'annotations/instances_val2017.json' # annotations - pred_json = self.save_dir / 'predictions.json' # predictions - LOGGER.info(f'\nEvaluating pycocotools mAP using {pred_json} and {anno_json}...') - try: # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb - check_requirements('pycocotools>=2.0.6') - from pycocotools.coco import COCO # noqa - from pycocotools.cocoeval import COCOeval # noqa - - for x in anno_json, pred_json: - assert x.is_file(), f'{x} file not found' - anno = COCO(str(anno_json)) # init annotations api - pred = anno.loadRes(str(pred_json)) # init predictions api (must pass string, not Path) - for i, eval in enumerate([COCOeval(anno, pred, 'bbox'), COCOeval(anno, pred, 'segm')]): - if self.is_coco: - eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files] # im to eval - eval.evaluate() - eval.accumulate() - eval.summarize() - idx = i * 4 + 2 - stats[self.metrics.keys[idx + 1]], stats[ - self.metrics.keys[idx]] = eval.stats[:2] # update mAP50-95 and mAP50 - except Exception as e: - LOGGER.warning(f'pycocotools unable to run: {e}') - return stats diff --git a/ultralytics/models/rtdetr/model.py b/ultralytics/models/rtdetr/model.py index aa99f9da67..c20d72f643 100644 --- a/ultralytics/models/rtdetr/model.py +++ b/ultralytics/models/rtdetr/model.py @@ -16,7 +16,7 @@ class 
RTDETR(Model):
     """
 
     def __init__(self, model='rtdetr-l.pt') -> None:
-        if model and not model.split('.')[-1] in ('pt', 'yaml', 'yml'):
+        if model and model.split('.')[-1] not in ('pt', 'yaml', 'yml'):
             raise NotImplementedError('RT-DETR only supports creating from *.pt file or *.yaml file.')
         super().__init__(model=model, task='detect')
diff --git a/ultralytics/nn/modules/head.py b/ultralytics/nn/modules/head.py
index fffd102a3c..0b02eb3c64 100644
--- a/ultralytics/nn/modules/head.py
+++ b/ultralytics/nn/modules/head.py
@@ -9,7 +9,7 @@ import torch
 import torch.nn as nn
 from torch.nn.init import constant_, xavier_uniform_
 
-from ultralytics.utils.tal import dist2bbox, make_anchors
+from ultralytics.utils.tal import TORCH_1_10, dist2bbox, make_anchors
 
 from .block import DFL, Proto
 from .conv import Conv
@@ -267,9 +267,9 @@ class RTDETRDecoder(nn.Module):
     def _generate_anchors(self, shapes, grid_size=0.05, dtype=torch.float32, device='cpu', eps=1e-2):
         anchors = []
         for i, (h, w) in enumerate(shapes):
-            grid_y, grid_x = torch.meshgrid(torch.arange(end=h, dtype=dtype, device=device),
-                                            torch.arange(end=w, dtype=dtype, device=device),
-                                            indexing='ij')
+            sy = torch.arange(end=h, dtype=dtype, device=device)
+            sx = torch.arange(end=w, dtype=dtype, device=device)
+            grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
             grid_xy = torch.stack([grid_x, grid_y], -1)  # (h, w, 2)
 
             valid_WH = torch.tensor([h, w], dtype=dtype, device=device)
diff --git a/ultralytics/nn/modules/transformer.py b/ultralytics/nn/modules/transformer.py
index b3304cc8d8..3ef5e12178 100644
--- a/ultralytics/nn/modules/transformer.py
+++ b/ultralytics/nn/modules/transformer.py
@@ -22,6 +22,10 @@ class TransformerEncoderLayer(nn.Module):
 
     def __init__(self, c1, cm=2048, num_heads=8, dropout=0.0, act=nn.GELU(), normalize_before=False):
         super().__init__()
+        from ...utils.torch_utils import TORCH_1_9
+        if not TORCH_1_9:
+            raise ModuleNotFoundError(
+                'TransformerEncoderLayer() requires torch>=1.9 to use nn.MultiheadAttention(batch_first=True).')
         self.ma = nn.MultiheadAttention(c1, num_heads, dropout=dropout, batch_first=True)
         # Implementation of Feedforward model
         self.fc1 = nn.Linear(c1, cm)
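As context for the `head.py` change above: `torch.meshgrid` only accepts the `indexing` keyword from torch 1.10 onward, which is why the call is now gated on a version flag. A minimal standalone sketch of that pattern, reusing the `TORCH_1_10` flag this diff imports in `head.py` (the tensor sizes here are arbitrary):

```python
import torch

from ultralytics.utils.tal import TORCH_1_10  # same flag imported in head.py above

sy = torch.arange(end=4, dtype=torch.float32)
sx = torch.arange(end=6, dtype=torch.float32)

# torch>=1.10 accepts indexing='ij'; older versions reject the keyword but already behave like 'ij'
grid_y, grid_x = torch.meshgrid(sy, sx, indexing='ij') if TORCH_1_10 else torch.meshgrid(sy, sx)
print(grid_y.shape, grid_x.shape)  # both torch.Size([4, 6])
```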