diff --git a/ultralytics/cfg/models/rt-detr/rtdetr-l.yaml b/ultralytics/cfg/models/rt-detr/rtdetr-l.yaml index bd20da16f0..b7bb57fb8a 100644 --- a/ultralytics/cfg/models/rt-detr/rtdetr-l.yaml +++ b/ultralytics/cfg/models/rt-detr/rtdetr-l.yaml @@ -13,7 +13,7 @@ backbone: - [-1, 6, HGBlock, [48, 128, 3]] # stage 1 - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8 - - [-1, 6, HGBlock, [96, 512, 3]] # stage 2 + - [-1, 6, HGBlock, [96, 512, 3]] # stage 2 - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 4-P3/16 - [-1, 6, HGBlock, [192, 1024, 5, True, False]] # cm, c2, k, light, shortcut @@ -26,25 +26,25 @@ backbone: head: - [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 10 input_proj.2 - [-1, 1, AIFI, [1024, 8]] - - [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0 + - [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0 - [-1, 1, nn.Upsample, [None, 2, 'nearest']] - [7, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.1 - [[-2, -1], 1, Concat, [1]] - [-1, 3, RepC3, [256]] # 16, fpn_blocks.0 - - [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1 + - [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1 - [-1, 1, nn.Upsample, [None, 2, 'nearest']] - [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 19 input_proj.0 - [[-2, -1], 1, Concat, [1]] # cat backbone P4 - - [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1 + - [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1 - - [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0 + - [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0 - [[-1, 17], 1, Concat, [1]] # cat Y4 - - [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0 + - [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0 - - [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1 + - [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1 - [[-1, 12], 1, Concat, [1]] # cat Y5 - - [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1 + - [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1 - [[21, 24, 27], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5) diff --git 
a/ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml b/ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml new file mode 100644 index 0000000000..949a7e6ad5 --- /dev/null +++ b/ultralytics/cfg/models/rt-detr/rtdetr-resnet101.yaml @@ -0,0 +1,42 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +# RT-DETR-ResNet101 object detection model with P3-P5 outputs. + +# Parameters +nc: 80 # number of classes +scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n' + # [depth, width, max_channels] + l: [1.00, 1.00, 1024] + +backbone: + # [from, repeats, module, args] + - [-1, 1, ResNetLayer, [3, 64, 1, True, 1]] # 0 + - [-1, 1, ResNetLayer, [64, 64, 1, False, 3]] # 1 + - [-1, 1, ResNetLayer, [256, 128, 2, False, 4]] # 2 + - [-1, 1, ResNetLayer, [512, 256, 2, False, 23]] # 3 + - [-1, 1, ResNetLayer, [1024, 512, 2, False, 3]] # 4 + +head: + - [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 5 input_proj.2 + - [-1, 1, AIFI, [1024, 8]] + - [-1, 1, Conv, [256, 1, 1]] # 7, Y5, lateral_convs.0 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 9 input_proj.1 + - [[-2, -1], 1, Concat, [1]] + - [-1, 3, RepC3, [256]] # 11, fpn_blocks.0 + - [-1, 1, Conv, [256, 1, 1]] # 12, Y4, lateral_convs.1 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [2, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.0 + - [[-2, -1], 1, Concat, [1]] # cat backbone P3 + - [-1, 3, RepC3, [256]] # X3 (16), fpn_blocks.1 + + - [-1, 1, Conv, [256, 3, 2]] # 17, downsample_convs.0 + - [[-1, 12], 1, Concat, [1]] # cat Y4 + - [-1, 3, RepC3, [256]] # F4 (19), pan_blocks.0 + + - [-1, 1, Conv, [256, 3, 2]] # 20, downsample_convs.1 + - [[-1, 7], 1, Concat, [1]] # cat Y5 + - [-1, 3, RepC3, [256]] # F5 (22), pan_blocks.1 + + - [[16, 19, 22], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml b/ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml new file mode 100644 index 0000000000..bd1228d17a --- /dev/null +++ 
b/ultralytics/cfg/models/rt-detr/rtdetr-resnet50.yaml @@ -0,0 +1,42 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +# RT-DETR-ResNet50 object detection model with P3-P5 outputs. + +# Parameters +nc: 80 # number of classes +scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n' + # [depth, width, max_channels] + l: [1.00, 1.00, 1024] + +backbone: + # [from, repeats, module, args] + - [-1, 1, ResNetLayer, [3, 64, 1, True, 1]] # 0 + - [-1, 1, ResNetLayer, [64, 64, 1, False, 3]] # 1 + - [-1, 1, ResNetLayer, [256, 128, 2, False, 4]] # 2 + - [-1, 1, ResNetLayer, [512, 256, 2, False, 6]] # 3 + - [-1, 1, ResNetLayer, [1024, 512, 2, False, 3]] # 4 + +head: + - [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 5 input_proj.2 + - [-1, 1, AIFI, [1024, 8]] + - [-1, 1, Conv, [256, 1, 1]] # 7, Y5, lateral_convs.0 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 9 input_proj.1 + - [[-2, -1], 1, Concat, [1]] + - [-1, 3, RepC3, [256]] # 11, fpn_blocks.0 + - [-1, 1, Conv, [256, 1, 1]] # 12, Y4, lateral_convs.1 + + - [-1, 1, nn.Upsample, [None, 2, 'nearest']] + - [2, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.0 + - [[-2, -1], 1, Concat, [1]] # cat backbone P3 + - [-1, 3, RepC3, [256]] # X3 (16), fpn_blocks.1 + + - [-1, 1, Conv, [256, 3, 2]] # 17, downsample_convs.0 + - [[-1, 12], 1, Concat, [1]] # cat Y4 + - [-1, 3, RepC3, [256]] # F4 (19), pan_blocks.0 + + - [-1, 1, Conv, [256, 3, 2]] # 20, downsample_convs.1 + - [[-1, 7], 1, Concat, [1]] # cat Y5 + - [-1, 3, RepC3, [256]] # F5 (22), pan_blocks.1 + + - [[16, 19, 22], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/cfg/models/rt-detr/rtdetr-x.yaml b/ultralytics/cfg/models/rt-detr/rtdetr-x.yaml index 848cb52b1f..2894bc0d94 100644 --- a/ultralytics/cfg/models/rt-detr/rtdetr-x.yaml +++ b/ultralytics/cfg/models/rt-detr/rtdetr-x.yaml @@ -14,7 +14,7 @@ backbone: - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8 - [-1, 6, HGBlock, [128, 512, 3]] - - [-1, 6, HGBlock, [128, 512, 3, False, True]] # 
4-stage 2 + - [-1, 6, HGBlock, [128, 512, 3, False, True]] # 4-stage 2 - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 5-P3/16 - [-1, 6, HGBlock, [256, 1024, 5, True, False]] # cm, c2, k, light, shortcut @@ -30,25 +30,25 @@ backbone: head: - [-1, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 14 input_proj.2 - [-1, 1, AIFI, [2048, 8]] - - [-1, 1, Conv, [384, 1, 1]] # 16, Y5, lateral_convs.0 + - [-1, 1, Conv, [384, 1, 1]] # 16, Y5, lateral_convs.0 - [-1, 1, nn.Upsample, [None, 2, 'nearest']] - [10, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 18 input_proj.1 - [[-2, -1], 1, Concat, [1]] - [-1, 3, RepC3, [384]] # 20, fpn_blocks.0 - - [-1, 1, Conv, [384, 1, 1]] # 21, Y4, lateral_convs.1 + - [-1, 1, Conv, [384, 1, 1]] # 21, Y4, lateral_convs.1 - [-1, 1, nn.Upsample, [None, 2, 'nearest']] - [4, 1, Conv, [384, 1, 1, None, 1, 1, False]] # 23 input_proj.0 - [[-2, -1], 1, Concat, [1]] # cat backbone P4 - - [-1, 3, RepC3, [384]] # X3 (25), fpn_blocks.1 + - [-1, 3, RepC3, [384]] # X3 (25), fpn_blocks.1 - - [-1, 1, Conv, [384, 3, 2]] # 26, downsample_convs.0 + - [-1, 1, Conv, [384, 3, 2]] # 26, downsample_convs.0 - [[-1, 21], 1, Concat, [1]] # cat Y4 - - [-1, 3, RepC3, [384]] # F4 (28), pan_blocks.0 + - [-1, 3, RepC3, [384]] # F4 (28), pan_blocks.0 - - [-1, 1, Conv, [384, 3, 2]] # 29, downsample_convs.1 + - [-1, 1, Conv, [384, 3, 2]] # 29, downsample_convs.1 - [[-1, 16], 1, Concat, [1]] # cat Y5 - - [-1, 3, RepC3, [384]] # F5 (31), pan_blocks.1 + - [-1, 3, RepC3, [384]] # F5 (31), pan_blocks.1 - [[25, 28, 31], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5) diff --git a/ultralytics/nn/modules/__init__.py b/ultralytics/nn/modules/__init__.py index 584a394f72..dfcb0ec97c 100644 --- a/ultralytics/nn/modules/__init__.py +++ b/ultralytics/nn/modules/__init__.py @@ -18,7 +18,7 @@ Example: """ from .block import (C1, C2, C3, C3TR, DFL, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, GhostBottleneck, - HGBlock, HGStem, Proto, RepC3) + HGBlock, HGStem, Proto, RepC3, ResNetLayer) from .conv 
import (CBAM, ChannelAttention, Concat, Conv, Conv2, ConvTranspose, DWConv, DWConvTranspose2d, Focus, GhostConv, LightConv, RepConv, SpatialAttention) from .head import Classify, Detect, Pose, RTDETRDecoder, Segment @@ -30,4 +30,4 @@ __all__ = ('Conv', 'Conv2', 'LightConv', 'RepConv', 'DWConv', 'DWConvTranspose2d 'TransformerBlock', 'MLPBlock', 'LayerNorm2d', 'DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost', 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'Detect', 'Segment', 'Pose', 'Classify', 'TransformerEncoderLayer', 'RepC3', 'RTDETRDecoder', 'AIFI', - 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP') + 'DeformableTransformerDecoder', 'DeformableTransformerDecoderLayer', 'MSDeformAttn', 'MLP', 'ResNetLayer') diff --git a/ultralytics/nn/modules/block.py b/ultralytics/nn/modules/block.py index 778dcec380..c754cdfc0f 100644 --- a/ultralytics/nn/modules/block.py +++ b/ultralytics/nn/modules/block.py @@ -9,7 +9,7 @@ from .conv import Conv, DWConv, GhostConv, LightConv, RepConv from .transformer import TransformerBlock __all__ = ('DFL', 'HGBlock', 'HGStem', 'SPP', 'SPPF', 'C1', 'C2', 'C3', 'C2f', 'C3x', 'C3TR', 'C3Ghost', - 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3') + 'GhostBottleneck', 'Bottleneck', 'BottleneckCSP', 'Proto', 'RepC3', 'ResNetLayer') class DFL(nn.Module): @@ -331,3 +331,41 @@ class BottleneckCSP(nn.Module): y1 = self.cv3(self.m(self.cv1(x))) y2 = self.cv2(x) return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1)))) + + +class ResNetBlock(nn.Module): + """ResNet bottleneck block: 1x1, 3x3 and 1x1 Conv layers plus a shortcut branch, followed by ReLU.""" + + def __init__(self, c1, c2, s=1, e=4): + """Create convs c1->c2 (1x1), c2->c2 (3x3, stride s), c2->e*c2 (1x1); shortcut is 1x1 Conv or Identity.""" + super().__init__() + c3 = e * c2 + self.cv1 = Conv(c1, c2, k=1, s=1, act=True) + self.cv2 = Conv(c2, c2, k=3, s=s, p=1, act=True) + self.cv3 = Conv(c2, c3, k=1, act=False) + self.shortcut = nn.Sequential(Conv(c1, c3, k=1, s=s, 
act=False)) if s != 1 or c1 != c3 else nn.Identity() + + def forward(self, x): + """Return ReLU of the conv branch output added to the shortcut output.""" + return F.relu(self.cv3(self.cv2(self.cv1(x))) + self.shortcut(x)) + + +class ResNetLayer(nn.Module): + """ResNet stage: a 7x7 stride-2 Conv + 3x3 stride-2 max-pool stem if is_first, else a stack of ResNetBlocks.""" + + def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4): + """Build the stem if is_first, else one ResNetBlock with stride s followed by n - 1 blocks with stride 1.""" + super().__init__() + self.is_first = is_first + + if self.is_first: + self.layer = nn.Sequential(Conv(c1, c2, k=7, s=2, p=3, act=True), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + else: + blocks = [ResNetBlock(c1, c2, s, e=e)] + blocks.extend([ResNetBlock(e * c2, c2, 1, e=e) for _ in range(n - 1)]) + self.layer = nn.Sequential(*blocks) + + def forward(self, x): + """Apply the stem or the block sequence built in __init__ to x.""" + return self.layer(x) diff --git a/ultralytics/nn/tasks.py b/ultralytics/nn/tasks.py index 55aa51a3ce..a834f73dd3 100644 --- a/ultralytics/nn/tasks.py +++ b/ultralytics/nn/tasks.py @@ -10,7 +10,7 @@ import torch.nn as nn from ultralytics.nn.modules import (AIFI, C1, C2, C3, C3TR, SPP, SPPF, Bottleneck, BottleneckCSP, C2f, C3Ghost, C3x, Classify, Concat, Conv, Conv2, ConvTranspose, Detect, DWConv, DWConvTranspose2d, Focus, GhostBottleneck, GhostConv, HGBlock, HGStem, Pose, RepC3, RepConv, - RTDETRDecoder, Segment) + ResNetLayer, RTDETRDecoder, Segment) from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml from ultralytics.utils.loss import v8ClassificationLoss, v8DetectionLoss, v8PoseLoss, v8SegmentationLoss @@ -700,7 +700,8 @@ def parse_model(d, ch, verbose=True): # model_dict, input_channels(3) if m is HGBlock: args.insert(4, n) # number of repeats n = 1 - + elif m is ResNetLayer: + c2 = args[1] if args[3] else args[1] * 4 elif m is nn.BatchNorm2d: args = [ch[f]] elif m is Concat: