From 6367ff47484d176759e16e8834fc85fb7ff73dd6 Mon Sep 17 00:00:00 2001
From: Paula Derrenger <107626595+pderrenger@users.noreply.github.com>
Date: Sun, 9 Jun 2024 17:38:05 +0200
Subject: [PATCH] Code Refactor for Speed and Readability (#13450)

Signed-off-by: Glenn Jocher
Co-authored-by: Glenn Jocher
---
 ultralytics/data/split_dota.py            |  9 ++--
 ultralytics/engine/predictor.py           | 12 ++++--
 .../models/sam/modules/tiny_encoder.py    | 42 +++++++++----------
 3 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/ultralytics/data/split_dota.py b/ultralytics/data/split_dota.py
index f0a85d91f..e9cfc686f 100644
--- a/ultralytics/data/split_dota.py
+++ b/ultralytics/data/split_dota.py
@@ -86,7 +86,7 @@ def load_yolo_dota(data_root, split="train"):
     return annos


-def get_windows(im_size, crop_sizes=[1024], gaps=[200], im_rate_thr=0.6, eps=0.01):
+def get_windows(im_size, crop_sizes=(1024,), gaps=(200,), im_rate_thr=0.6, eps=0.01):
     """
     Get the coordinates of windows.

@@ -95,6 +95,7 @@ def get_windows(im_size, crop_sizes=[1024], gaps=[200], im_rate_thr=0.6, eps=0.0
         crop_sizes (List(int)): Crop size of windows.
         gaps (List(int)): Gap between crops.
         im_rate_thr (float): Threshold of windows areas divided by image ares.
+        eps (float): Epsilon value for math operations.
     """
     h, w = im_size
     windows = []
@@ -187,7 +188,7 @@ def crop_and_save(anno, windows, window_objs, im_dir, lb_dir):
             f.write(f"{int(lb[0])} {' '.join(formatted_coords)}\n")


-def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=[1024], gaps=[200]):
+def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=(1024,), gaps=(200,)):
     """
     Split both images and labels.

@@ -217,7 +218,7 @@ def split_images_and_labels(data_root, save_dir, split="train", crop_sizes=[1024
         crop_and_save(anno, windows, window_objs, str(im_dir), str(lb_dir))


-def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
+def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=(1.0,)):
     """
     Split train and val set of DOTA.

@@ -247,7 +248,7 @@ def split_trainval(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
         split_images_and_labels(data_root, save_dir, split, crop_sizes, gaps)


-def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=[1.0]):
+def split_test(data_root, save_dir, crop_size=1024, gap=200, rates=(1.0,)):
     """
     Split test set of DOTA, labels are not included within this set.

diff --git a/ultralytics/engine/predictor.py b/ultralytics/engine/predictor.py
index ad261eac8..8597a60a2 100644
--- a/ultralytics/engine/predictor.py
+++ b/ultralytics/engine/predictor.py
@@ -169,12 +169,18 @@ class BasePredictor:

     def predict_cli(self, source=None, model=None):
         """
-        Method used for CLI prediction.
+        Method used for Command Line Interface (CLI) prediction.

-        It uses always generator as outputs as not required by CLI mode.
+        This function is designed to run predictions using the CLI. It sets up the source and model, then processes
+        the inputs in a streaming manner. This method ensures that no outputs accumulate in memory by consuming the
+        generator without storing results.
+
+        Note:
+            Do not modify this function or remove the generator. The generator ensures that no outputs are
+            accumulated in memory, which is critical for preventing memory issues during long-running predictions.
""" gen = self.stream_inference(source, model) - for _ in gen: # noqa, running CLI inference without accumulating any outputs (do not modify) + for _ in gen: # sourcery skip: remove-empty-nested-block, noqa pass def setup_source(self, source): diff --git a/ultralytics/models/sam/modules/tiny_encoder.py b/ultralytics/models/sam/modules/tiny_encoder.py index 28b83f130..c56282e16 100644 --- a/ultralytics/models/sam/modules/tiny_encoder.py +++ b/ultralytics/models/sam/modules/tiny_encoder.py @@ -383,44 +383,44 @@ class TinyViTBlock(nn.Module): """Applies attention-based transformation or padding to input 'x' before passing it through a local convolution. """ - H, W = self.input_resolution - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" + h, w = self.input_resolution + b, l, c = x.shape + assert l == h * w, "input feature has wrong size" res_x = x - if H == self.window_size and W == self.window_size: + if h == self.window_size and w == self.window_size: x = self.attn(x) else: - x = x.view(B, H, W, C) - pad_b = (self.window_size - H % self.window_size) % self.window_size - pad_r = (self.window_size - W % self.window_size) % self.window_size + x = x.view(b, h, w, c) + pad_b = (self.window_size - h % self.window_size) % self.window_size + pad_r = (self.window_size - w % self.window_size) % self.window_size padding = pad_b > 0 or pad_r > 0 if padding: x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b)) - pH, pW = H + pad_b, W + pad_r + pH, pW = h + pad_b, w + pad_r nH = pH // self.window_size nW = pW // self.window_size # Window partition x = ( - x.view(B, nH, self.window_size, nW, self.window_size, C) + x.view(b, nH, self.window_size, nW, self.window_size, c) .transpose(2, 3) - .reshape(B * nH * nW, self.window_size * self.window_size, C) + .reshape(b * nH * nW, self.window_size * self.window_size, c) ) x = self.attn(x) # Window reverse - x = x.view(B, nH, nW, self.window_size, self.window_size, C).transpose(2, 3).reshape(B, pH, pW, C) + x = x.view(b, nH, nW, self.window_size, self.window_size, c).transpose(2, 3).reshape(b, pH, pW, c) if padding: - x = x[:, :H, :W].contiguous() + x = x[:, :h, :w].contiguous() - x = x.view(B, L, C) + x = x.view(b, l, c) x = res_x + self.drop_path(x) - x = x.transpose(1, 2).reshape(B, C, H, W) + x = x.transpose(1, 2).reshape(b, c, h, w) x = self.local_conv(x) - x = x.view(B, C, L).transpose(1, 2) + x = x.view(b, c, l).transpose(1, 2) return x + self.drop_path(self.mlp(x)) @@ -565,10 +565,10 @@ class TinyViT(nn.Module): img_size=224, in_chans=3, num_classes=1000, - embed_dims=[96, 192, 384, 768], - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_sizes=[7, 7, 14, 7], + embed_dims=(96, 192, 384, 768), + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + window_sizes=(7, 7, 14, 7), mlp_ratio=4.0, drop_rate=0.0, drop_path_rate=0.1, @@ -732,8 +732,8 @@ class TinyViT(nn.Module): for i in range(start_i, len(self.layers)): layer = self.layers[i] x = layer(x) - B, _, C = x.shape - x = x.view(B, 64, 64, C) + batch, _, channel = x.shape + x = x.view(batch, 64, 64, channel) x = x.permute(0, 3, 1, 2) return self.neck(x)