HUB setup (#108)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Glenn Jocher <glenn.jocher@ultralytics.com>pull/128/head
parent
c6eb6720de
commit
2bc9a5c87e
16 changed files with 632 additions and 123 deletions
@ -0,0 +1,69 @@ |
||||
import requests |
||||
|
||||
from ultralytics.hub.config import HUB_API_ROOT |
||||
from ultralytics.hub.utils import request_with_credentials |
||||
from ultralytics.yolo.utils import is_colab |
||||
|
||||
API_KEY_PATH = "https://hub.ultralytics.com/settings?tab=api+keys" |
||||
|
||||
|
||||
class Auth:
    """Handles authentication against the Ultralytics HUB API.

    Credentials are held as attributes (class-level defaults are False):
        id_token:  browser-derived token from the Colab cookie flow, or False.
        api_key:   user API key (see API_KEY_PATH settings page), or False.
        model_key: currently unused placeholder, or False.
    """
    id_token = api_key = model_key = False

    def __init__(self, api_key=None):
        """Authenticate with *api_key* if provided, otherwise fall back to the
        browser-cookie flow (Colab only)."""
        self.api_key = self._clean_api_key(api_key)
        self.authenticate() if self.api_key else self.auth_with_cookies()

    @staticmethod
    def _clean_api_key(key: str) -> str:
        """Strip model from key if present (keys may arrive as 'key_model')."""
        if not key:
            # None/empty passes through unchanged; previously `separator in key`
            # raised TypeError for the default api_key=None.
            return key
        separator = "_"
        return key.split(separator)[0] if separator in key else key

    def authenticate(self) -> bool:
        """Attempt to authenticate with server; resets credentials on failure."""
        try:
            header = self.get_auth_header()
            if header:
                r = requests.post(f"{HUB_API_ROOT}/v1/auth", headers=header)
                if not r.json().get('success', False):
                    raise ConnectionError("Unable to authenticate.")
                return True
            raise ConnectionError("User has not authenticated locally.")
        except ConnectionError:
            self.id_token = self.api_key = False  # reset invalid
            return False

    def auth_with_cookies(self) -> bool:
        """
        Attempt to fetch authentication via cookies and set id_token.
        User must be logged in to HUB and running in a supported browser.
        """
        if not is_colab():
            return False  # Currently only works with Colab
        try:
            authn = request_with_credentials(f"{HUB_API_ROOT}/v1/auth/auto")
            if authn.get("success", False):
                self.id_token = authn.get("data", {}).get("idToken", None)
                self.authenticate()
                return True
            raise ConnectionError("Unable to fetch browser authentication details.")
        except ConnectionError:
            self.id_token = False  # reset invalid
            return False

    def get_auth_header(self):
        """Return the HTTP header for the active credential, or None if unauthenticated.

        id_token (bearer token) takes precedence over api_key.
        """
        if self.id_token:
            return {"authorization": f"Bearer {self.id_token}"}
        elif self.api_key:
            return {"x-api-key": self.api_key}
        else:
            return None

    def get_state(self) -> bool:
        """Get the authentication state (truthy if any credential is set)."""
        return self.id_token or self.api_key

    def set_api_key(self, key: str):
        """Set the API key directly."""  # was a copy-pasted "Get the authentication state" docstring
        self.api_key = key
@ -0,0 +1,12 @@ |
||||
import os

# Global variables
REPO_URL = "https://github.com/ultralytics/yolov5.git"
REPO_BRANCH = "ultralytics/HUB"  # "master"

# Choose the HUB API endpoint from the ULTRALYTICS_ENV environment variable;
# anything other than 'production' points at a local development server.
ENVIRONMENT = os.environ.get("ULTRALYTICS_ENV", "production")
_IS_PRODUCTION = ENVIRONMENT == 'production'
HUB_API_ROOT = "https://api.ultralytics.com" if _IS_PRODUCTION else "http://127.0.0.1:8000"
if not _IS_PRODUCTION:
    print(f'Connected to development server on {HUB_API_ROOT}')
@ -0,0 +1,121 @@ |
||||
import signal |
||||
import sys |
||||
from pathlib import Path |
||||
from time import sleep |
||||
|
||||
import requests |
||||
|
||||
from ultralytics import __version__ |
||||
from ultralytics.hub.config import HUB_API_ROOT |
||||
from ultralytics.hub.utils import check_dataset_disk_space, smart_request |
||||
from ultralytics.yolo.utils import LOGGER, is_colab, threaded |
||||
|
||||
# Agent identifier reported to HUB heartbeats; distinguishes Colab from local runs.
AGENT_NAME = f'python-{__version__}-colab' if is_colab() else f'python-{__version__}-local'

# Module-level handle to the active training session, read by signal_handler.
# NOTE(review): never assigned in this file — presumably set by the code that
# creates a HubTrainingSession; confirm against the caller.
session = None
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Handle SIGTERM/SIGINT: stop the session's heartbeat loop, then exit.

    Args:
        signum: signal number delivered by the OS.
        frame: interrupted stack frame (only logged).
    """
    LOGGER.info(f'Signal received. {signum} {frame}')
    if isinstance(session, HubTrainingSession):
        # Stop the _heartbeats while-loop so the daemon thread winds down.
        # (Previously this manipulated an undefined global 'hub_logger',
        # which raised NameError whenever a session was active.)
        session.alive = False
    sys.exit(signum)
||||
|
||||
|
||||
# Install the handler so kill/Ctrl-C stop heartbeats before the process exits.
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
||||
|
||||
|
||||
class HubTrainingSession:
    """One model's live training session with Ultralytics HUB.

    Fetches the model record on construction, then uploads metrics and
    checkpoints via `smart_request` and keeps a rate-limited heartbeat running
    for as long as `self.alive` is True.
    """

    def __init__(self, model_id, auth):
        """
        Args:
            model_id: HUB model identifier used to build the API URLs.
            auth: Auth instance supplying the request header (id_token or api_key).
        """
        self.agent_id = None  # identifies which instance is communicating with server
        self.model_id = model_id
        self.api_url = f'{HUB_API_ROOT}/v1/models/{model_id}'
        self.auth_header = auth.get_auth_header()
        self.rate_limits = {'metrics': 3.0, 'ckpt': 900.0, 'heartbeat': 300.0}  # rate limits (seconds)
        self.t = {}  # rate limit timers (seconds)
        self.metrics_queue = {}  # metrics queue
        self.alive = True  # for heartbeats
        self.model = self._get_model()
        self._heartbeats()  # start heartbeats

    def __del__(self):
        # Class destructor: stop the heartbeat loop when the session is collected
        self.alive = False

    def upload_metrics(self):
        """POST the queued metrics to the model endpoint."""
        payload = {"metrics": self.metrics_queue.copy(), "type": "metrics"}
        smart_request(f'{self.api_url}', json=payload, headers=self.auth_header, code=2)

    def upload_model(self, epoch, weights, is_best=False, map=0.0, final=False):
        # Upload a model to HUB
        # NOTE(review): parameter `map` shadows the builtin; kept because callers
        # pass it by keyword (map=...).
        file = None
        if Path(weights).is_file():
            with open(weights, "rb") as f:
                file = f.read()
        if final:
            # Final upload: includes mAP, generous retry/timeout for large files
            smart_request(f'{self.api_url}/upload',
                          data={
                              "epoch": epoch,
                              "type": "final",
                              "map": map},
                          files={"best.pt": file},
                          headers=self.auth_header,
                          retry=10,
                          timeout=3600,
                          code=4)
        else:
            # Intermediate epoch checkpoint
            smart_request(f'{self.api_url}/upload',
                          data={
                              "epoch": epoch,
                              "type": "epoch",
                              "isBest": bool(is_best)},
                          headers=self.auth_header,
                          files={"last.pt": file},
                          code=3)

    def _get_model(self):
        # Returns model from database by id
        api_url = f"{HUB_API_ROOT}/v1/models/{self.model_id}"
        headers = self.auth_header

        try:
            r = smart_request(api_url, method="get", headers=headers, thread=False, code=0)
            data = r.json().get("data", None)
            if not data:
                return
            # NOTE(review): assert is stripped under `python -O`; this check would
            # then silently pass — consider an explicit raise.
            assert data['data'], 'ERROR: Dataset may still be processing. Please wait a minute and try again.'  # RF fix
            self.model_id = data["id"]

            return data
        except requests.exceptions.ConnectionError as e:
            raise ConnectionRefusedError('ERROR: The HUB server is not online. Please try again later.') from e

    def check_disk_space(self):
        """Raise MemoryError if there is not enough disk space for the dataset."""
        if not check_dataset_disk_space(self.model['data']):
            raise MemoryError("Not enough disk space")

    # COMMENT: Should not be needed as HUB is now considered an integration and is in integrations_callbacks
    # import ultralytics.yolo.utils.callbacks.hub as hub_callbacks
    # @staticmethod
    # def register_callbacks(trainer):
    #     for k, v in hub_callbacks.callbacks.items():
    #         trainer.add_callback(k, v)

    @threaded
    def _heartbeats(self):
        # Periodic heartbeat loop; @threaded presumably runs this in a
        # background thread — confirm against ultralytics.yolo.utils.threaded.
        while self.alive:
            r = smart_request(f'{HUB_API_ROOT}/v1/agent/heartbeat/models/{self.model_id}',
                              json={
                                  "agent": AGENT_NAME,
                                  "agentId": self.agent_id},
                              headers=self.auth_header,
                              retry=0,
                              code=5,
                              thread=False)
            # Server assigns/echoes an agent id; keep it for subsequent beats
            self.agent_id = r.json().get('data', {}).get('agentId', None)
            sleep(self.rate_limits['heartbeat'])
@ -0,0 +1,80 @@ |
||||
import json |
||||
from time import time |
||||
|
||||
import torch |
||||
|
||||
from ultralytics.hub.utils import PREFIX, sync_analytics |
||||
from ultralytics.yolo.utils import LOGGER |
||||
|
||||
|
||||
def on_pretrain_routine_end(trainer):
    """Log the model's HUB URL and start the upload rate-limit timers."""
    session = getattr(trainer, 'hub_session', None)
    if not session:
        return
    LOGGER.info(f"{PREFIX}View model at https://hub.ultralytics.com/models/{session.model_id} 🚀")
    session.t = {'metrics': time(), 'ckpt': time()}  # start timer on self.rate_limit
||||
|
||||
|
||||
def on_fit_epoch_end(trainer):
    """Queue this epoch's metrics as JSON and upload them, rate-limited."""
    session = getattr(trainer, 'hub_session', None)
    if not session:
        return
    # Scalars only: convert any tensor metrics in place before serializing
    metrics = trainer.metrics
    for key, value in metrics.items():
        if isinstance(value, torch.Tensor):
            metrics[key] = value.item()

    session.metrics_queue[trainer.epoch] = json.dumps(metrics)  # json string
    elapsed = time() - session.t['metrics']
    if elapsed > session.rate_limits['metrics']:
        session.upload_metrics()
        session.t['metrics'] = time()  # reset timer
        session.metrics_queue = {}  # reset queue
||||
|
||||
|
||||
def on_model_save(trainer):
    """Upload the latest checkpoint to HUB, honoring the 'ckpt' rate limit."""
    session = getattr(trainer, 'hub_session', None)
    if not session:
        return
    # Best-so-far flag travels with the checkpoint
    is_best = trainer.best_fitness == trainer.fitness
    if time() - session.t['ckpt'] > session.rate_limits['ckpt']:
        LOGGER.info(f"{PREFIX}Uploading checkpoint {session.model_id}")
        session.upload_model(trainer.epoch, trainer.last, is_best)
        session.t['ckpt'] = time()  # reset timer
||||
|
||||
|
||||
def on_train_end(trainer):
    """Upload the final best model with its mAP, then stop heartbeats."""
    session = getattr(trainer, 'hub_session', None)
    if not session:
        return
    LOGGER.info(f"{PREFIX}Training completed successfully ✅\n"
                f"{PREFIX}Uploading final {session.model_id}")
    session.upload_model(trainer.epoch, trainer.best, map=trainer.metrics['metrics/mAP50(B)'], final=True)
    session.alive = False  # stop heartbeats
    LOGGER.info(f"{PREFIX}View model at https://hub.ultralytics.com/models/{session.model_id} 🚀")
||||
|
||||
|
||||
def on_train_start(trainer):
    # Forward the run's args to HUB analytics at training start
    sync_analytics(trainer.args)


def on_val_start(validator):
    # Forward the run's args to HUB analytics at validation start
    sync_analytics(validator.args)


def on_predict_start(predictor):
    # Forward the run's args to HUB analytics at prediction start
    sync_analytics(predictor.args)


def on_export_start(exporter):
    # Forward the run's args to HUB analytics at export start
    sync_analytics(exporter.args)
||||
|
||||
|
||||
# Event-name → handler mapping exported by this module; presumably consumed by
# the engine's callback registration (confirm against the trainer integration).
callbacks = {
    "on_pretrain_routine_end": on_pretrain_routine_end,
    "on_fit_epoch_end": on_fit_epoch_end,
    "on_model_save": on_model_save,
    "on_train_end": on_train_end,
    "on_train_start": on_train_start,
    "on_val_start": on_val_start,
    "on_predict_start": on_predict_start,
    "on_export_start": on_export_start}
Loading…
Reference in new issue