# Ultralytics YOLO 🚀, AGPL-3.0 license
"""
Train a model on a dataset.

Usage:
    $ yolo mode=train model=yolov8n.pt data=coco8.yaml imgsz=640 epochs=100 batch=16
"""
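
# A minimal Python usage sketch for this trainer (illustrative only; it assumes the standard
# Ultralytics entry points and a task-specific subclass such as DetectionTrainer):
#
#   from ultralytics.models.yolo.detect import DetectionTrainer
#
#   trainer = DetectionTrainer(overrides={"model": "yolov8n.pt", "data": "coco8.yaml", "epochs": 3})
#   trainer.train()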

import gc
import math
import os
import subprocess
import time
import warnings
from copy import deepcopy
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import torch
from torch import distributed as dist
from torch import nn, optim

from ultralytics.cfg import get_cfg, get_save_dir
from ultralytics.data.utils import check_cls_dataset, check_det_dataset
from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
from ultralytics.utils import (
    DEFAULT_CFG,
    LOGGER,
    RANK,
    TQDM,
    __version__,
    callbacks,
    clean_url,
    colorstr,
    emojis,
    yaml_save,
)
from ultralytics.utils.autobatch import check_train_batch_size
from ultralytics.utils.checks import check_amp, check_file, check_imgsz, check_model_file_from_stem, print_args
from ultralytics.utils.dist import ddp_cleanup, generate_ddp_command
from ultralytics.utils.files import get_latest_run
from ultralytics.utils.torch_utils import (
    EarlyStopping,
    ModelEMA,
    convert_optimizer_state_dict_to_fp16,
    init_seeds,
    one_cycle,
    select_device,
    strip_optimizer,
    torch_distributed_zero_first,
)
from ultralytics.nn.extra_modules.kernel_warehouse import get_temperature
from ultralytics.trainsdk import TrainSdk
from ultralytics.vinno_metrics.image_test.object_metrics import ObjectMetrics, YoloType, YoloMetas
from ultralytics.vinno_metrics.image_test.seg_metrics import YoloInstanceSegMetas, SegMetrics, ResizeMode


class BaseTrainer:
    """
    BaseTrainer.

    A base class for creating trainers.

    Attributes:
        args (SimpleNamespace): Configuration for the trainer.
        validator (BaseValidator): Validator instance.
        model (nn.Module): Model instance.
        callbacks (defaultdict): Dictionary of callbacks.
        save_dir (Path): Directory to save results.
        wdir (Path): Directory to save weights.
        last (Path): Path to the last checkpoint.
        best (Path): Path to the best checkpoint.
        save_period (int): Save checkpoint every x epochs (disabled if < 1).
        batch_size (int): Batch size for training.
        epochs (int): Number of epochs to train for.
        start_epoch (int): Starting epoch for training.
        device (torch.device): Device to use for training.
        amp (bool): Flag to enable AMP (Automatic Mixed Precision).
        scaler (amp.GradScaler): Gradient scaler for AMP.
        data (str): Path to data.
        trainset (torch.utils.data.Dataset): Training dataset.
        testset (torch.utils.data.Dataset): Testing dataset.
        ema (nn.Module): EMA (Exponential Moving Average) of the model.
        resume (bool): Resume training from a checkpoint.
        lf (callable): Learning-rate lambda function used by the scheduler.
        scheduler (torch.optim.lr_scheduler._LRScheduler): Learning rate scheduler.
        best_fitness (float): The best fitness value achieved.
        fitness (float): Current fitness value.
        loss (float): Current loss value.
        tloss (float): Total loss value.
        loss_names (list): List of loss names.
        csv (Path): Path to results CSV file.
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        Initializes the BaseTrainer class.

        Args:
            cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
            overrides (dict, optional): Configuration overrides. Defaults to None.
        """
        self.args = get_cfg(cfg, overrides)
        self.check_resume(overrides)
        self.device = select_device(self.args.device, self.args.batch)
        self.validator = None
        self.metrics = None
        self.plots = {}
        init_seeds(self.args.seed + 1 + RANK, deterministic=self.args.deterministic)

        # Dirs
        self.save_dir = get_save_dir(self.args)
        self.args.name = self.save_dir.name  # update name for loggers
        self.wdir = self.save_dir / "weights"  # weights dir
        if RANK in {-1, 0}:
            self.wdir.mkdir(parents=True, exist_ok=True)  # make dir
            self.args.save_dir = str(self.save_dir)
            yaml_save(self.save_dir / "args.yaml", vars(self.args))  # save run args
            if self.args.is_train_on_platform:
                TrainSdk.save_output_model(self.save_dir / "args.yaml")
        self.last, self.best = self.wdir / "last.pt", self.wdir / "best.pt"  # checkpoint paths
        self.save_period = self.args.save_period

        self.batch_size = self.args.batch
        self.epochs = self.args.epochs
        self.start_epoch = 0
        if RANK == -1:
            print_args(vars(self.args))

        # Device
        if self.device.type in {"cpu", "mps"}:
            self.args.workers = 0  # faster CPU training as time dominated by inference, not dataloading

        # Model and Dataset
        self.model = check_model_file_from_stem(self.args.model)  # add suffix, i.e. yolov8n -> yolov8n.pt
        with torch_distributed_zero_first(RANK):  # avoid auto-downloading dataset multiple times
            self.trainset, self.testset = self.get_dataset()
        self.ema = None

        # Optimization utils init
        self.lf = None
        self.scheduler = None

        # Epoch level metrics
        self.best_fitness = None
        self.fitness = None
        self.loss = None
        self.tloss = None
        self.loss_names = ["Loss"]
        self.csv = self.save_dir / "results.csv"
        self.plot_idx = [0, 1, 2]

        # HUB
        self.hub_session = None

        # Callbacks
        self.callbacks = _callbacks or callbacks.get_default_callbacks()
        if RANK in {-1, 0}:
            callbacks.add_integration_callbacks(self)

    def add_callback(self, event: str, callback):
        """Appends the given callback."""
        self.callbacks[event].append(callback)

    def set_callback(self, event: str, callback):
        """Overrides the existing callbacks with the given callback."""
        self.callbacks[event] = [callback]

    def run_callbacks(self, event: str):
        """Run all existing callbacks associated with a particular event."""
        for callback in self.callbacks.get(event, []):
            callback(self)
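
    # A small illustrative sketch of the callback hooks (hypothetical callback name and logic;
    # the events such as "on_train_epoch_end" are the ones fired via run_callbacks() below):
    #
    #   def log_epoch(trainer):
    #       print(f"epoch {trainer.epoch + 1} finished, fitness={trainer.fitness}")
    #
    #   trainer.add_callback("on_train_epoch_end", log_epoch)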

    def train(self):
        """Allow device='', device=None on Multi-GPU systems to default to device=0."""
        if isinstance(self.args.device, str) and len(self.args.device):  # i.e. device='0' or device='0,1,2,3'
            world_size = len(self.args.device.split(","))
        elif isinstance(self.args.device, (tuple, list)):  # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
            world_size = len(self.args.device)
        elif torch.cuda.is_available():  # i.e. device=None or device='' or device=number
            world_size = 1  # default to device 0
        else:  # i.e. device='cpu' or 'mps'
            world_size = 0

        # Run subprocess if DDP training, else train normally
        if world_size > 1 and "LOCAL_RANK" not in os.environ:
            # Argument checks
            if self.args.rect:
                LOGGER.warning("WARNING ⚠️ 'rect=True' is incompatible with Multi-GPU training, setting 'rect=False'")
                self.args.rect = False
            if self.args.batch < 1.0:
                LOGGER.warning(
                    "WARNING ⚠️ 'batch<1' for AutoBatch is incompatible with Multi-GPU training, setting "
                    "default 'batch=16'"
                )
                self.args.batch = 16

            # Command
            cmd, file = generate_ddp_command(world_size, self)
            try:
                LOGGER.info(f'{colorstr("DDP:")} debug command {" ".join(cmd)}')
                subprocess.run(cmd, check=True)
            except Exception as e:
                raise e
            finally:
                ddp_cleanup(self, str(file))

        else:
            self._do_train(world_size)

    def _setup_scheduler(self):
        """Initialize training learning rate scheduler."""
        if self.args.cos_lr:
            self.lf = one_cycle(1, self.args.lrf, self.epochs)  # cosine 1->hyp['lrf']
        else:
            self.lf = lambda x: max(1 - x / self.epochs, 0) * (1.0 - self.args.lrf) + self.args.lrf  # linear
        self.scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=self.lf)
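
    # Worked example of the linear schedule above (values assumed for illustration:
    # epochs=100, lrf=0.01, lr0=0.01):
    #   lf(0)   = 1.00              -> lr = 0.0100
    #   lf(50)  = 0.5*0.99 + 0.01   -> lr ≈ 0.00505
    #   lf(100) = 0.01              -> lr = 0.0001
    # The cosine branch (one_cycle) decays over the same 1 -> lrf range, just with a cosine shape.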

    def _setup_ddp(self, world_size):
        """Initializes and sets the DistributedDataParallel parameters for training."""
        torch.cuda.set_device(RANK)
        self.device = torch.device("cuda", RANK)
        # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
        os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
        dist.init_process_group(
            backend="nccl" if dist.is_nccl_available() else "gloo",
            timeout=timedelta(seconds=10800),  # 3 hours
            rank=RANK,
            world_size=world_size,
        )
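
    # Note: in the DDP path this method runs once per spawned process; RANK/LOCAL_RANK come from the
    # environment set by the launcher that generate_ddp_command() builds in train(). A sketch of the
    # roughly equivalent manual launch (script name assumed for illustration):
    #
    #   $ python -m torch.distributed.run --nproc_per_node 2 train_script.py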

    def _setup_train(self, world_size):
        """Builds dataloaders and optimizer on correct rank process."""
        # Model
        self.run_callbacks("on_pretrain_routine_start")
        ckpt = self.setup_model()
        self.model = self.model.to(self.device)
        self.set_model_attributes()

        # Freeze layers
        freeze_list = (
            self.args.freeze
            if isinstance(self.args.freeze, list)
            else range(self.args.freeze)
            if isinstance(self.args.freeze, int)
            else []
        )
        always_freeze_names = [".dfl"]  # always freeze these layers
        freeze_layer_names = [f"model.{x}." for x in freeze_list] + always_freeze_names
        for k, v in self.model.named_parameters():
            # v.register_hook(lambda x: torch.nan_to_num(x))  # NaN to 0 (commented for erratic training results)
            if any(x in k for x in freeze_layer_names):
                LOGGER.info(f"Freezing layer '{k}'")
                v.requires_grad = False
            # elif not v.requires_grad and v.dtype.is_floating_point:  # only floating point Tensor can require gradients
            #     LOGGER.info(
            #         f"WARNING ⚠️ setting 'requires_grad=True' for frozen layer '{k}'. "
            #         "See ultralytics.engine.trainer for customization of frozen layers."
            #     )
            #     v.requires_grad = True

        # Check AMP
        self.amp = torch.tensor(self.args.amp).to(self.device)  # True or False
        if self.amp and RANK in {-1, 0}:  # Single-GPU and DDP
            callbacks_backup = callbacks.default_callbacks.copy()  # backup callbacks as check_amp() resets them
            self.amp = torch.tensor(check_amp(self.model), device=self.device)
            callbacks.default_callbacks = callbacks_backup  # restore callbacks
        if RANK > -1 and world_size > 1:  # DDP
            dist.broadcast(self.amp, src=0)  # broadcast the tensor from rank 0 to all other ranks (returns None)
        self.amp = bool(self.amp)  # as boolean
        self.scaler = torch.cuda.amp.GradScaler(enabled=self.amp)
        if world_size > 1:
            self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK])

        # Check imgsz
        gs = max(int(self.model.stride.max() if hasattr(self.model, "stride") else 32), 32)  # grid size (max stride)
        self.args.imgsz = check_imgsz(self.args.imgsz, stride=gs, floor=gs, max_dim=1)
        self.stride = gs  # for multiscale training

        # Batch size
        if self.batch_size < 1 and RANK == -1:  # single-GPU only, estimate best batch size
            self.args.batch = self.batch_size = check_train_batch_size(
                model=self.model,
                imgsz=self.args.imgsz,
                amp=self.amp,
                batch=self.batch_size,
            )

        # Dataloaders
        batch_size = self.batch_size // max(world_size, 1)
        self.train_loader = self.get_dataloader(self.trainset, batch_size=batch_size, rank=RANK, mode="train")
        if RANK in {-1, 0}:
            # Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
            self.test_loader = self.get_dataloader(
                self.testset, batch_size=batch_size if self.args.task == "obb" else batch_size * 2, rank=-1, mode="val"
            )
            self.validator = self.get_validator()
            metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix="val")
            self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))
            self.ema = ModelEMA(self.model)
            if self.args.plots:
                self.plot_training_labels()

        # Optimizer
        self.accumulate = max(round(self.args.nbs / self.batch_size), 1)  # accumulate loss before optimizing
        weight_decay = self.args.weight_decay * self.batch_size * self.accumulate / self.args.nbs  # scale weight_decay
        iterations = math.ceil(len(self.train_loader.dataset) / max(self.batch_size, self.args.nbs)) * self.epochs
        self.optimizer = self.build_optimizer(
            model=self.model,
            name=self.args.optimizer,
            lr=self.args.lr0,
            momentum=self.args.momentum,
            decay=weight_decay,
            iterations=iterations,
        )
        # Scheduler
        self._setup_scheduler()
        self.stopper, self.stop = EarlyStopping(patience=self.args.patience), False
        self.resume_training(ckpt)
        self.scheduler.last_epoch = self.start_epoch - 1  # do not move
        self.run_callbacks("on_pretrain_routine_end")
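
    # Worked example of the gradient-accumulation math above (values assumed: nbs=64, batch=16,
    # weight_decay=0.0005): accumulate = round(64 / 16) = 4, so the optimizer steps every 4 batches
    # to emulate a nominal batch of 64, and weight_decay is rescaled to 0.0005 * 16 * 4 / 64 = 0.0005
    # (it only drifts from the default when batch does not divide nbs evenly).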

    def _do_train(self, world_size=1):
        """Run the training loop, then evaluate and plot at the end if specified by arguments."""
        if world_size > 1:
            self._setup_ddp(world_size)
        self._setup_train(world_size)

        nb = len(self.train_loader)  # number of batches
        nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1  # warmup iterations
        last_opt_step = -1
        self.epoch_time = None
        self.epoch_time_start = time.time()
        self.train_time_start = time.time()
        self.run_callbacks("on_train_start")
        LOGGER.info(
            f'Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n'
            f'Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n'
            f"Logging results to {colorstr('bold', self.save_dir)}\n"
            f'Starting training for ' + (f"{self.args.time} hours..." if self.args.time else f"{self.epochs} epochs...")
        )
        if self.args.close_mosaic:
            base_idx = (self.epochs - self.args.close_mosaic) * nb
            self.plot_idx.extend([base_idx, base_idx + 1, base_idx + 2])
        epoch = self.start_epoch
        self.optimizer.zero_grad()  # zero any resumed gradients to ensure stability on train start
        while True:
            self.epoch = epoch
            self.run_callbacks("on_train_epoch_start")
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")  # suppress 'Detected lr_scheduler.step() before optimizer.step()'
                self.scheduler.step()

            self.model.train()
            if RANK != -1:
                self.train_loader.sampler.set_epoch(epoch)
            pbar = enumerate(self.train_loader)
            # Update dataloader attributes (optional)
            if epoch == (self.epochs - self.args.close_mosaic):
                self._close_dataloader_mosaic()
                self.train_loader.reset()

            if RANK in {-1, 0}:
                LOGGER.info(self.progress_string())
                pbar = TQDM(enumerate(self.train_loader), total=nb)
            self.tloss = None
            for i, batch in pbar:
                self.run_callbacks("on_train_batch_start")
                # Warmup
                ni = i + nb * epoch
                if ni <= nw:
                    xi = [0, nw]  # x interp
                    self.accumulate = max(1, int(np.interp(ni, xi, [1, self.args.nbs / self.batch_size]).round()))
                    for j, x in enumerate(self.optimizer.param_groups):
                        # Bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                        x["lr"] = np.interp(
                            ni, xi, [self.args.warmup_bias_lr if j == 0 else 0.0, x["initial_lr"] * self.lf(epoch)]
                        )
                        if "momentum" in x:
                            x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])

                if hasattr(self.model, 'net_update_temperature'):
                    temp = get_temperature(i + 1, epoch, len(self.train_loader), temp_epoch=20, temp_init_value=1.0)
                    self.model.net_update_temperature(temp)

                # Forward
                with torch.cuda.amp.autocast(self.amp):
                    batch = self.preprocess_batch(batch)
                    self.loss, self.loss_items = self.model(batch)
                    if RANK != -1:
                        self.loss *= world_size
                    self.tloss = (
                        (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None else self.loss_items
                    )

                # Backward
                self.scaler.scale(self.loss).backward()

                # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
                if ni - last_opt_step >= self.accumulate:
                    self.optimizer_step()
                    last_opt_step = ni

                    # Timed stopping
                    if self.args.time:
                        self.stop = (time.time() - self.train_time_start) > (self.args.time * 3600)
                        if RANK != -1:  # if DDP training
                            broadcast_list = [self.stop if RANK == 0 else None]
                            dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
                            self.stop = broadcast_list[0]
                        if self.stop:  # training time exceeded
                            break

                # Log
                mem = f"{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G"  # (GB)
                loss_len = self.tloss.shape[0] if len(self.tloss.shape) else 1
                losses = self.tloss if loss_len > 1 else torch.unsqueeze(self.tloss, 0)
                if RANK in {-1, 0}:
                    pbar.set_description(
                        ("%11s" * 2 + "%11.4g" * (2 + loss_len))
                        % (f"{epoch + 1}/{self.epochs}", mem, *losses, batch["cls"].shape[0], batch["img"].shape[-1])
                    )
                    self.run_callbacks("on_batch_end")
                    if self.args.plots and ni in self.plot_idx:
                        self.plot_training_samples(batch, ni)

                self.run_callbacks("on_train_batch_end")

            self.lr = {f"lr/pg{ir}": x["lr"] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers
            self.run_callbacks("on_train_epoch_end")
            if RANK in {-1, 0}:
                final_epoch = epoch + 1 >= self.epochs
                self.ema.update_attr(self.model, include=["yaml", "nc", "args", "names", "stride", "class_weights"])

                # Validation
                if self.args.val or final_epoch or self.stopper.possible_stop or self.stop:
                    self.metrics, self.fitness = self.validate()
                self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
                self.stop |= self.stopper(epoch + 1, self.fitness) or final_epoch
                if self.args.time:
                    self.stop |= (time.time() - self.train_time_start) > (self.args.time * 3600)

                # Save model
                if self.args.save or final_epoch:
                    self.save_model()
                    self.run_callbacks("on_model_save")

            # Scheduler
            t = time.time()
            self.epoch_time = t - self.epoch_time_start
            self.epoch_time_start = t
            if self.args.time:
                mean_epoch_time = (t - self.train_time_start) / (epoch - self.start_epoch + 1)
                self.epochs = self.args.epochs = math.ceil(self.args.time * 3600 / mean_epoch_time)
                self._setup_scheduler()
                self.scheduler.last_epoch = self.epoch  # do not move
                self.stop |= epoch >= self.epochs  # stop if exceeded epochs
            self.run_callbacks("on_fit_epoch_end")
            gc.collect()
            torch.cuda.empty_cache()  # clear GPU memory at end of epoch, may help reduce CUDA out of memory errors

            # Early Stopping
            if RANK != -1:  # if DDP training
                broadcast_list = [self.stop if RANK == 0 else None]
                dist.broadcast_object_list(broadcast_list, 0)  # broadcast 'stop' to all ranks
                self.stop = broadcast_list[0]
            if self.stop:
                break  # must break all DDP ranks
            epoch += 1

        if RANK in {-1, 0}:
            # Do final val with best.pt
            LOGGER.info(
                f"\n{epoch - self.start_epoch + 1} epochs completed in "
                f"{(time.time() - self.train_time_start) / 3600:.3f} hours."
            )
            self.final_eval()
            if self.args.plots:
                self.plot_metrics()
            self.run_callbacks("on_train_end")
        gc.collect()
        torch.cuda.empty_cache()
        self.run_callbacks("teardown")
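
    # Worked example of the warmup interpolation in _do_train() (values assumed: warmup_epochs=3,
    # nb=100 batches/epoch so nw=300; nbs/batch=4; warmup_momentum=0.8, momentum=0.937):
    #   at ni=150 (halfway through warmup) np.interp gives accumulate ≈ round(2.5) = 2,
    #   momentum ≈ 0.869, the bias group's lr decays from warmup_bias_lr=0.1 toward lr0*lf(epoch),
    #   and all other groups ramp up from 0.0 toward lr0*lf(epoch).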

    def save_model(self):
        """Save model training checkpoints with additional metadata."""
        import io

        import pandas as pd  # scope for faster 'import ultralytics'

        # Serialize ckpt to a byte buffer once (faster than repeated torch.save() calls)
        buffer = io.BytesIO()
        torch.save(
            {
                "epoch": self.epoch,
                "best_fitness": self.best_fitness,
                "model": None,  # resume and final checkpoints derive from EMA
                "ema": deepcopy(self.ema.ema).half(),
                "updates": self.ema.updates,
                "optimizer": convert_optimizer_state_dict_to_fp16(deepcopy(self.optimizer.state_dict())),
                "train_args": vars(self.args),  # save as dict
                "train_metrics": {**self.metrics, **{"fitness": self.fitness}},
                "train_results": {k.strip(): v for k, v in pd.read_csv(self.csv).to_dict(orient="list").items()},
                "date": datetime.now().isoformat(),
                "version": __version__,
                "license": "AGPL-3.0 (https://ultralytics.com/license)",
                "docs": "https://docs.ultralytics.com",
            },
            buffer,
        )
        serialized_ckpt = buffer.getvalue()  # get the serialized content to save

        # Save checkpoints
        self.last.write_bytes(serialized_ckpt)  # save last.pt
        if self.args.is_train_on_platform:
            TrainSdk.save_output_model(self.last)
        if self.best_fitness == self.fitness:
            self.best.write_bytes(serialized_ckpt)  # save best.pt
            if self.args.is_train_on_platform:
                TrainSdk.save_output_model(self.best)
        if (self.save_period > 0) and (self.epoch > 0) and (self.epoch % self.save_period == 0):
            (self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt)  # save epoch, i.e. 'epoch3.pt'
        # Additionally compute evaluation metrics for the ONNX-exported model. For now this only
        # supports YOLOv8 and v9, i.e. models that still need NMS post-processing.
        # RT-DETR and YOLOv10 models (no post-processing) need extra handling after ONNX export
        # and are not supported yet.
        # if self.args.is_train_on_platform:
        #     TrainSdk.save_output_model(self.wdir / f"epoch{self.epoch}.pt")
        #     from ultralytics import YOLO
        #     model = YOLO(self.wdir / f"epoch{self.epoch}.pt")
        #     model.export(format="onnx")  # export the model to ONNX format
        #     onnx_model = self.wdir / f"epoch{self.epoch}.onnx"
        #     TrainSdk.save_output_model(onnx_model)
        #     self.platform_test_metrics(onnx_model)
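
    # A minimal sketch of reading one of these checkpoints back (illustrative; the path is assumed,
    # and the keys mirror the dict serialized in save_model() above):
    #
    #   ckpt = torch.load("runs/detect/train/weights/best.pt", map_location="cpu")
    #   ema_model = ckpt["ema"]          # fp16 EMA weights used for resume/final model
    #   train_args = ckpt["train_args"]  # dict of the run's configuration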

    def platform_test_metrics(self, onnx_model):
        """
        When training on the VINNO platform, export the model to ONNX, use the default resize mode and the C++
        post-processing to compute evaluation metrics.
        """
        if self.args.is_train_on_platform and self.data['platform_data_args']:
            platform_data_args = self.data["platform_data_args"]
            token = platform_data_args["token"]
            data_type = platform_data_args["data_type"]
            class_id_map_list = platform_data_args["class_id_map_list"]
            dll_file = platform_data_args["dll_file"]
            wrong_file = platform_data_args["wrong_file"]
            metrics_type = platform_data_args['metrics_type']
            extra_contours_args = platform_data_args['extra_contours_args']
            needed_image_results_dict = platform_data_args['needed_image_results_dict']
            needed_rois_dict = platform_data_args['needed_rois_dict']
            # The image files are not taken from the val/test dataset directly yet; platform training
            # already separates train and test, so the test data can be fetched directly.
            files = []
            image_count = TrainSdk.get_test_file_count(token)
            for i in range(image_count):
                files.append(str(i))
            if data_type == 'detection':
                yolo_metas_yaml = platform_data_args['yolo_metas']
                yolo_metas = YoloMetas(
                    yolotype=YoloType[yolo_metas_yaml['yolotype']],
                    confthres=yolo_metas_yaml['confthres'],
                    clsconfthres=yolo_metas_yaml['clsconfthres'],
                    batchsize=yolo_metas_yaml['batchsize'],
                    maxdet=yolo_metas_yaml['maxdet'],
                    minboxratio=yolo_metas_yaml['minboxratio'],
                    # Used in ApplyPostProcessToBBox to filter boxes of a single class
                    postprocesstopk=yolo_metas_yaml['postprocesstopk'],
                    enableioufilt=yolo_metas_yaml['enableioufilt'],
                    enableiosfilt=yolo_metas_yaml['enableiosfilt'],
                    ioufltth=yolo_metas_yaml['ioufltth'],
                    iosfltth=yolo_metas_yaml['iosfltth'],
                    # Used in FindBoxesToUnion: when two boxes overlap heavily and merging adds little
                    # extra area, the two boxes are merged
                    enableunion=yolo_metas_yaml['enableunion'],
                    unioniouth=yolo_metas_yaml['unioniouth'],
                    unioniosth=yolo_metas_yaml['unioniosth'],
                    unionuobth=yolo_metas_yaml['unionuobth'],
                    # Used in ApplyBoxClassFilter to filter boxes of different classes on the same image
                    enableioufiltdiffcls=yolo_metas_yaml['enableioufiltdiffcls'],
                    enableiosfiltdiffcls=yolo_metas_yaml['enableiosfiltdiffcls'],
                    ioufltthdiffcls=yolo_metas_yaml['ioufltthdiffcls'],
                    iosfltthdiffcls=yolo_metas_yaml['iosfltthdiffcls'],
                )
                for class_id_map in class_id_map_list:
                    metric_reports = ObjectMetrics(
                        is_local_file=False,
                        files=files,
                        token=token,
                        onnx_file=onnx_model,
                        needed_image_results_dict=needed_image_results_dict,
                        needed_rois_dict=needed_rois_dict,
                        extra_contours_args=extra_contours_args,
                        class_id_map=class_id_map,
                        wrong_file=wrong_file,
                        dll_file=dll_file,
                        yolo_metas=yolo_metas,
                        metrics_type=metrics_type,
                    )
                    metric_reports.run()
            elif data_type == 'segment':
                yolo_metas_yaml = platform_data_args['yolo_inst_seg_metas']
                resize_mode = ResizeMode[platform_data_args['resize_mode']]
                seg_post_process_param = platform_data_args['seg_post_process_param']
                yolo_inst_seg_metas = YoloInstanceSegMetas(
                    yolo_type=YoloType[yolo_metas_yaml['yolo_type']],
                    box_conf_thres=yolo_metas_yaml['box_conf_thres'],
                    cls_conf_thres=yolo_metas_yaml['cls_conf_thres'],
                    batch_size=yolo_metas_yaml['batch_size'],
                    max_det=yolo_metas_yaml['max_det'],
                    min_box_ratio=yolo_metas_yaml['min_box_ratio'],
                    post_process_top_k=yolo_metas_yaml['post_process_top_k'],
                    iou_fltth=yolo_metas_yaml['iou_fltth'],
                    ios_fltth=yolo_metas_yaml['ios_fltth'],
                    enable_iou_filt=yolo_metas_yaml['enable_iou_filt'],
                    enable_ios_filt=yolo_metas_yaml['enable_ios_filt'],
                    iou_fltth_diff_cls=yolo_metas_yaml['iou_fltth_diff_cls'],
                    ios_fltth_diff_cls=yolo_metas_yaml['ios_fltth_diff_cls'],
                    enable_iou_filt_diff_cls=yolo_metas_yaml['enable_iou_filt_diff_cls'],
                    enable_ios_filt_diff_cls=yolo_metas_yaml['enable_ios_filt_diff_cls'],
                    mask_thres=yolo_metas_yaml['mask_thres'],
                )
                for class_id_map in class_id_map_list:
                    metric_reports = SegMetrics(
                        is_local_file=False,
                        files=files,
                        token=token,
                        onnx_file=onnx_model,
                        resize_mode=resize_mode,
                        needed_image_results_dict=needed_image_results_dict,
                        needed_rois_dict=needed_rois_dict,
                        extra_contours_args=extra_contours_args,
                        class_id_map=class_id_map,
                        wrong_file=wrong_file,
                        dll_file=dll_file,
                        yolo_inst_seg_metas=yolo_inst_seg_metas,
                        seg_post_process_param=seg_post_process_param,
                        use_contour_for_iou=True,
                        metrics_type=metrics_type,
                    )
                    metric_reports.run()
            else:
                pass

    def get_dataset(self):
        """
        Get train and val paths from the data dict if it exists.

        Returns None if data format is not recognized.
        """
        try:
            if self.args.task == "classify":
                data = check_cls_dataset(self.args.data)
            elif self.args.data.split(".")[-1] in {"yaml", "yml"} or self.args.task in {
                "detect",
                "segment",
                "pose",
                "obb",
            }:
                data = check_det_dataset(self.args.data, self.args.is_train_on_platform)
                if "yaml_file" in data:
                    self.args.data = data["yaml_file"]  # for validating 'yolo train data=url.zip' usage
        except Exception as e:
            raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
        self.data = data
        # When training on the platform, train and val/test paths are not needed, so return empty strings
        if self.args.is_train_on_platform:
            return "", ""
        else:
            return data["train"], data.get("val") or data.get("test")

    def setup_model(self):
        """Load/create/download model for any task."""
        if isinstance(self.model, torch.nn.Module):  # if model is loaded beforehand. No setup needed
            return

        cfg, weights = self.model, None
        ckpt = None
        if str(self.model).endswith(".pt"):
            weights, ckpt = attempt_load_one_weight(self.model)
            cfg = weights.yaml
        elif isinstance(self.args.pretrained, (str, Path)):
            weights, _ = attempt_load_one_weight(self.args.pretrained)
        self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1)  # calls Model(cfg, weights)
        return ckpt

    def optimizer_step(self):
        """Perform a single step of the training optimizer with gradient clipping and EMA update."""
        self.scaler.unscale_(self.optimizer)  # unscale gradients
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)  # clip gradients
        self.scaler.step(self.optimizer)
        self.scaler.update()
        self.optimizer.zero_grad()
        if self.ema:
            self.ema.update(self.model)
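
    # Note on the order above: gradients are unscaled before clip_grad_norm_ so the max_norm=10.0
    # threshold applies to the true gradient magnitudes rather than the AMP-scaled ones; this follows
    # the standard torch.cuda.amp gradient-clipping recipe (unscale_ -> clip -> scaler.step -> update).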

    def preprocess_batch(self, batch):
        """Allows custom preprocessing model inputs and ground truths depending on task type."""
        return batch

    def validate(self):
        """
        Runs validation on test set using self.validator.

        The returned dict is expected to contain "fitness" key.
        """
        metrics = self.validator(self)
        fitness = metrics.pop("fitness", -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
        if not self.best_fitness or self.best_fitness < fitness:
            self.best_fitness = fitness
        return metrics, fitness

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Get model and raise NotImplementedError for loading cfg files."""
        raise NotImplementedError("This task trainer doesn't support loading cfg files")

    def get_validator(self):
        """Returns a NotImplementedError when the get_validator function is called."""
        raise NotImplementedError("get_validator function not implemented in trainer")

    def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode="train"):
        """Returns dataloader derived from torch.data.Dataloader."""
        raise NotImplementedError("get_dataloader function not implemented in trainer")

    def build_dataset(self, img_path, mode="train", batch=None):
        """Build dataset."""
        raise NotImplementedError("build_dataset function not implemented in trainer")
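
    # A minimal subclassing sketch (hypothetical names; a real task trainer such as
    # ultralytics.models.yolo.detect.DetectionTrainer overrides the hooks above):
    #
    #   class MyTrainer(BaseTrainer):
    #       def get_model(self, cfg=None, weights=None, verbose=True):
    #           return MyModel(cfg)  # build the nn.Module for this task
    #
    #       def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode="train"):
    #           return build_my_dataloader(dataset_path, batch_size, mode)
    #
    #       def get_validator(self):
    #           return MyValidator(self.test_loader, save_dir=self.save_dir, args=self.args)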

    def label_loss_items(self, loss_items=None, prefix="train"):
        """
        Returns a loss dict with labelled training loss items tensor.

        Note:
            This is not needed for classification but necessary for segmentation & detection
        """
        return {"loss": loss_items} if loss_items is not None else ["loss"]

    def set_model_attributes(self):
        """To set or update model parameters before training."""
        self.model.names = self.data["names"]

    def build_targets(self, preds, targets):
        """Builds target tensors for training YOLO model."""
        pass

    def progress_string(self):
        """Returns a string describing training progress."""
        return ""

    # TODO: may need to put these following functions into callback
    def plot_training_samples(self, batch, ni):
        """Plots training samples during YOLO training."""
        pass

    def plot_training_labels(self):
        """Plots training labels for YOLO model."""
        pass

    def save_metrics(self, metrics):
        """Saves training metrics to a CSV file."""
        keys, vals = list(metrics.keys()), list(metrics.values())
        n = len(metrics) + 1  # number of cols
        s = "" if self.csv.exists() else (("%23s," * n % tuple(["epoch"] + keys)).rstrip(",") + "\n")  # header
        with open(self.csv, "a") as f:
            f.write(s + ("%23.5g," * n % tuple([self.epoch + 1] + vals)).rstrip(",") + "\n")
        if self.args.is_train_on_platform:
            TrainSdk.save_output_model(self.csv)
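
    # Illustrative shape of the resulting results.csv (column names depend on the task's metrics and
    # the numbers below are made up): one header row, then one row per epoch, each value padded to a
    # 23-character column, e.g.
    #
    #   epoch, train/box_loss, ..., metrics/mAP50(B), lr/pg0
    #       1,         1.2345, ...,           0.5210, 0.00331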

    def plot_metrics(self):
        """Plot and display metrics visually."""
        pass

    def on_plot(self, name, data=None):
        """Registers plots (e.g. to be consumed in callbacks)."""
        path = Path(name)
        self.plots[path] = {"data": data, "timestamp": time.time()}

    def final_eval(self):
        """Performs final evaluation and validation for object detection YOLO model."""
        for f in self.last, self.best:
            if f.exists():
                strip_optimizer(f)  # strip optimizers
                if f is self.best:
                    LOGGER.info(f"\nValidating {f}...")
                    self.validator.args.plots = self.args.plots
                    self.metrics = self.validator(model=f)
                    self.metrics.pop("fitness", None)
                    self.run_callbacks("on_fit_epoch_end")

    def check_resume(self, overrides):
        """Check if resume checkpoint exists and update arguments accordingly."""
        resume = self.args.resume
        if resume:
            try:
                exists = isinstance(resume, (str, Path)) and Path(resume).exists()
                last = Path(check_file(resume) if exists else get_latest_run())

                # Check that resume data YAML exists, otherwise strip to force re-download of dataset
                ckpt_args = attempt_load_weights(last).args
                if not Path(ckpt_args["data"]).exists():
                    ckpt_args["data"] = self.args.data

                resume = True
                self.args = get_cfg(ckpt_args)
                self.args.model = self.args.resume = str(last)  # reinstate model
                for k in "imgsz", "batch", "device":  # allow arg updates to reduce memory or update device on resume
                    if k in overrides:
                        setattr(self.args, k, overrides[k])

            except Exception as e:
                raise FileNotFoundError(
                    "Resume checkpoint not found. Please pass a valid checkpoint to resume from, "
                    "i.e. 'yolo train resume model=path/to/last.pt'"
                ) from e
        self.resume = resume

    def resume_training(self, ckpt):
        """Resume YOLO training from given epoch and best fitness."""
        if ckpt is None or not self.resume:
            return
        best_fitness = 0.0
        start_epoch = ckpt.get("epoch", -1) + 1
        if ckpt.get("optimizer", None) is not None:
            self.optimizer.load_state_dict(ckpt["optimizer"])  # optimizer
            best_fitness = ckpt["best_fitness"]
        if self.ema and ckpt.get("ema"):
            self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict())  # EMA
            self.ema.updates = ckpt["updates"]
        assert start_epoch > 0, (
            f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
            f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
        )
        LOGGER.info(f"Resuming training {self.args.model} from epoch {start_epoch + 1} to {self.epochs} total epochs")
        if self.epochs < start_epoch:
            LOGGER.info(
                f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs."
            )
            self.epochs += ckpt["epoch"]  # finetune additional epochs
        self.best_fitness = best_fitness
        self.start_epoch = start_epoch
        if start_epoch > (self.epochs - self.args.close_mosaic):
            self._close_dataloader_mosaic()
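
    # Resuming in practice (illustrative; matches the CLI hint logged above):
    #
    #   $ yolo train resume model=path/to/last.pt
    #
    # or from Python, assuming the standard YOLO wrapper:
    #
    #   from ultralytics import YOLO
    #   YOLO("path/to/last.pt").train(resume=True)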

    def _close_dataloader_mosaic(self):
        """Update dataloaders to stop using mosaic augmentation."""
        if hasattr(self.train_loader.dataset, "mosaic"):
            self.train_loader.dataset.mosaic = False
        if hasattr(self.train_loader.dataset, "close_mosaic"):
            LOGGER.info("Closing dataloader mosaic")
            self.train_loader.dataset.close_mosaic(hyp=self.args)

    def build_optimizer(self, model, name="auto", lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
        """
        Constructs an optimizer for the given model, based on the specified optimizer name, learning rate, momentum,
        weight decay, and number of iterations.

        Args:
            model (torch.nn.Module): The model for which to build an optimizer.
            name (str, optional): The name of the optimizer to use. If 'auto', the optimizer is selected
                based on the number of iterations. Default: 'auto'.
            lr (float, optional): The learning rate for the optimizer. Default: 0.001.
            momentum (float, optional): The momentum factor for the optimizer. Default: 0.9.
            decay (float, optional): The weight decay for the optimizer. Default: 1e-5.
            iterations (float, optional): The number of iterations, which determines the optimizer if
                name is 'auto'. Default: 1e5.

        Returns:
            (torch.optim.Optimizer): The constructed optimizer.
        """
        g = [], [], []  # optimizer parameter groups
        bn = tuple(v for k, v in nn.__dict__.items() if "Norm" in k)  # normalization layers, i.e. BatchNorm2d()
        if name == "auto":
            LOGGER.info(
                f"{colorstr('optimizer:')} 'optimizer=auto' found, "
                f"ignoring 'lr0={self.args.lr0}' and 'momentum={self.args.momentum}' and "
                f"determining best 'optimizer', 'lr0' and 'momentum' automatically... "
            )
            nc = getattr(model, "nc", 10)  # number of classes
            lr_fit = round(0.002 * 5 / (4 + nc), 6)  # lr0 fit equation to 6 decimal places
            name, lr, momentum = ("SGD", 0.01, 0.9) if iterations > 10000 else ("AdamW", lr_fit, 0.9)
            self.args.warmup_bias_lr = 0.0  # no higher than 0.01 for Adam

        for module_name, module in model.named_modules():
            for param_name, param in module.named_parameters(recurse=False):
                fullname = f"{module_name}.{param_name}" if module_name else param_name
                if "bias" in fullname:  # bias (no decay)
                    g[2].append(param)
                elif isinstance(module, bn):  # weight (no decay)
                    g[1].append(param)
                else:  # weight (with decay)
                    g[0].append(param)

        if name in {"Adam", "Adamax", "AdamW", "NAdam", "RAdam"}:
            optimizer = getattr(optim, name, optim.Adam)(g[2], lr=lr, betas=(momentum, 0.999), weight_decay=0.0)
        elif name == "RMSProp":
            optimizer = optim.RMSprop(g[2], lr=lr, momentum=momentum)
        elif name == "SGD":
            optimizer = optim.SGD(g[2], lr=lr, momentum=momentum, nesterov=True)
        else:
            raise NotImplementedError(
                f"Optimizer '{name}' not found in list of available optimizers "
                f"[Adam, AdamW, NAdam, RAdam, RMSProp, SGD, auto]. "
                "To request support for additional optimizers please visit https://github.com/ultralytics/ultralytics."
            )

        optimizer.add_param_group({"params": g[0], "weight_decay": decay})  # add g0 with weight_decay
        optimizer.add_param_group({"params": g[1], "weight_decay": 0.0})  # add g1 (BatchNorm2d weights)
        LOGGER.info(
            f"{colorstr('optimizer:')} {type(optimizer).__name__}(lr={lr}, momentum={momentum}) with parameter groups "
            f'{len(g[1])} weight(decay=0.0), {len(g[0])} weight(decay={decay}), {len(g[2])} bias(decay=0.0)'
        )
        return optimizer
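
    # Worked example of the 'auto' rule above (assumed values): with nc=80 classes,
    # lr_fit = round(0.002 * 5 / (4 + 80), 6) = 0.000119, so a short run (iterations <= 10000)
    # gets AdamW(lr=0.000119, momentum=0.9) while a longer run gets SGD(lr=0.01, momentum=0.9).
    # In both cases parameters land in three groups: weights with decay (g[0]),
    # normalization-layer weights without decay (g[1]), and biases without decay (g[2]).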