mlflow.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137
  1. # Ultralytics YOLO 🚀, AGPL-3.0 license
  2. """
  3. MLflow Logging for Ultralytics YOLO.
  4. This module enables MLflow logging for Ultralytics YOLO. It logs metrics, parameters, and model artifacts.
  5. For setting up, a tracking URI should be specified. The logging can be customized using environment variables.
  6. Commands:
  7. 1. To set a project name:
  8. `export MLFLOW_EXPERIMENT_NAME=<your_experiment_name>` or use the project=<project> argument
  9. 2. To set a run name:
  10. `export MLFLOW_RUN=<your_run_name>` or use the name=<name> argument
  11. 3. To start a local MLflow server:
  12. mlflow server --backend-store-uri runs/mlflow
  13. It will by default start a local server at http://127.0.0.1:5000.
  14. To specify a different URI, set the MLFLOW_TRACKING_URI environment variable.
  15. 4. To kill all running MLflow server instances:
  16. ps aux | grep 'mlflow' | grep -v 'grep' | awk '{print $2}' | xargs kill -9
  17. """
  18. from ultralytics.utils import LOGGER, RUNS_DIR, SETTINGS, TESTS_RUNNING, colorstr
# Optional-dependency guard: if any import or check below fails, `mlflow` is set to None and the
# `callbacks` mapping at the bottom of this module is left empty, i.e. the integration is disabled.
# NOTE(review): the checks are written as `assert` statements, which Python skips under `-O`; confirm
# that optimized mode is not a supported way of running this module.
try:
    import os

    # While the test suite is running, continue only if the current pytest test id contains "test_mlflow"
    assert not TESTS_RUNNING or "test_mlflow" in os.environ.get("PYTEST_CURRENT_TEST", "")  # do not log pytest
    assert SETTINGS["mlflow"] is True  # verify integration is enabled
    import mlflow

    assert hasattr(mlflow, "__version__")  # verify package is not directory
    from pathlib import Path

    PREFIX = colorstr("MLflow: ")  # tag placed at the start of this module's log messages

except (ImportError, AssertionError):
    mlflow = None  # sentinel checked by the callbacks and by the `callbacks` mapping
  29. def sanitize_dict(x):
  30. """Sanitize dictionary keys by removing parentheses and converting values to floats."""
  31. return {k.replace("(", "").replace(")", ""): float(v) for k, v in x.items()}
  32. def on_pretrain_routine_end(trainer):
  33. """
  34. Log training parameters to MLflow at the end of the pretraining routine.
  35. This function sets up MLflow logging based on environment variables and trainer arguments. It sets the tracking URI,
  36. experiment name, and run name, then starts the MLflow run if not already active. It finally logs the parameters
  37. from the trainer.
  38. Args:
  39. trainer (ultralytics.engine.trainer.BaseTrainer): The training object with arguments and parameters to log.
  40. Global:
  41. mlflow: The imported mlflow module to use for logging.
  42. Environment Variables:
  43. MLFLOW_TRACKING_URI: The URI for MLflow tracking. If not set, defaults to 'runs/mlflow'.
  44. MLFLOW_EXPERIMENT_NAME: The name of the MLflow experiment. If not set, defaults to trainer.args.project.
  45. MLFLOW_RUN: The name of the MLflow run. If not set, defaults to trainer.args.name.
  46. MLFLOW_KEEP_RUN_ACTIVE: Boolean indicating whether to keep the MLflow run active after the end of training.
  47. """
  48. global mlflow
  49. uri = os.environ.get("MLFLOW_TRACKING_URI") or str(RUNS_DIR / "mlflow")
  50. LOGGER.debug(f"{PREFIX} tracking uri: {uri}")
  51. mlflow.set_tracking_uri(uri)
  52. # Set experiment and run names
  53. experiment_name = os.environ.get("MLFLOW_EXPERIMENT_NAME") or trainer.args.project or "/Shared/YOLOv8"
  54. run_name = os.environ.get("MLFLOW_RUN") or trainer.args.name
  55. mlflow.set_experiment(experiment_name)
  56. mlflow.autolog()
  57. try:
  58. active_run = mlflow.active_run() or mlflow.start_run(run_name=run_name)
  59. LOGGER.info(f"{PREFIX}logging run_id({active_run.info.run_id}) to {uri}")
  60. if Path(uri).is_dir():
  61. LOGGER.info(f"{PREFIX}view at http://127.0.0.1:5000 with 'mlflow server --backend-store-uri {uri}'")
  62. LOGGER.info(f"{PREFIX}disable with 'yolo settings mlflow=False'")
  63. mlflow.log_params(dict(trainer.args))
  64. except Exception as e:
  65. LOGGER.warning(f"{PREFIX}WARNING ⚠️ Failed to initialize: {e}\n" f"{PREFIX}WARNING ⚠️ Not tracking this run")
  66. def on_train_epoch_end(trainer):
  67. """Log training metrics at the end of each train epoch to MLflow."""
  68. if mlflow:
  69. mlflow.log_metrics(
  70. metrics={
  71. **sanitize_dict(trainer.lr),
  72. **sanitize_dict(trainer.label_loss_items(trainer.tloss, prefix="train")),
  73. },
  74. step=trainer.epoch,
  75. )
  76. def on_fit_epoch_end(trainer):
  77. """Log training metrics at the end of each fit epoch to MLflow."""
  78. if mlflow:
  79. mlflow.log_metrics(metrics=sanitize_dict(trainer.metrics), step=trainer.epoch)
  80. def on_train_end(trainer):
  81. """Log model artifacts at the end of the training."""
  82. if not mlflow:
  83. return
  84. mlflow.log_artifact(str(trainer.best.parent)) # log save_dir/weights directory with best.pt and last.pt
  85. for f in trainer.save_dir.glob("*"): # log all other files in save_dir
  86. if f.suffix in {".png", ".jpg", ".csv", ".pt", ".yaml"}:
  87. mlflow.log_artifact(str(f))
  88. keep_run_active = os.environ.get("MLFLOW_KEEP_RUN_ACTIVE", "False").lower() == "true"
  89. if keep_run_active:
  90. LOGGER.info(f"{PREFIX}mlflow run still alive, remember to close it using mlflow.end_run()")
  91. else:
  92. mlflow.end_run()
  93. LOGGER.debug(f"{PREFIX}mlflow run ended")
  94. LOGGER.info(
  95. f"{PREFIX}results logged to {mlflow.get_tracking_uri()}\n{PREFIX}disable with 'yolo settings mlflow=False'"
  96. )
  97. callbacks = (
  98. {
  99. "on_pretrain_routine_end": on_pretrain_routine_end,
  100. "on_train_epoch_end": on_train_epoch_end,
  101. "on_fit_epoch_end": on_fit_epoch_end,
  102. "on_train_end": on_train_end,
  103. }
  104. if mlflow
  105. else {}
  106. )