loaders.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. # Ultralytics YOLO 🚀, AGPL-3.0 license
  2. import glob
  3. import math
  4. import os
  5. import time
  6. from dataclasses import dataclass
  7. from pathlib import Path
  8. from threading import Thread
  9. from urllib.parse import urlparse
  10. import cv2
  11. import numpy as np
  12. import requests
  13. import torch
  14. from PIL import Image
  15. from ultralytics.data.utils import FORMATS_HELP_MSG, IMG_FORMATS, VID_FORMATS
  16. from ultralytics.utils import IS_COLAB, IS_KAGGLE, LOGGER, ops
  17. from ultralytics.utils.checks import check_requirements
  18. @dataclass
  19. class SourceTypes:
  20. """Class to represent various types of input sources for predictions."""
  21. stream: bool = False
  22. screenshot: bool = False
  23. from_img: bool = False
  24. tensor: bool = False
  25. class LoadStreams:
  26. """
  27. Stream Loader for various types of video streams, Supports RTSP, RTMP, HTTP, and TCP streams.
  28. Attributes:
  29. sources (str): The source input paths or URLs for the video streams.
  30. vid_stride (int): Video frame-rate stride, defaults to 1.
  31. buffer (bool): Whether to buffer input streams, defaults to False.
  32. running (bool): Flag to indicate if the streaming thread is running.
  33. mode (str): Set to 'stream' indicating real-time capture.
  34. imgs (list): List of image frames for each stream.
  35. fps (list): List of FPS for each stream.
  36. frames (list): List of total frames for each stream.
  37. threads (list): List of threads for each stream.
  38. shape (list): List of shapes for each stream.
  39. caps (list): List of cv2.VideoCapture objects for each stream.
  40. bs (int): Batch size for processing.
  41. Methods:
  42. __init__: Initialize the stream loader.
  43. update: Read stream frames in daemon thread.
  44. close: Close stream loader and release resources.
  45. __iter__: Returns an iterator object for the class.
  46. __next__: Returns source paths, transformed, and original images for processing.
  47. __len__: Return the length of the sources object.
  48. Example:
  49. ```bash
  50. yolo predict source='rtsp://example.com/media.mp4'
  51. ```
  52. """
  53. def __init__(self, sources="file.streams", vid_stride=1, buffer=False):
  54. """Initialize instance variables and check for consistent input stream shapes."""
  55. torch.backends.cudnn.benchmark = True # faster for fixed-size inference
  56. self.buffer = buffer # buffer input streams
  57. self.running = True # running flag for Thread
  58. self.mode = "stream"
  59. self.vid_stride = vid_stride # video frame-rate stride
  60. sources = Path(sources).read_text().rsplit() if os.path.isfile(sources) else [sources]
  61. n = len(sources)
  62. self.bs = n
  63. self.fps = [0] * n # frames per second
  64. self.frames = [0] * n
  65. self.threads = [None] * n
  66. self.caps = [None] * n # video capture objects
  67. self.imgs = [[] for _ in range(n)] # images
  68. self.shape = [[] for _ in range(n)] # image shapes
  69. self.sources = [ops.clean_str(x) for x in sources] # clean source names for later
  70. for i, s in enumerate(sources): # index, source
  71. # Start thread to read frames from video stream
  72. st = f"{i + 1}/{n}: {s}... "
  73. if urlparse(s).hostname in {"www.youtube.com", "youtube.com", "youtu.be"}: # if source is YouTube video
  74. # YouTube format i.e. 'https://www.youtube.com/watch?v=Zgi9g1ksQHc' or 'https://youtu.be/LNwODJXcvt4'
  75. s = get_best_youtube_url(s)
  76. s = eval(s) if s.isnumeric() else s # i.e. s = '0' local webcam
  77. if s == 0 and (IS_COLAB or IS_KAGGLE):
  78. raise NotImplementedError(
  79. "'source=0' webcam not supported in Colab and Kaggle notebooks. "
  80. "Try running 'source=0' in a local environment."
  81. )
  82. self.caps[i] = cv2.VideoCapture(s) # store video capture object
  83. if not self.caps[i].isOpened():
  84. raise ConnectionError(f"{st}Failed to open {s}")
  85. w = int(self.caps[i].get(cv2.CAP_PROP_FRAME_WIDTH))
  86. h = int(self.caps[i].get(cv2.CAP_PROP_FRAME_HEIGHT))
  87. fps = self.caps[i].get(cv2.CAP_PROP_FPS) # warning: may return 0 or nan
  88. self.frames[i] = max(int(self.caps[i].get(cv2.CAP_PROP_FRAME_COUNT)), 0) or float(
  89. "inf"
  90. ) # infinite stream fallback
  91. self.fps[i] = max((fps if math.isfinite(fps) else 0) % 100, 0) or 30 # 30 FPS fallback
  92. success, im = self.caps[i].read() # guarantee first frame
  93. if not success or im is None:
  94. raise ConnectionError(f"{st}Failed to read images from {s}")
  95. self.imgs[i].append(im)
  96. self.shape[i] = im.shape
  97. self.threads[i] = Thread(target=self.update, args=([i, self.caps[i], s]), daemon=True)
  98. LOGGER.info(f"{st}Success ✅ ({self.frames[i]} frames of shape {w}x{h} at {self.fps[i]:.2f} FPS)")
  99. self.threads[i].start()
  100. LOGGER.info("") # newline
  101. def update(self, i, cap, stream):
  102. """Read stream `i` frames in daemon thread."""
  103. n, f = 0, self.frames[i] # frame number, frame array
  104. while self.running and cap.isOpened() and n < (f - 1):
  105. if len(self.imgs[i]) < 30: # keep a <=30-image buffer
  106. n += 1
  107. cap.grab() # .read() = .grab() followed by .retrieve()
  108. if n % self.vid_stride == 0:
  109. success, im = cap.retrieve()
  110. if not success:
  111. im = np.zeros(self.shape[i], dtype=np.uint8)
  112. LOGGER.warning("WARNING ⚠️ Video stream unresponsive, please check your IP camera connection.")
  113. cap.open(stream) # re-open stream if signal was lost
  114. if self.buffer:
  115. self.imgs[i].append(im)
  116. else:
  117. self.imgs[i] = [im]
  118. else:
  119. time.sleep(0.01) # wait until the buffer is empty
  120. def close(self):
  121. """Close stream loader and release resources."""
  122. self.running = False # stop flag for Thread
  123. for thread in self.threads:
  124. if thread.is_alive():
  125. thread.join(timeout=5) # Add timeout
  126. for cap in self.caps: # Iterate through the stored VideoCapture objects
  127. try:
  128. cap.release() # release video capture
  129. except Exception as e:
  130. LOGGER.warning(f"WARNING ⚠️ Could not release VideoCapture object: {e}")
  131. cv2.destroyAllWindows()
  132. def __iter__(self):
  133. """Iterates through YOLO image feed and re-opens unresponsive streams."""
  134. self.count = -1
  135. return self
  136. def __next__(self):
  137. """Returns source paths, transformed and original images for processing."""
  138. self.count += 1
  139. images = []
  140. for i, x in enumerate(self.imgs):
  141. # Wait until a frame is available in each buffer
  142. while not x:
  143. if not self.threads[i].is_alive() or cv2.waitKey(1) == ord("q"): # q to quit
  144. self.close()
  145. raise StopIteration
  146. time.sleep(1 / min(self.fps))
  147. x = self.imgs[i]
  148. if not x:
  149. LOGGER.warning(f"WARNING ⚠️ Waiting for stream {i}")
  150. # Get and remove the first frame from imgs buffer
  151. if self.buffer:
  152. images.append(x.pop(0))
  153. # Get the last frame, and clear the rest from the imgs buffer
  154. else:
  155. images.append(x.pop(-1) if x else np.zeros(self.shape[i], dtype=np.uint8))
  156. x.clear()
  157. return self.sources, images, [""] * self.bs
  158. def __len__(self):
  159. """Return the length of the sources object."""
  160. return self.bs # 1E12 frames = 32 streams at 30 FPS for 30 years
  161. class LoadScreenshots:
  162. """
  163. YOLOv8 screenshot dataloader.
  164. This class manages the loading of screenshot images for processing with YOLOv8.
  165. Suitable for use with `yolo predict source=screen`.
  166. Attributes:
  167. source (str): The source input indicating which screen to capture.
  168. screen (int): The screen number to capture.
  169. left (int): The left coordinate for screen capture area.
  170. top (int): The top coordinate for screen capture area.
  171. width (int): The width of the screen capture area.
  172. height (int): The height of the screen capture area.
  173. mode (str): Set to 'stream' indicating real-time capture.
  174. frame (int): Counter for captured frames.
  175. sct (mss.mss): Screen capture object from `mss` library.
  176. bs (int): Batch size, set to 1.
  177. monitor (dict): Monitor configuration details.
  178. Methods:
  179. __iter__: Returns an iterator object.
  180. __next__: Captures the next screenshot and returns it.
  181. """
  182. def __init__(self, source):
  183. """Source = [screen_number left top width height] (pixels)."""
  184. check_requirements("mss")
  185. import mss # noqa
  186. source, *params = source.split()
  187. self.screen, left, top, width, height = 0, None, None, None, None # default to full screen 0
  188. if len(params) == 1:
  189. self.screen = int(params[0])
  190. elif len(params) == 4:
  191. left, top, width, height = (int(x) for x in params)
  192. elif len(params) == 5:
  193. self.screen, left, top, width, height = (int(x) for x in params)
  194. self.mode = "stream"
  195. self.frame = 0
  196. self.sct = mss.mss()
  197. self.bs = 1
  198. self.fps = 30
  199. # Parse monitor shape
  200. monitor = self.sct.monitors[self.screen]
  201. self.top = monitor["top"] if top is None else (monitor["top"] + top)
  202. self.left = monitor["left"] if left is None else (monitor["left"] + left)
  203. self.width = width or monitor["width"]
  204. self.height = height or monitor["height"]
  205. self.monitor = {"left": self.left, "top": self.top, "width": self.width, "height": self.height}
  206. def __iter__(self):
  207. """Returns an iterator of the object."""
  208. return self
  209. def __next__(self):
  210. """mss screen capture: get raw pixels from the screen as np array."""
  211. im0 = np.asarray(self.sct.grab(self.monitor))[:, :, :3] # BGRA to BGR
  212. s = f"screen {self.screen} (LTWH): {self.left},{self.top},{self.width},{self.height}: "
  213. self.frame += 1
  214. return [str(self.screen)], [im0], [s] # screen, img, string
  215. class LoadImagesAndVideos:
  216. """
  217. YOLOv8 image/video dataloader.
  218. This class manages the loading and pre-processing of image and video data for YOLOv8. It supports loading from
  219. various formats, including single image files, video files, and lists of image and video paths.
  220. Attributes:
  221. files (list): List of image and video file paths.
  222. nf (int): Total number of files (images and videos).
  223. video_flag (list): Flags indicating whether a file is a video (True) or an image (False).
  224. mode (str): Current mode, 'image' or 'video'.
  225. vid_stride (int): Stride for video frame-rate, defaults to 1.
  226. bs (int): Batch size, set to 1 for this class.
  227. cap (cv2.VideoCapture): Video capture object for OpenCV.
  228. frame (int): Frame counter for video.
  229. frames (int): Total number of frames in the video.
  230. count (int): Counter for iteration, initialized at 0 during `__iter__()`.
  231. Methods:
  232. _new_video(path): Create a new cv2.VideoCapture object for a given video path.
  233. """
  234. def __init__(self, path, batch=1, vid_stride=1):
  235. """Initialize the Dataloader and raise FileNotFoundError if file not found."""
  236. parent = None
  237. if isinstance(path, str) and Path(path).suffix == ".txt": # *.txt file with img/vid/dir on each line
  238. parent = Path(path).parent
  239. path = Path(path).read_text().splitlines() # list of sources
  240. files = []
  241. for p in sorted(path) if isinstance(path, (list, tuple)) else [path]:
  242. a = str(Path(p).absolute()) # do not use .resolve() https://github.com/ultralytics/ultralytics/issues/2912
  243. if "*" in a:
  244. files.extend(sorted(glob.glob(a, recursive=True))) # glob
  245. elif os.path.isdir(a):
  246. files.extend(sorted(glob.glob(os.path.join(a, "*.*")))) # dir
  247. elif os.path.isfile(a):
  248. files.append(a) # files (absolute or relative to CWD)
  249. elif parent and (parent / p).is_file():
  250. files.append(str((parent / p).absolute())) # files (relative to *.txt file parent)
  251. else:
  252. raise FileNotFoundError(f"{p} does not exist")
  253. # Define files as images or videos
  254. images, videos = [], []
  255. for f in files:
  256. suffix = f.split(".")[-1].lower() # Get file extension without the dot and lowercase
  257. if suffix in IMG_FORMATS:
  258. images.append(f)
  259. elif suffix in VID_FORMATS:
  260. videos.append(f)
  261. ni, nv = len(images), len(videos)
  262. self.files = images + videos
  263. self.nf = ni + nv # number of files
  264. self.ni = ni # number of images
  265. self.video_flag = [False] * ni + [True] * nv
  266. self.mode = "image"
  267. self.vid_stride = vid_stride # video frame-rate stride
  268. self.bs = batch
  269. if any(videos):
  270. self._new_video(videos[0]) # new video
  271. else:
  272. self.cap = None
  273. if self.nf == 0:
  274. raise FileNotFoundError(f"No images or videos found in {p}. {FORMATS_HELP_MSG}")
  275. def __iter__(self):
  276. """Returns an iterator object for VideoStream or ImageFolder."""
  277. self.count = 0
  278. return self
  279. def __next__(self):
  280. """Returns the next batch of images or video frames along with their paths and metadata."""
  281. paths, imgs, info = [], [], []
  282. while len(imgs) < self.bs:
  283. if self.count >= self.nf: # end of file list
  284. if imgs:
  285. return paths, imgs, info # return last partial batch
  286. else:
  287. raise StopIteration
  288. path = self.files[self.count]
  289. if self.video_flag[self.count]:
  290. self.mode = "video"
  291. if not self.cap or not self.cap.isOpened():
  292. self._new_video(path)
  293. for _ in range(self.vid_stride):
  294. success = self.cap.grab()
  295. if not success:
  296. break # end of video or failure
  297. if success:
  298. success, im0 = self.cap.retrieve()
  299. if success:
  300. self.frame += 1
  301. paths.append(path)
  302. imgs.append(im0)
  303. info.append(f"video {self.count + 1}/{self.nf} (frame {self.frame}/{self.frames}) {path}: ")
  304. if self.frame == self.frames: # end of video
  305. self.count += 1
  306. self.cap.release()
  307. else:
  308. # Move to the next file if the current video ended or failed to open
  309. self.count += 1
  310. if self.cap:
  311. self.cap.release()
  312. if self.count < self.nf:
  313. self._new_video(self.files[self.count])
  314. else:
  315. self.mode = "image"
  316. im0 = cv2.imread(path) # BGR
  317. if im0 is None:
  318. LOGGER.warning(f"WARNING ⚠️ Image Read Error {path}")
  319. else:
  320. paths.append(path)
  321. imgs.append(im0)
  322. info.append(f"image {self.count + 1}/{self.nf} {path}: ")
  323. self.count += 1 # move to the next file
  324. if self.count >= self.ni: # end of image list
  325. break
  326. return paths, imgs, info
  327. def _new_video(self, path):
  328. """Creates a new video capture object for the given path."""
  329. self.frame = 0
  330. self.cap = cv2.VideoCapture(path)
  331. self.fps = int(self.cap.get(cv2.CAP_PROP_FPS))
  332. if not self.cap.isOpened():
  333. raise FileNotFoundError(f"Failed to open video {path}")
  334. self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT) / self.vid_stride)
  335. def __len__(self):
  336. """Returns the number of batches in the object."""
  337. return math.ceil(self.nf / self.bs) # number of files
  338. class LoadPilAndNumpy:
  339. """
  340. Load images from PIL and Numpy arrays for batch processing.
  341. This class is designed to manage loading and pre-processing of image data from both PIL and Numpy formats.
  342. It performs basic validation and format conversion to ensure that the images are in the required format for
  343. downstream processing.
  344. Attributes:
  345. paths (list): List of image paths or autogenerated filenames.
  346. im0 (list): List of images stored as Numpy arrays.
  347. mode (str): Type of data being processed, defaults to 'image'.
  348. bs (int): Batch size, equivalent to the length of `im0`.
  349. Methods:
  350. _single_check(im): Validate and format a single image to a Numpy array.
  351. """
  352. def __init__(self, im0):
  353. """Initialize PIL and Numpy Dataloader."""
  354. if not isinstance(im0, list):
  355. im0 = [im0]
  356. self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
  357. self.im0 = [self._single_check(im) for im in im0]
  358. self.mode = "image"
  359. self.bs = len(self.im0)
  360. @staticmethod
  361. def _single_check(im):
  362. """Validate and format an image to numpy array."""
  363. assert isinstance(im, (Image.Image, np.ndarray)), f"Expected PIL/np.ndarray image type, but got {type(im)}"
  364. if isinstance(im, Image.Image):
  365. if im.mode != "RGB":
  366. im = im.convert("RGB")
  367. im = np.asarray(im)[:, :, ::-1]
  368. im = np.ascontiguousarray(im) # contiguous
  369. return im
  370. def __len__(self):
  371. """Returns the length of the 'im0' attribute."""
  372. return len(self.im0)
  373. def __next__(self):
  374. """Returns batch paths, images, processed images, None, ''."""
  375. if self.count == 1: # loop only once as it's batch inference
  376. raise StopIteration
  377. self.count += 1
  378. return self.paths, self.im0, [""] * self.bs
  379. def __iter__(self):
  380. """Enables iteration for class LoadPilAndNumpy."""
  381. self.count = 0
  382. return self
  383. class LoadTensor:
  384. """
  385. Load images from torch.Tensor data.
  386. This class manages the loading and pre-processing of image data from PyTorch tensors for further processing.
  387. Attributes:
  388. im0 (torch.Tensor): The input tensor containing the image(s).
  389. bs (int): Batch size, inferred from the shape of `im0`.
  390. mode (str): Current mode, set to 'image'.
  391. paths (list): List of image paths or filenames.
  392. count (int): Counter for iteration, initialized at 0 during `__iter__()`.
  393. Methods:
  394. _single_check(im, stride): Validate and possibly modify the input tensor.
  395. """
  396. def __init__(self, im0) -> None:
  397. """Initialize Tensor Dataloader."""
  398. self.im0 = self._single_check(im0)
  399. self.bs = self.im0.shape[0]
  400. self.mode = "image"
  401. self.paths = [getattr(im, "filename", f"image{i}.jpg") for i, im in enumerate(im0)]
  402. @staticmethod
  403. def _single_check(im, stride=32):
  404. """Validate and format an image to torch.Tensor."""
  405. s = (
  406. f"WARNING ⚠️ torch.Tensor inputs should be BCHW i.e. shape(1, 3, 640, 640) "
  407. f"divisible by stride {stride}. Input shape{tuple(im.shape)} is incompatible."
  408. )
  409. if len(im.shape) != 4:
  410. if len(im.shape) != 3:
  411. raise ValueError(s)
  412. LOGGER.warning(s)
  413. im = im.unsqueeze(0)
  414. if im.shape[2] % stride or im.shape[3] % stride:
  415. raise ValueError(s)
  416. if im.max() > 1.0 + torch.finfo(im.dtype).eps: # torch.float32 eps is 1.2e-07
  417. LOGGER.warning(
  418. f"WARNING ⚠️ torch.Tensor inputs should be normalized 0.0-1.0 but max value is {im.max()}. "
  419. f"Dividing input by 255."
  420. )
  421. im = im.float() / 255.0
  422. return im
  423. def __iter__(self):
  424. """Returns an iterator object."""
  425. self.count = 0
  426. return self
  427. def __next__(self):
  428. """Return next item in the iterator."""
  429. if self.count == 1:
  430. raise StopIteration
  431. self.count += 1
  432. return self.paths, self.im0, [""] * self.bs
  433. def __len__(self):
  434. """Returns the batch size."""
  435. return self.bs
  436. def autocast_list(source):
  437. """Merges a list of source of different types into a list of numpy arrays or PIL images."""
  438. files = []
  439. for im in source:
  440. if isinstance(im, (str, Path)): # filename or uri
  441. files.append(Image.open(requests.get(im, stream=True).raw if str(im).startswith("http") else im))
  442. elif isinstance(im, (Image.Image, np.ndarray)): # PIL or np Image
  443. files.append(im)
  444. else:
  445. raise TypeError(
  446. f"type {type(im).__name__} is not a supported Ultralytics prediction source type. \n"
  447. f"See https://docs.ultralytics.com/modes/predict for supported source types."
  448. )
  449. return files
  450. def get_best_youtube_url(url, method="pytube"):
  451. """
  452. Retrieves the URL of the best quality MP4 video stream from a given YouTube video.
  453. This function uses the specified method to extract the video info from YouTube. It supports the following methods:
  454. - "pytube": Uses the pytube library to fetch the video streams.
  455. - "pafy": Uses the pafy library to fetch the video streams.
  456. - "yt-dlp": Uses the yt-dlp library to fetch the video streams.
  457. The function then finds the highest quality MP4 format that has a video codec but no audio codec, and returns the
  458. URL of this video stream.
  459. Args:
  460. url (str): The URL of the YouTube video.
  461. method (str): The method to use for extracting video info. Default is "pytube". Other options are "pafy" and
  462. "yt-dlp".
  463. Returns:
  464. (str): The URL of the best quality MP4 video stream, or None if no suitable stream is found.
  465. """
  466. if method == "pytube":
  467. check_requirements("pytube")
  468. from pytube import YouTube
  469. streams = YouTube(url).streams.filter(file_extension="mp4", only_video=True)
  470. streams = sorted(streams, key=lambda s: s.resolution, reverse=True) # sort streams by resolution
  471. for stream in streams:
  472. if stream.resolution and int(stream.resolution[:-1]) >= 1080: # check if resolution is at least 1080p
  473. return stream.url
  474. elif method == "pafy":
  475. check_requirements(("pafy", "youtube_dl==2020.12.2"))
  476. import pafy # noqa
  477. return pafy.new(url).getbestvideo(preftype="mp4").url
  478. elif method == "yt-dlp":
  479. check_requirements("yt-dlp")
  480. import yt_dlp
  481. with yt_dlp.YoutubeDL({"quiet": True}) as ydl:
  482. info_dict = ydl.extract_info(url, download=False) # extract info
  483. for f in reversed(info_dict.get("formats", [])): # reversed because best is usually last
  484. # Find a format with video codec, no audio, *.mp4 extension at least 1920x1080 size
  485. good_size = (f.get("width") or 0) >= 1920 or (f.get("height") or 0) >= 1080
  486. if good_size and f["vcodec"] != "none" and f["acodec"] == "none" and f["ext"] == "mp4":
  487. return f.get("url")
# Define constants
# Tuple of the loader classes defined in this module; note that LoadTensor is not part of it.
LOADERS = (LoadStreams, LoadPilAndNumpy, LoadImagesAndVideos, LoadScreenshots)