Source code for utils.stats

try:
    from resource import getrusage, RUSAGE_CHILDREN, RUSAGE_SELF

    def get_memory_mb():
        """
        Get the memory usage (in MB) of the current process and its children.

        Returns:
            dict: A dictionary with the following keys:
                - self: the memory usage of the current process.
                - children: the memory usage of the children of the current process.
                - total: the total memory usage of the process and its children.
        """
        res = {
            "self": getrusage(RUSAGE_SELF).ru_maxrss / 1024,
            "children": getrusage(RUSAGE_CHILDREN).ru_maxrss / 1024,
            "total": getrusage(RUSAGE_SELF).ru_maxrss / 1024
            + getrusage(RUSAGE_CHILDREN).ru_maxrss / 1024,
        }
        return res
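
    # Illustrative (hypothetical) output, assuming a Linux host where
    # ru_maxrss is reported in KB so the division above yields MB:
    #   >>> get_memory_mb()
    #   {'self': 152.3, 'children': 0.0, 'total': 152.3}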
except BaseException:
    get_memory_mb = None

import torch

try:
    if torch.cuda.is_available():
        from utils.conf import get_alloc_memory_all_devices

        def get_memory_gpu_mb(avail_devices=None):
            """
            Get the memory usage of the selected GPUs in MB.
            """
            return [d / 1024 / 1024 for d in get_alloc_memory_all_devices(avail_devices=avail_devices)]
    else:
        get_memory_gpu_mb = None
except BaseException:
    get_memory_gpu_mb = None

import logging

from utils.loggers import Logger


def _parse_device_ids(device):
    """
    Normalize a device specification to a list of CUDA ids.

    Returns None when no CUDA filtering applies (e.g. a CPU device).
    """
    if device is None:
        return None

    if isinstance(device, torch.device):
        if device.type != 'cuda':
            return None
        if device.index is None:
            # A bare `torch.device('cuda')` selects every visible device.
            return list(range(torch.cuda.device_count()))
        if 0 <= device.index < torch.cuda.device_count():
            return [device.index]
        logging.warning(f"Requested device index {device.index} is out of range.")
        return None

    if isinstance(device, str):
        if 'cuda' not in device:
            return None
        if device.strip() == 'cuda':
            # A bare "cuda" string also selects every visible device
            # (short-circuit to avoid spurious parse warnings below).
            return list(range(torch.cuda.device_count()))
        parts = [p for p in device.split(',') if p.strip() != '']
        if len(parts) == 0:
            return list(range(torch.cuda.device_count()))
        ids = []
        for p in parts:
            try:
                ids.append(int(p.split(':')[-1]))
            except ValueError:
                logging.warning(f"Could not parse device id from `{p}`, skipping.")
        ids = [i for i in ids if 0 <= i < torch.cuda.device_count()]
        if len(ids) == 0:
            logging.warning("No valid CUDA device ids parsed, falling back to all visible devices.")
            return list(range(torch.cuda.device_count()))
        return ids

    if isinstance(device, (list, tuple)):
        ids = []
        for d in device:
            if isinstance(d, int):
                ids.append(d)
            elif isinstance(d, torch.device) and d.type == 'cuda' and d.index is not None:
                ids.append(d.index)
        ids = [i for i in ids if 0 <= i < torch.cuda.device_count()]
        return ids or None

    return None
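
# A hedged sketch of how _parse_device_ids normalizes its input, assuming a
# machine with two visible CUDA devices (ids 0 and 1):
#   _parse_device_ids(None)                    -> None (no filtering requested)
#   _parse_device_ids('cpu')                   -> None
#   _parse_device_ids('cuda')                  -> [0, 1]
#   _parse_device_ids('cuda:1')                -> [1]
#   _parse_device_ids('cuda:0,cuda:1')         -> [0, 1]
#   _parse_device_ids(torch.device('cuda', 1)) -> [1]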


class track_system_stats:
    """
    A context manager that tracks the memory usage of the system.
    Tracks both CPU and GPU memory usage if available.

    Usage:

    .. code-block:: python

        with track_system_stats() as t:
            for i in range(100):
                ...  # Do something

                t()

            cpu_res, gpu_res = t.get_stats()

    Args:
        logger (Logger): external logger.
        device: device (or list of devices) to monitor. Defaults to all visible CUDA devices.
        disabled (bool): if True, the context manager will not track the memory usage.
    """

    def get_stats(self):
        """
        Get the memory usage of the system.

        Returns:
            tuple: (cpu_res, gpu_res), where cpu_res is the CPU memory usage in MB
                and gpu_res maps each monitored GPU id to its memory usage in MB.
        """
        cpu_res = None
        if get_memory_mb is not None:
            cpu_res = get_memory_mb()['total']

        gpu_res = None
        if get_memory_gpu_mb is not None:
            gpu_res = get_memory_gpu_mb(self.gpu_ids)
            gpu_res = self._zip_gpu_res(gpu_res)

        return cpu_res, gpu_res
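
    # Illustrative shape of the value returned by get_stats() when two GPUs
    # are monitored (numbers are made up):
    #   (1250.55, {0: 512.00, 1: 48.25})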

    def __init__(self, logger: Logger = None, device=None, disabled=False):
        self.logger = logger
        self.disabled = disabled
        self._it = 0
        self.gpu_ids = _parse_device_ids(device) if torch.cuda.is_available() else None

    def __enter__(self):
        if self.disabled:
            return self

        self.initial_cpu_res, self.initial_gpu_res = self.get_stats()
        if self.initial_cpu_res is None and self.initial_gpu_res is None:
            # Neither CPU nor GPU statistics are available: disable tracking.
            self.disabled = True
        else:
            self.avg_gpu_res = self.initial_gpu_res
            self.avg_cpu_res = self.initial_cpu_res
            self.max_cpu_res = self.initial_cpu_res
            self.max_gpu_res = self.initial_gpu_res

            if self.logger is not None:
                self.logger.log_system_stats(self.initial_cpu_res, self.initial_gpu_res)

        return self

    def __call__(self):
        if self.disabled:
            return
        cpu_res, gpu_res = self.get_stats()
        self.update_stats(cpu_res, gpu_res)

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.disabled:
            return
        if torch.cuda.is_available():
            # Synchronize so that errors raised asynchronously by the GPU surface here.
            torch.cuda.synchronize()
        cpu_res, gpu_res = self.get_stats()
        self.update_stats(cpu_res, gpu_res)

    def update_stats(self, cpu_res, gpu_res):
        """
        Update the memory usage statistics.

        Args:
            cpu_res (float): the memory usage of the CPU in MB.
            gpu_res (dict): the memory usage of the GPUs in MB, keyed by device id.
        """
        if self.disabled:
            return

        self._it += 1
        alpha = 1 / self._it  # weight of the new sample in the running mean

        if self.initial_cpu_res is not None:
            self.avg_cpu_res = self.avg_cpu_res + alpha * (cpu_res - self.avg_cpu_res)
            self.max_cpu_res = max(self.max_cpu_res, cpu_res)

        if self.initial_gpu_res is not None:
            # Mirror the CPU running mean per device: the base of the update is the
            # previous average, not the new sample.
            self.avg_gpu_res = {g: (self.avg_gpu_res[g] + alpha * (g_res - self.avg_gpu_res[g]))
                                for g, g_res in gpu_res.items()}
            self.max_gpu_res = {g: max(self.max_gpu_res[g], g_res) for g, g_res in gpu_res.items()}

        if self.logger is not None:
            self.logger.log_system_stats(cpu_res, gpu_res)
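
    # Worked example of the incremental mean used in update_stats: for samples
    # 10, 20, 30 the average evolves as 10 -> 10 + (20 - 10)/2 = 15
    # -> 15 + (30 - 15)/3 = 20, i.e. the arithmetic mean without storing history.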

    def print_stats(self):
        """
        Print the memory usage statistics.
        """
        cpu_res, gpu_res = self.get_stats()

        # Print initial, average, final, and max memory usage
        logging.info("System stats:")
        if cpu_res is not None:
            logging.info(f"\tInitial CPU memory usage: {self.initial_cpu_res:.2f} MB")
            logging.info(f"\tAverage CPU memory usage: {self.avg_cpu_res:.2f} MB")
            logging.info(f"\tFinal CPU memory usage: {cpu_res:.2f} MB")
            logging.info(f"\tMax CPU memory usage: {self.max_cpu_res:.2f} MB")

        if gpu_res is not None:
            for gpu_id, g_res in gpu_res.items():
                logging.info(f"\tInitial GPU {gpu_id} memory usage: {self.initial_gpu_res[gpu_id]:.2f} MB")
                logging.info(f"\tAverage GPU {gpu_id} memory usage: {self.avg_gpu_res[gpu_id]:.2f} MB")
                logging.info(f"\tFinal GPU {gpu_id} memory usage: {g_res:.2f} MB")
                logging.info(f"\tMax GPU {gpu_id} memory usage: {self.max_gpu_res[gpu_id]:.2f} MB")
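
    # Illustrative log output of print_stats(), following the f-strings above
    # (values are made up):
    #   System stats:
    #       Initial CPU memory usage: 1250.55 MB
    #       Average CPU memory usage: 1310.20 MB
    #       Final CPU memory usage: 1333.87 MB
    #       Max CPU memory usage: 1402.11 MB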

    def _zip_gpu_res(self, gpu_res):
        """
        Zip a list of GPU stats into a dict keyed by the selected GPU ids.
        """
        if gpu_res is None:
            return None
        keys = self.gpu_ids if self.gpu_ids is not None else list(range(len(gpu_res)))
        if len(keys) != len(gpu_res):
            logging.warning("Mismatch between provided GPU ids and measured GPUs. Falling back to enumeration.")
            keys = list(range(len(gpu_res)))
        return {g: g_res for g, g_res in zip(keys, gpu_res)}
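

# A minimal usage sketch (hedged: assumes this module's own imports resolve;
# runs only when the file is executed directly, not on import):
if __name__ == "__main__":
    with track_system_stats() as t:
        buffers = []
        for _ in range(10):
            # Allocate ~4 MB per step so the tracked numbers visibly change.
            buffers.append(torch.zeros(1024, 1024))
            t()  # take a new CPU/GPU measurement
        t.print_stats()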