Source code for nvitop.api.collector

# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2024 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Resource metrics collectors."""

from __future__ import annotations

import contextlib
import itertools
import math
import os
import threading
import time
from collections import OrderedDict, defaultdict
from typing import Callable, ClassVar, Generator, Iterable, NamedTuple, TypeVar
from weakref import WeakSet

from nvitop.api import host
from nvitop.api.device import CudaDevice, Device
from nvitop.api.process import GpuProcess, HostProcess
from nvitop.api.utils import GiB, MiB, Snapshot


__all__ = ['take_snapshots', 'collect_in_background', 'ResourceMetricCollector']


class SnapshotResult(NamedTuple):  # pylint: disable=missing-class-docstring
    devices: list[Snapshot]
    gpu_processes: list[Snapshot]


timer = time.monotonic


_T = TypeVar('_T')


def _unique(iterable: Iterable[_T]) -> list[_T]:
    return list(OrderedDict.fromkeys(iterable).keys())


# pylint: disable-next=too-many-branches
[docs]def take_snapshots(
    devices: Device | Iterable[Device] | None = None,
    *,
    gpu_processes: bool | GpuProcess | Iterable[GpuProcess] | None = None,
) -> SnapshotResult:
    """Retrieve status of demanded devices and GPU processes.

    Args:
        devices (Optional[Union[Device, Iterable[Device]]]):
            Requested devices for snapshots. If not given, the devices will be determined from GPU
            processes: **(1)** All devices (no GPU processes are given); **(2)** Devices that used
            by given GPU processes.
        gpu_processes (Optional[Union[bool, GpuProcess, Iterable[GpuProcess]]]):
            Requested GPU processes snapshots. If not given, all GPU processes running on the
            requested device will be returned. The GPU process snapshots can be suppressed by
            specifying ``gpu_processes=False``.

    Returns: SnapshotResult
        A named tuple containing two lists of snapshots.

    Note:
        If not arguments are specified, all devices and all GPU processes will
        be returned.

    Examples:
        >>> from nvitop import take_snapshots, Device
        >>> import os
        >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        >>> os.environ['CUDA_VISIBLE_DEVICES'] = '1,0'

        >>> take_snapshots()  # equivalent to `take_snapshots(Device.all())`
        SnapshotResult(
            devices=[
                PhysicalDeviceSnapshot(
                    real=PhysicalDevice(index=0, ...),
                    ...
                ),
                ...
            ],
            gpu_processes=[
                GpuProcessSnapshot(
                    real=GpuProcess(pid=xxxxxx, device=PhysicalDevice(index=0, ...), ...),
                    ...
                ),
                ...
            ]
        )

        >>> device_snapshots, gpu_process_snapshots = take_snapshots(Device.all())  # type: Tuple[List[DeviceSnapshot], List[GpuProcessSnapshot]]

        >>> device_snapshots, _ = take_snapshots(gpu_processes=False)  # ignore process snapshots

        >>> take_snapshots(Device.cuda.all())  # use CUDA device enumeration
        SnapshotResult(
            devices=[
                CudaDeviceSnapshot(
                    real=CudaDevice(cuda_index=0, physical_index=1, ...),
                    ...
                ),
                CudaDeviceSnapshot(
                    real=CudaDevice(cuda_index=1, physical_index=0, ...),
                    ...
                ),
            ],
            gpu_processes=[
                GpuProcessSnapshot(
                    real=GpuProcess(pid=xxxxxx, device=CudaDevice(cuda_index=0, ...), ...),
                    ...
                ),
                ...
            ]
        )

        >>> take_snapshots(Device.cuda(1))  # <CUDA 1> only
        SnapshotResult(
            devices=[
                CudaDeviceSnapshot(
                    real=CudaDevice(cuda_index=1, physical_index=0, ...),
                    ...
                )
            ],
            gpu_processes=[
                GpuProcessSnapshot(
                    real=GpuProcess(pid=xxxxxx, device=CudaDevice(cuda_index=1, ...), ...),
                    ...
                ),
                ...
            ]
        )
    """  # pylint: disable=line-too-long
    if isinstance(devices, Device):
        devices = [devices]
    if isinstance(gpu_processes, GpuProcess):
        gpu_processes = [gpu_processes]

    if gpu_processes is not None and gpu_processes is not True:
        if gpu_processes:  # is a non-empty list/tuple
            gpu_processes = list(gpu_processes)
            process_devices = _unique(process.device for process in gpu_processes)
            for device in process_devices:
                device.processes()  # update GPU status for requested GPU processes
            if devices is None:
                devices = process_devices
        else:
            gpu_processes = []  # False or empty list/tuple
            if devices is None:
                devices = Device.all()
    else:
        if devices is None:
            physical_devices = Device.all()
            devices = []
            leaf_devices: list[Device] = []
            for physical_device in physical_devices:
                devices.append(physical_device)
                mig_devices = physical_device.mig_devices()
                if len(mig_devices) > 0:
                    devices.extend(mig_devices)
                    leaf_devices.extend(mig_devices)
                else:
                    leaf_devices.append(physical_device)
        else:
            leaf_devices = devices = list(devices)
        gpu_processes = list(
            itertools.chain.from_iterable(device.processes().values() for device in leaf_devices),
        )

    devices = [device.as_snapshot() for device in devices]  # type: ignore[union-attr]
    gpu_processes = GpuProcess.take_snapshots(gpu_processes, failsafe=True)

    return SnapshotResult(devices, gpu_processes)


# pylint: disable-next=too-many-arguments
[docs]def collect_in_background(
    on_collect: Callable[[dict[str, float]], bool],
    collector: ResourceMetricCollector | None = None,
    interval: float | None = None,
    *,
    on_start: Callable[[ResourceMetricCollector], None] | None = None,
    on_stop: Callable[[ResourceMetricCollector], None] | None = None,
    tag: str = 'metrics-daemon',
    start: bool = True,
) -> threading.Thread:
    """Start a background daemon thread that collect and call the callback function periodically.

    See also :func:`ResourceMetricCollector.daemonize`.

    Args:
        on_collect (Callable[[Dict[str, float]], bool]):
            A callback function that will be called periodically. It takes a dictionary containing
            the resource metrics and returns a boolean indicating whether to continue monitoring.
        collector (Optional[ResourceMetricCollector]):
            A :class:`ResourceMetricCollector` instance to collect metrics. If not given, it will
            collect metrics for all GPUs and subprocess of the current process.
        interval (Optional[float]):
            The collect interval. If not given, use ``collector.interval``.
        on_start (Optional[Callable[[ResourceMetricCollector], None]]):
            A function to initialize the daemon thread and collector.
        on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
            A function that do some necessary cleanup after the daemon thread is stopped.
        tag (str):
            The tag prefix used for metrics results.
        start (bool):
            Whether to start the daemon thread on return.

    Returns: threading.Thread
        A daemon thread object.

    Examples:
        .. code-block:: python

            logger = ...

            def on_collect(metrics):  # will be called periodically
                if logger.is_closed():  # closed manually by user
                    return False
                logger.log(metrics)
                return True

            def on_stop(collector):  # will be called only once at stop
                if not logger.is_closed():
                    logger.close()  # cleanup

            # Record metrics to the logger in the background every 5 seconds.
            # It will collect 5-second mean/min/max for each metric.
            collect_in_background(
                on_collect,
                ResourceMetricCollector(Device.cuda.all()),
                interval=5.0,
                on_stop=on_stop,
            )
    """
    if collector is None:
        collector = ResourceMetricCollector()
    if isinstance(interval, (int, float)) and interval > 0:
        interval = float(interval)
    elif interval is None:
        interval = collector.interval
    else:
        raise ValueError(f'Invalid argument interval={interval!r}')

    def target() -> None:
        if on_start is not None:
            on_start(collector)  # type: ignore[arg-type]
        try:
            with collector(tag):  # type: ignore[misc]
                try:
                    next_snapshot = timer() + interval  # type: ignore[operator]
                    while on_collect(collector.collect()):  # type: ignore[union-attr]
                        time.sleep(max(0.0, next_snapshot - timer()))
                        next_snapshot += interval  # type: ignore[operator]
                except KeyboardInterrupt:
                    pass
        finally:
            if on_stop is not None:
                on_stop(collector)  # type: ignore[arg-type]

    daemon = threading.Thread(target=target, name=tag, daemon=True)
    daemon.collector = collector  # type: ignore[attr-defined]
    if start:
        daemon.start()
    return daemon


[docs]class ResourceMetricCollector:  # pylint: disable=too-many-instance-attributes
    """A class for collecting resource metrics.

    Args:
        devices (Iterable[Device]):
            Set of Device instances for logging. If not given, all physical devices on board will be
            used.
        root_pids (Set[int]):
            A set of PIDs, only the status of the descendant processes on the GPUs will be collected.
            If not given, the PID of the current process will be used.
        interval (float):
            The snapshot interval for background daemon thread.

    Core methods:

    .. code-block:: python

        collector.activate(tag='<tag>')  # alias: start
        collector.deactivate()           # alias: stop
        collector.reset(tag='<tag>')
        collector.collect()

        with collector(tag='<tag>'):
            ...

        collector.daemonize(on_collect_fn)

    Examples:
        >>> import os
        >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'

        >>> from nvitop import ResourceMetricCollector, Device

        >>> collector = ResourceMetricCollector()                           # log all devices and descendant processes of the current process on the GPUs
        >>> collector = ResourceMetricCollector(root_pids={1})              # log all devices and all GPU processes
        >>> collector = ResourceMetricCollector(devices=Device.cuda.all())  # use the CUDA ordinal

        >>> with collector(tag='<tag>'):
        ...     # Do something
        ...     collector.collect()  # -> Dict[str, float]
        # key -> '<tag>/<scope>/<metric (unit)>/<mean/min/max>'
        {
            '<tag>/host/cpu_percent (%)/mean': 8.967849777683456,
            '<tag>/host/cpu_percent (%)/min': 6.1,
            '<tag>/host/cpu_percent (%)/max': 28.1,
            ...,
            '<tag>/host/memory_percent (%)/mean': 21.5,
            '<tag>/host/swap_percent (%)/mean': 0.3,
            '<tag>/host/memory_used (GiB)/mean': 91.0136418208109,
            '<tag>/host/load_average (%) (1 min)/mean': 10.251427386878328,
            '<tag>/host/load_average (%) (5 min)/mean': 10.072539414569503,
            '<tag>/host/load_average (%) (15 min)/mean': 11.91126970422139,
            ...,
            '<tag>/cuda:0 (gpu:3)/memory_used (MiB)/mean': 3.875,
            '<tag>/cuda:0 (gpu:3)/memory_free (MiB)/mean': 11015.562499999998,
            '<tag>/cuda:0 (gpu:3)/memory_total (MiB)/mean': 11019.437500000002,
            '<tag>/cuda:0 (gpu:3)/memory_percent (%)/mean': 0.0,
            '<tag>/cuda:0 (gpu:3)/gpu_utilization (%)/mean': 0.0,
            '<tag>/cuda:0 (gpu:3)/memory_utilization (%)/mean': 0.0,
            '<tag>/cuda:0 (gpu:3)/fan_speed (%)/mean': 22.0,
            '<tag>/cuda:0 (gpu:3)/temperature (C)/mean': 25.0,
            '<tag>/cuda:0 (gpu:3)/power_usage (W)/mean': 19.11166264116916,
            ...,
            '<tag>/cuda:1 (gpu:2)/memory_used (MiB)/mean': 8878.875,
            ...,
            '<tag>/cuda:2 (gpu:1)/memory_used (MiB)/mean': 8182.875,
            ...,
            '<tag>/cuda:3 (gpu:0)/memory_used (MiB)/mean': 9286.875,
            ...,
            '<tag>/pid:12345/host/cpu_percent (%)/mean': 151.34342772112265,
            '<tag>/pid:12345/host/host_memory (MiB)/mean': 44749.72373447514,
            '<tag>/pid:12345/host/host_memory_percent (%)/mean': 8.675082352111717,
            '<tag>/pid:12345/host/running_time (min)': 336.23803206741576,
            '<tag>/pid:12345/cuda:1 (gpu:4)/gpu_memory (MiB)/mean': 8861.0,
            '<tag>/pid:12345/cuda:1 (gpu:4)/gpu_memory_percent (%)/mean': 80.4,
            '<tag>/pid:12345/cuda:1 (gpu:4)/gpu_memory_utilization (%)/mean': 6.711118172407917,
            '<tag>/pid:12345/cuda:1 (gpu:4)/gpu_sm_utilization (%)/mean': 48.23283397736476,
            ...,
            '<tag>/duration (s)': 7.247399162035435,
            '<tag>/timestamp': 1655909466.9981883
        }
    """  # pylint: disable=line-too-long

    DEVICE_METRICS: ClassVar[list[tuple[str, str, float | int]]] = [
        # (<attribute>, <name>, <unit>)
        # GPU memory metrics
        ('memory_used', 'memory_used (MiB)', MiB),
        ('memory_free', 'memory_free (MiB)', MiB),
        ('memory_total', 'memory_total (MiB)', MiB),
        ('memory_percent', 'memory_percent (%)', 1.0),
        # GPU utilization metrics
        ('gpu_utilization', 'gpu_utilization (%)', 1.0),
        ('memory_utilization', 'memory_utilization (%)', 1.0),
        # Miscellaneous
        ('fan_speed', 'fan_speed (%)', 1.0),
        ('temperature', 'temperature (C)', 1.0),
        ('power_usage', 'power_usage (W)', 1000.0),
    ]

    PROCESS_METRICS: ClassVar[list[tuple[str, str | None, str, float | int]]] = [
        # (<attribute>, <scope>, <name>, <unit>)
        # Host resource metrics
        ('cpu_percent', 'host', 'cpu_percent (%)', 1.0),
        ('host_memory', 'host', 'host_memory (MiB)', MiB),
        ('host_memory_percent', 'host', 'host_memory_percent (%)', 1.0),
        ('running_time_in_seconds', 'host', 'running_time (min)', 60.0),
        # GPU memory metrics
        ('gpu_memory', None, 'gpu_memory (MiB)', MiB),
        ('gpu_memory_percent', None, 'gpu_memory_percent (%)', 1.0),
        ('gpu_memory_utilization', None, 'gpu_memory_utilization (%)', 1.0),
        # GPU utilization metrics
        ('gpu_sm_utilization', None, 'gpu_sm_utilization (%)', 1.0),
    ]

[docs]    def __init__(
        self,
        devices: Iterable[Device] | None = None,
        root_pids: Iterable[int] | None = None,
        interval: float = 1.0,
    ) -> None:
        """Initialize the resource metric collector."""
        if isinstance(interval, (int, float)) and interval > 0:
            interval = float(interval)
        else:
            raise ValueError(f'Invalid argument interval={interval!r}')

        if devices is None:
            devices = Device.all()

        root_pids: set[int] = {os.getpid()} if root_pids is None else set(root_pids)

        self.interval: float = interval

        self.devices: list[Device] = list(devices)
        self.all_devices: list[Device] = []
        self.leaf_devices: list[Device] = []
        for device in self.devices:
            self.all_devices.append(device)
            mig_devices = device.mig_devices()
            if len(mig_devices) > 0:
                self.all_devices.extend(mig_devices)
                self.leaf_devices.extend(mig_devices)
            else:
                self.leaf_devices.append(device)

        self.root_pids: set[int] = root_pids
        self._positive_processes: WeakSet[HostProcess] = WeakSet(
            HostProcess(pid) for pid in self.root_pids
        )
        self._negative_processes: WeakSet[HostProcess] = WeakSet()

        self._last_timestamp: float = timer() - 2.0 * self.interval
        self._lock: threading.RLock = threading.RLock()
        self._metric_buffer: _MetricBuffer | None = None
        self._tags: set[str] = set()

        self._daemon: threading.Thread = threading.Thread(
            name='metrics-collector-daemon',
            target=self._target,
            daemon=True,
        )
        self._daemon_running: threading.Event = threading.Event()

[docs]    def activate(self, tag: str) -> ResourceMetricCollector:
        """Start a new metric collection with the given tag.

        Args:
            tag (str):
                The name of the new metric collection. The tag will be used to identify the metric
                collection. It must be a unique string.

        Examples:
            >>> collector = ResourceMetricCollector()

            >>> collector.activate(tag='train')  # key prefix -> 'train'
            >>> collector.activate(tag='batch')  # key prefix -> 'train/batch'
            >>> collector.deactivate()           # key prefix -> 'train'
            >>> collector.deactivate()           # the collector has been stopped
            >>> collector.activate(tag='test')   # key prefix -> 'test'
        """
        with self._lock:
            if self._metric_buffer is None or tag not in self._tags:
                self._tags.add(tag)
                self._metric_buffer = _MetricBuffer(tag, self, prev=self._metric_buffer)
                self._last_timestamp = timer() - 2.0 * self.interval
            else:
                raise RuntimeError(f'Resource metric collector is already started with tag "{tag}"')

        self._daemon_running.set()
        try:
            self._daemon.start()
        except RuntimeError:
            pass

        return self

    start = activate

[docs]    def deactivate(self, tag: str | None = None) -> ResourceMetricCollector:
        """Stop the current collection with the given tag and remove all sub-tags.

        If the tag is not specified, deactivate the current active collection. For nested
        collections, the sub-collections will be deactivated as well.

        Args:
            tag (Optional[str]):
                The tag to deactivate. If :data:`None`, the current active collection will be used.
        """
        with self._lock:
            if self._metric_buffer is None:
                if tag is not None:
                    raise RuntimeError('Resource metric collector has not been started yet.')
                return self

            if tag is None:
                tag = self._metric_buffer.tag
            elif tag not in self._tags:
                raise RuntimeError(
                    f'Resource metric collector has not been started with tag "{tag}".',
                )

            buffer = self._metric_buffer
            while True:
                self._tags.remove(buffer.tag)
                if buffer.tag == tag:
                    self._metric_buffer = buffer.prev
                    break
                buffer = buffer.prev  # type: ignore[assignment]

            if self._metric_buffer is None:
                self._daemon_running.clear()

        return self

    stop = deactivate

[docs]    @contextlib.contextmanager
    def context(self, tag: str) -> Generator[ResourceMetricCollector, None, None]:
        """A context manager for starting and stopping resource metric collection.

        Args:
            tag (str):
                The name of the new metric collection. The tag will be used to identify the metric
                collection. It must be a unique string.

        Examples:
            >>> collector = ResourceMetricCollector()

            >>> with collector.context(tag='train'):  # key prefix -> 'train'
            ...     # Do something
            ...     collector.collect()  # -> Dict[str, float]
        """
        try:
            self.activate(tag=tag)
            yield self
        finally:
            self.deactivate(tag=tag)

    __call__ = context  # alias for `with collector(tag='<tag>')`

[docs]    def clear(self, tag: str | None = None) -> None:
        """Reset the metric collection with the given tag.

        If the tag is not specified, reset the current active collection. For nested collections,
        the sub-collections will be reset as well.

        Args:
            tag (Optional[str]):
                The tag to reset. If :data:`None`, the current active collection will be reset.

        Examples:
            >>> collector = ResourceMetricCollector()

            >>> with collector(tag='train'):          # key prefix -> 'train'
            ...     time.sleep(5.0)
            ...     collector.collect()               # metrics within the 5.0s interval
            ...
            ...     time.sleep(5.0)
            ...     collector.collect()               # metrics within the cumulative 10.0s interval
            ...
            ...     collector.reset()                 # reset the active collection
            ...     time.sleep(5.0)
            ...     collector.collect()               # metrics within the 5.0s interval
            ...
            ...     with collector(tag='batch'):      # key prefix -> 'train/batch'
            ...         collector.reset(tag='train')  # reset both 'train' and 'train/batch'
        """
        with self._lock:
            if self._metric_buffer is None:
                if tag is not None:
                    raise RuntimeError('Resource metric collector has not been started yet.')
                return

            if tag is None:
                tag = self._metric_buffer.tag
            elif tag not in self._tags:
                raise RuntimeError(
                    f'Resource metric collector has not been started with tag "{tag}".',
                )

            buffer = self._metric_buffer
            while True:
                buffer.clear()
                if buffer.tag == tag:
                    break
                buffer = buffer.prev  # type: ignore[assignment]

[docs]    def collect(self) -> dict[str, float]:
        """Get the average resource consumption during collection."""
        with self._lock:
            if self._metric_buffer is None:
                raise RuntimeError('Resource metric collector has not been started yet.')

            if timer() - self._last_timestamp > self.interval / 2.0:
                self.take_snapshots()
            return self._metric_buffer.collect()

    # pylint: disable-next=too-many-arguments
[docs]    def daemonize(
        self,
        on_collect: Callable[[dict[str, float]], bool],
        interval: float | None = None,
        *,
        on_start: Callable[[ResourceMetricCollector], None] | None = None,
        on_stop: Callable[[ResourceMetricCollector], None] | None = None,
        tag: str = 'metrics-daemon',
        start: bool = True,
    ) -> threading.Thread:
        """Start a background daemon thread that collect and call the callback function periodically.

        See also :func:`collect_in_background`.

        Args:
            on_collect (Callable[[Dict[str, float]], bool]):
                A callback function that will be called periodically. It takes a dictionary containing
                the resource metrics and returns a boolean indicating whether to continue monitoring.
            interval (Optional[float]):
                The collect interval. If not given, use ``collector.interval``.
            on_start (Optional[Callable[[ResourceMetricCollector], None]]):
                A function to initialize the daemon thread and collector.
            on_stop (Optional[Callable[[ResourceMetricCollector], None]]):
                A function that do some necessary cleanup after the daemon thread is stopped.
            tag (str):
                The tag prefix used for metrics results.
            start (bool):
                Whether to start the daemon thread on return.

        Returns: threading.Thread
            A daemon thread object.

        Examples:
            .. code-block:: python

                logger = ...

                def on_collect(metrics):  # will be called periodically
                    if logger.is_closed():  # closed manually by user
                        return False
                    logger.log(metrics)
                    return True

                def on_stop(collector):  # will be called only once at stop
                    if not logger.is_closed():
                        logger.close()  # cleanup

                # Record metrics to the logger in the background every 5 seconds.
                # It will collect 5-second mean/min/max for each metric.
                ResourceMetricCollector(Device.cuda.all()).daemonize(
                    on_collect,
                    ResourceMetricCollector(Device.cuda.all()),
                    interval=5.0,
                    on_stop=on_stop,
                )
        """
        return collect_in_background(
            on_collect,
            collector=self,
            interval=interval,
            on_start=on_start,
            on_stop=on_stop,
            tag=tag,
            start=start,
        )

[docs]    def __del__(self) -> None:
        """Clean up the demon thread on destruction."""
        self._daemon_running.clear()

    # pylint: disable-next=too-many-branches,too-many-locals,too-many-statements
[docs]    def take_snapshots(self) -> SnapshotResult:
        """Take snapshots of the current resource metrics and update the metric buffer."""
        if len(self.root_pids) > 0:
            all_gpu_processes: list[GpuProcess] = []
            for device in self.leaf_devices:
                all_gpu_processes.extend(device.processes().values())

            gpu_processes = []
            for process in all_gpu_processes:
                if process.host in self._negative_processes:
                    continue

                positive = True
                if process.host not in self._positive_processes:
                    positive = False
                    p = process.host
                    parents = []
                    while p is not None:
                        parents.append(p)
                        if p in self._positive_processes:
                            positive = True
                            break
                        try:
                            p = p.parent()  # type: ignore[assignment]
                        except host.PsutilError:
                            break
                    if positive:
                        self._positive_processes.update(parents)
                    else:
                        self._negative_processes.update(parents)

                if positive:
                    gpu_processes.append(process)
        else:
            gpu_processes = []

        timestamp = timer()
        epoch_timestamp = time.time()
        metrics = {}
        device_snapshots = [device.as_snapshot() for device in self.all_devices]
        gpu_process_snapshots = GpuProcess.take_snapshots(gpu_processes, failsafe=True)

        metrics.update(
            {
                'host/cpu_percent (%)': host.cpu_percent(),
                'host/memory_percent (%)': host.memory_percent(),
                'host/swap_percent (%)': host.swap_percent(),
                'host/memory_used (GiB)': host.virtual_memory().used / GiB,
            },
        )
        load_average = host.load_average()
        if load_average is not None:
            metrics.update(
                {
                    'host/load_average (%) (1 min)': load_average[0],
                    'host/load_average (%) (5 min)': load_average[1],
                    'host/load_average (%) (15 min)': load_average[2],
                },
            )

        device_identifiers = {}
        for device_snapshot in device_snapshots:
            identifier = f'gpu:{device_snapshot.index}'
            if isinstance(device_snapshot.real, CudaDevice):
                identifier = f'cuda:{device_snapshot.cuda_index} ({identifier})'
            device_identifiers[device_snapshot.real] = identifier

            for attr, name, unit in self.DEVICE_METRICS:
                value = float(getattr(device_snapshot, attr)) / unit
                metrics[f'{identifier}/{name}'] = value

        for process_snapshot in gpu_process_snapshots:
            device_identifier = device_identifiers[process_snapshot.device]
            identifier = f'pid:{process_snapshot.pid}'

            for attr, scope, name, unit in self.PROCESS_METRICS:
                scope = scope or device_identifier
                value = float(getattr(process_snapshot, attr)) / unit
                metrics[f'{identifier}/{scope}/{name}'] = value

        with self._lock:
            if self._metric_buffer is not None:
                self._metric_buffer.add(
                    metrics,
                    timestamp=timestamp,
                    epoch_timestamp=epoch_timestamp,
                )
                self._last_timestamp = timestamp

        return SnapshotResult(device_snapshots, gpu_process_snapshots)

    def _target(self) -> None:
        self._daemon_running.wait()
        while self._daemon_running.is_set():
            next_snapshot = timer() + self.interval
            self.take_snapshots()
            time.sleep(max(0.0, next_snapshot - timer()))
            next_snapshot += self.interval


class _MetricBuffer:  # pylint: disable=missing-class-docstring,missing-function-docstring,too-many-instance-attributes
    def __init__(
        self,
        tag: str,
        collector: ResourceMetricCollector,
        prev: _MetricBuffer | None = None,
    ) -> None:
        self.collector: ResourceMetricCollector = collector
        self.prev: _MetricBuffer | None = prev

        self.tag: str = tag
        self.key_prefix: str
        if self.prev is not None:
            self.key_prefix = f'{self.prev.key_prefix}/{self.tag}'
        else:
            self.key_prefix = self.tag

        self.last_timestamp = self.start_timestamp = timer()
        self.last_epoch_timestamp = time.time()
        self.buffer: defaultdict[str, _StatisticsMaintainer] = defaultdict(
            lambda: _StatisticsMaintainer(self.last_timestamp),
        )

        self.len = 0

    def add(
        self,
        metrics: dict[str, float],
        timestamp: float | None = None,
        epoch_timestamp: float | None = None,
    ) -> None:
        if timestamp is None:
            timestamp = timer()
        if epoch_timestamp is None:
            epoch_timestamp = time.time()

        for key in set(self.buffer).difference(metrics):
            self.buffer[key].add(math.nan, timestamp=timestamp)
        for key, value in metrics.items():
            self.buffer[key].add(value, timestamp=timestamp)
        self.len += 1
        self.last_timestamp = timestamp
        self.last_epoch_timestamp = epoch_timestamp

        if self.prev is not None:
            self.prev.add(metrics, timestamp=timestamp)

    def clear(self) -> None:
        self.last_timestamp = self.start_timestamp = timer()
        self.last_epoch_timestamp = time.time()
        self.buffer.clear()
        self.len = 0

    def collect(self) -> dict[str, float]:
        metrics = {
            f'{self.key_prefix}/{key}/{name}': value
            for key, stats in self.buffer.items()
            for name, value in stats.items()
        }
        for key in tuple(metrics.keys()):
            if key.endswith('host/running_time (min)/max'):
                metrics[key[:-4]] = metrics[key]
                del metrics[key]
            elif key.endswith(('host/running_time (min)/mean', 'host/running_time (min)/min')):
                del metrics[key]
        metrics[f'{self.key_prefix}/duration (s)'] = timer() - self.start_timestamp
        metrics[f'{self.key_prefix}/timestamp'] = time.time()
        metrics[f'{self.key_prefix}/last_timestamp'] = self.last_epoch_timestamp
        return metrics

    def __len__(self) -> int:
        return self.len


class _StatisticsMaintainer:  # pylint: disable=missing-class-docstring,missing-function-docstring
    def __init__(self, timestamp: float) -> None:
        self.start_timestamp: float = timestamp
        self.last_timestamp: float = math.nan
        self.integral: float | None = None
        self.last_value: float | None = None
        self.min_value: float | None = None
        self.max_value: float | None = None
        self.has_nan: bool = False

    def add(self, value: float, timestamp: float | None = None) -> None:
        if timestamp is None:
            timestamp = timer()

        if math.isnan(value):
            self.has_nan = True
            return

        if self.last_value is None:
            self.integral = value * (timestamp - self.start_timestamp)
            self.last_value = self.min_value = self.max_value = value
        else:
            # pylint: disable-next=line-too-long
            self.integral += (value + self.last_value) * (timestamp - self.last_timestamp) / 2.0  # type: ignore[operator]
            self.last_value = value
            self.min_value = min(self.min_value, value)  # type: ignore[type-var]
            self.max_value = max(self.max_value, value)  # type: ignore[type-var]

        self.last_timestamp = timestamp

    def mean(self) -> float:
        if self.integral is None:
            return math.nan

        if self.has_nan:
            return self.integral / (self.last_timestamp - self.start_timestamp)

        timestamp = timer()
        integral = self.integral + self.last_value * (timestamp - self.last_timestamp)  # type: ignore[operator]
        return integral / (timestamp - self.start_timestamp)

    def min(self) -> float:
        if self.min_value is None:
            return math.nan
        return self.min_value

    def max(self) -> float:
        if self.max_value is None:
            return math.nan
        return self.max_value

    def last(self) -> float:
        if self.last_value is None:
            return math.nan
        return self.last_value

    def items(self) -> Iterable[tuple[str, float]]:
        yield ('mean', self.mean())
        yield ('min', self.min())
        yield ('max', self.max())
        yield ('last', self.last())