Source code for nvitop.api.device

# This file is part of nvitop, the interactive NVIDIA-GPU process viewer.
#
# Copyright 2021-2024 Xuehai Pan. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""The live classes for GPU devices.

The core classes are :class:`Device` and :class:`CudaDevice` (also aliased as :attr:`Device.cuda`).
The type of returned instance created by ``Class(args)`` is depending on the given arguments.

``Device()`` returns:

.. code-block:: python

    - (index: int)        -> PhysicalDevice
    - (index: (int, int)) -> MigDevice
    - (uuid: str)         -> Union[PhysicalDevice, MigDevice]  # depending on the UUID value
    - (bus_id: str)       -> PhysicalDevice

``CudaDevice()`` returns:

.. code-block:: python

    - (cuda_index: int)        -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
    - (uuid: str)              -> Union[CudaDevice, CudaMigDevice]  # depending on `CUDA_VISIBLE_DEVICES`
    - (nvml_index: int)        -> CudaDevice
    - (nvml_index: (int, int)) -> CudaMigDevice

Examples:
    >>> from nvitop import Device, CudaDevice
    >>> Device.driver_version()              # version of the installed NVIDIA display driver
    '470.129.06'

    >>> Device.count()                       # number of NVIDIA GPUs in the system
    10

    >>> Device.all()                         # all physical devices in the system
    [
        PhysicalDevice(index=0, ...),
        PhysicalDevice(index=1, ...),
        ...
    ]

    >>> nvidia0 = Device(index=0)            # -> PhysicalDevice
    >>> mig10   = Device(index=(1, 0))       # -> MigDevice
    >>> nvidia2 = Device(uuid='GPU-xxxxxx')  # -> PhysicalDevice
    >>> mig30   = Device(uuid='MIG-xxxxxx')  # -> MigDevice

    >>> nvidia0.memory_free()                # total free memory in bytes
    11550654464
    >>> nvidia0.memory_free_human()          # total free memory in human readable format
    '11016MiB'

    >>> nvidia2.as_snapshot()                # takes an onetime snapshot of the device
    PhysicalDeviceSnapshot(
        real=PhysicalDevice(index=2, ...),
        ...
    )

    >>> import os
    >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0'

    >>> CudaDevice.count()                     # number of NVIDIA GPUs visible to CUDA applications
    4
    >>> Device.cuda.count()                    # use alias in class `Device`
    4

    >>> CudaDevice.all()                       # all CUDA visible devices (or `Device.cuda.all()`)
    [
        CudaDevice(cuda_index=0, nvml_index=3, ...),
        CudaDevice(cuda_index=1, nvml_index=2, ...),
        ...
    ]

    >>> cuda0 = CudaDevice(cuda_index=0)       # use CUDA ordinal (or `Device.cuda(0)`)
    >>> cuda1 = CudaDevice(nvml_index=2)       # use NVML ordinal
    >>> cuda2 = CudaDevice(uuid='GPU-xxxxxx')  # use UUID string

    >>> cuda0.memory_free()                    # total free memory in bytes
    11550654464
    >>> cuda0.memory_free_human()              # total free memory in human readable format
    '11016MiB'

    >>> cuda1.as_snapshot()                    # takes an onetime snapshot of the device
    CudaDeviceSnapshot(
        real=CudaDevice(cuda_index=1, nvml_index=2, ...),
        ...
    )
"""

# pylint: disable=too-many-lines

from __future__ import annotations

import contextlib
import functools
import multiprocessing as mp
import os
import re
import subprocess
import sys
import textwrap
import threading
import time
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Generator, Iterable, NamedTuple, overload

from nvitop.api import libcuda, libcudart, libnvml
from nvitop.api.process import GpuProcess
from nvitop.api.utils import (
    NA,
    UINT_MAX,
    NaType,
    Snapshot,
    boolify,
    bytes2human,
    memoize_when_activated,
)


if TYPE_CHECKING:
    from collections.abc import Hashable

    from typing_extensions import Literal  # Python 3.8+
    from typing_extensions import Self  # Python 3.11+


__all__ = [
    'Device',
    'PhysicalDevice',
    'MigDevice',
    'CudaDevice',
    'CudaMigDevice',
    'parse_cuda_visible_devices',
    'normalize_cuda_visible_devices',
]

# Class definitions ################################################################################


class MemoryInfo(NamedTuple):  # in bytes # pylint: disable=missing-class-docstring
    total: int | NaType
    free: int | NaType
    used: int | NaType


class ClockInfos(NamedTuple):  # in MHz # pylint: disable=missing-class-docstring
    graphics: int | NaType
    sm: int | NaType
    memory: int | NaType
    video: int | NaType


class ClockSpeedInfos(NamedTuple):  # pylint: disable=missing-class-docstring
    current: ClockInfos
    max: ClockInfos


class UtilizationRates(NamedTuple):  # in percentage # pylint: disable=missing-class-docstring
    gpu: int | NaType
    memory: int | NaType
    encoder: int | NaType
    decoder: int | NaType


class ThroughputInfo(NamedTuple):  # in KiB/s # pylint: disable=missing-class-docstring
    tx: int | NaType
    rx: int | NaType

    @property
    def transmit(self) -> int | NaType:
        """Alias of :attr:`tx`."""
        return self.tx

    @property
    def receive(self) -> int | NaType:
        """Alias of :attr:`rx`."""
        return self.rx


# pylint: disable-next=missing-class-docstring,too-few-public-methods
class ValueOmitted:
    def __repr__(self) -> str:
        return '<VALUE OMITTED>'


_VALUE_OMITTED: str = ValueOmitted()  # type: ignore[assignment]
del ValueOmitted


[docs]class Device: # pylint: disable=too-many-instance-attributes,too-many-public-methods """Live class of the GPU devices, different from the device snapshots. :meth:`Device.__new__()` returns different types depending on the given arguments. .. code-block:: python - (index: int) -> PhysicalDevice - (index: (int, int)) -> MigDevice - (uuid: str) -> Union[PhysicalDevice, MigDevice] # depending on the UUID value - (bus_id: str) -> PhysicalDevice Examples: >>> Device.driver_version() # version of the installed NVIDIA display driver '470.129.06' >>> Device.count() # number of NVIDIA GPUs in the system 10 >>> Device.all() # all physical devices in the system [ PhysicalDevice(index=0, ...), PhysicalDevice(index=1, ...), ... ] >>> nvidia0 = Device(index=0) # -> PhysicalDevice >>> mig10 = Device(index=(1, 0)) # -> MigDevice >>> nvidia2 = Device(uuid='GPU-xxxxxx') # -> PhysicalDevice >>> mig30 = Device(uuid='MIG-xxxxxx') # -> MigDevice >>> nvidia0.memory_free() # total free memory in bytes 11550654464 >>> nvidia0.memory_free_human() # total free memory in human readable format '11016MiB' >>> nvidia2.as_snapshot() # takes an onetime snapshot of the device PhysicalDeviceSnapshot( real=PhysicalDevice(index=2, ...), ... ) Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the device index is out of range. TypeError: If the number of non-None arguments is not exactly 1. TypeError: If the given index is a tuple but is not consist of two integers. """ # https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars # https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices # GPU UUID : `GPU-<GPU-UUID>` # MIG UUID : `MIG-GPU-<GPU-UUID>/<GPU instance ID>/<compute instance ID>` # MIG UUID (R470+): `MIG-<MIG-UUID>` UUID_PATTERN: re.Pattern = re.compile( r"""^ # full match (?:(?P<MigMode>MIG)-)? # prefix for MIG UUID (?:(?P<GpuUuid>GPU)-)? # prefix for GPU UUID (?(MigMode)|(?(GpuUuid)|GPU-)) # always have a prefix (?P<UUID>[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12}) # UUID for the GPU/MIG device in lower case # Suffix for MIG device while using GPU UUID with GPU instance (GI) ID and compute instance (CI) ID (?(MigMode) # match only when the MIG prefix matches (?(GpuUuid) # match only when provide with GPU UUID /(?P<GpuInstanceId>\d+) # GI ID of the MIG device /(?P<ComputeInstanceId>\d+) # CI ID of the MIG device |) |) $""", # full match flags=re.VERBOSE, ) GPU_PROCESS_CLASS: type[GpuProcess] = GpuProcess cuda: type[CudaDevice] = None # type: ignore[assignment] # defined in below """Shortcut for class :class:`CudaDevice`.""" _nvml_index: int | tuple[int, int]
[docs] @classmethod def is_available(cls) -> bool: """Test whether there are any devices and the NVML library is successfully loaded.""" try: return cls.count() > 0 except libnvml.NVMLError: return False
[docs] @staticmethod def driver_version() -> str | NaType: """The version of the installed NVIDIA display driver. This is an alphanumeric string. Command line equivalent: .. code:: bash nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=driver_version Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ return libnvml.nvmlQuery('nvmlSystemGetDriverVersion')
[docs] @staticmethod def cuda_driver_version() -> str | NaType: """The maximum CUDA version supported by the NVIDIA display driver. This is an alphanumeric string. This can be different from the version of the CUDA Runtime. See also :meth:`cuda_runtime_version`. Returns: Union[str, NaType] The maximum CUDA version supported by the NVIDIA display driver. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ cuda_driver_version = libnvml.nvmlQuery('nvmlSystemGetCudaDriverVersion') if libnvml.nvmlCheckReturn(cuda_driver_version, int): major = cuda_driver_version // 1000 minor = (cuda_driver_version % 1000) // 10 revision = cuda_driver_version % 10 if revision == 0: return f'{major}.{minor}' return f'{major}.{minor}.{revision}' return NA
max_cuda_version = cuda_driver_version
[docs] @staticmethod def cuda_runtime_version() -> str | NaType: """The CUDA Runtime version. This is an alphanumeric string. This can be different from the CUDA driver version. See also :meth:`cuda_driver_version`. Returns: Union[str, NaType] The CUDA Runtime version, or :const:`nvitop.NA` when no CUDA Runtime is available or no CUDA-capable devices are present. """ try: return libcudart.cudaRuntimeGetVersion() except libcudart.cudaError: return NA
cudart_version = cuda_runtime_version
[docs] @classmethod def count(cls) -> int: """The number of NVIDIA GPUs in the system. Command line equivalent: .. code:: bash nvidia-smi --id=0 --format=csv,noheader,nounits --query-gpu=count Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. """ return libnvml.nvmlQuery('nvmlDeviceGetCount', default=0)
[docs] @classmethod def all(cls) -> list[PhysicalDevice]: """Return a list of all physical devices in the system.""" return cls.from_indices() # type: ignore[return-value]
[docs] @classmethod def from_indices( cls, indices: int | Iterable[int | tuple[int, int]] | None = None, ) -> list[PhysicalDevice | MigDevice]: """Return a list of devices of the given indices. Args: indices (Iterable[Union[int, Tuple[int, int]]]): Indices of the devices. For each index, get :class:`PhysicalDevice` for single int and :class:`MigDevice` for tuple (int, int). That is: - (int) -> PhysicalDevice - ((int, int)) -> MigDevice Returns: List[Union[PhysicalDevice, MigDevice]] A list of :class:`PhysicalDevice` and/or :class:`MigDevice` instances of the given indices. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the device index is out of range. """ if indices is None: try: indices = range(cls.count()) except libnvml.NVMLError: return [] if isinstance(indices, int): indices = [indices] return list(map(cls, indices)) # type: ignore[arg-type]
[docs] @staticmethod def from_cuda_visible_devices() -> list[CudaDevice]: """Return a list of all CUDA visible devices. The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Returns: List[CudaDevice] A list of :class:`CudaDevice` instances. """ # pylint: disable=line-too-long visible_device_indices = Device.parse_cuda_visible_devices() device_index: int | tuple[int, int] cuda_devices: list[CudaDevice] = [] for cuda_index, device_index in enumerate(visible_device_indices): # type: ignore[assignment] cuda_devices.append(CudaDevice(cuda_index, nvml_index=device_index)) return cuda_devices
[docs] @staticmethod def from_cuda_indices( cuda_indices: int | Iterable[int] | None = None, ) -> list[CudaDevice]: """Return a list of CUDA devices of the given CUDA indices. The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Args: cuda_indices (Iterable[int]): The indices of the GPU in CUDA ordinal, if not given, returns all visible CUDA devices. Returns: List[CudaDevice] A list of :class:`CudaDevice` of the given CUDA indices. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ # pylint: disable=line-too-long cuda_devices = Device.from_cuda_visible_devices() if cuda_indices is None: return cuda_devices if isinstance(cuda_indices, int): cuda_indices = [cuda_indices] cuda_indices = list(cuda_indices) cuda_device_count = len(cuda_devices) devices = [] for cuda_index in cuda_indices: if not 0 <= cuda_index < cuda_device_count: raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.') device = cuda_devices[cuda_index] devices.append(device) return devices
[docs] @staticmethod def parse_cuda_visible_devices( cuda_visible_devices: str | None = _VALUE_OMITTED, ) -> list[int] | list[tuple[int, int]]: """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. This is a alias of :func:`parse_cuda_visible_devices`. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: Union[List[int], List[Tuple[int, int]]] A list of int (physical device) or a list of tuple of two integers (MIG device) for the corresponding real device indices. """ # pylint: disable=line-too-long return parse_cuda_visible_devices(cuda_visible_devices)
[docs] @staticmethod def normalize_cuda_visible_devices(cuda_visible_devices: str | None = _VALUE_OMITTED) -> str: """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. This is an alias of :func:`normalize_cuda_visible_devices`. Note: The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: str The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable. """ # pylint: disable=line-too-long return normalize_cuda_visible_devices(cuda_visible_devices)
[docs] def __new__( cls, index: int | tuple[int, int] | str | None = None, *, uuid: str | None = None, bus_id: str | None = None, ) -> Self: """Create a new instance of Device. The type of the result is determined by the given argument. .. code-block:: python - (index: int) -> PhysicalDevice - (index: (int, int)) -> MigDevice - (uuid: str) -> Union[PhysicalDevice, MigDevice] # depending on the UUID value - (bus_id: str) -> PhysicalDevice Note: This method takes exact 1 non-None argument. Returns: Union[PhysicalDevice, MigDevice] A :class:`PhysicalDevice` instance or a :class:`MigDevice` instance. Raises: TypeError: If the number of non-None arguments is not exactly 1. TypeError: If the given index is a tuple but is not consist of two integers. """ if (index, uuid, bus_id).count(None) != 2: raise TypeError( f'Device(index=None, uuid=None, bus_id=None) takes 1 non-None arguments ' f'but (index, uuid, bus_id) = {(index, uuid, bus_id)!r} were given', ) if cls is not Device: # Use the subclass type if the type is explicitly specified return super().__new__(cls) # Auto subclass type inference logic goes here when `cls` is `Device` (e.g., calls `Device(...)`) match: re.Match | None = None if isinstance(index, str): match = cls.UUID_PATTERN.match(index) if match is not None: # passed by UUID index, uuid = None, index elif isinstance(uuid, str): match = cls.UUID_PATTERN.match(uuid) if index is not None: if not isinstance(index, int): if not isinstance(index, tuple): raise TypeError( f'index must be an integer, or a tuple of two integers, or a valid UUID string, ' f'but index = {index!r} was given', ) if not ( len(index) == 2 and isinstance(index[0], int) and isinstance(index[1], int) ): raise TypeError( f'index for MIG device must be a tuple of two integers ' f'but index = {index!r} was given', ) return super().__new__(MigDevice) # type: ignore[return-value] elif uuid is not None and match is not None and match.group('MigMode') is not None: return super().__new__(MigDevice) # type: ignore[return-value] return super().__new__(PhysicalDevice) # type: ignore[return-value]
[docs] def __init__( self, index: int | str | None = None, *, uuid: str | None = None, bus_id: str | None = None, ) -> None: """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the device index is out of range. """ if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None: # passed by UUID index, uuid = None, index index, uuid, bus_id = ( arg.encode() if isinstance(arg, str) else arg for arg in (index, uuid, bus_id) ) self._name: str = NA self._uuid: str = NA self._bus_id: str = NA self._memory_total: int | NaType = NA self._memory_total_human: str = NA self._nvlink_link_count: int | None = None self._nvlink_throughput_counters: tuple[tuple[int | NaType, int]] | None = None self._is_mig_device: bool | None = None self._cuda_index: int | None = None self._cuda_compute_capability: tuple[int, int] | NaType | None = None if index is not None: self._nvml_index = index # type: ignore[assignment] try: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetHandleByIndex', index, ignore_errors=False, ) except libnvml.NVMLError_GpuIsLost: self._handle = None self._name = 'ERROR: GPU is Lost' except libnvml.NVMLError_Unknown: self._handle = None self._name = 'ERROR: Unknown' else: try: if uuid is not None: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False, ) else: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetHandleByPciBusId', bus_id, ignore_errors=False, ) except libnvml.NVMLError_GpuIsLost: self._handle = None self._nvml_index = NA # type: ignore[assignment] self._name = 'ERROR: GPU is Lost' except libnvml.NVMLError_Unknown: self._handle = None self._nvml_index = NA # type: ignore[assignment] self._name = 'ERROR: Unknown' else: self._nvml_index = libnvml.nvmlQuery('nvmlDeviceGetIndex', self._handle) self._max_clock_infos: ClockInfos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA) self._lock: threading.RLock = threading.RLock() self._ident: tuple[Hashable, str] = (self.index, self.uuid()) self._hash: int | None = None
[docs] def __repr__(self) -> str: """Return a string representation of the device.""" return '{}(index={}, name={!r}, total_memory={})'.format( # noqa: UP032 self.__class__.__name__, self.index, self.name(), self.memory_total_human(), )
[docs] def __eq__(self, other: object) -> bool: """Test equality to other object.""" if not isinstance(other, Device): return NotImplemented return self._ident == other._ident
[docs] def __hash__(self) -> int: """Return a hash value of the device.""" if self._hash is None: self._hash = hash(self._ident) return self._hash
[docs] def __getattr__(self, name: str) -> Any | Callable[..., Any]: """Get the object attribute. If the attribute is not defined, make a method from ``pynvml.nvmlDeviceGet<AttributeName>(handle)``. The attribute name will be converted to PascalCase string. Raises: AttributeError: If the attribute is not defined in ``pynvml.py``. Examples: >>> device = Device(0) >>> # Method `cuda_compute_capability` is not implemented in the class definition >>> PhysicalDevice.cuda_compute_capability AttributeError: type object 'Device' has no attribute 'cuda_compute_capability' >>> # Dynamically create a new method from `pynvml.nvmlDeviceGetCudaComputeCapability(device.handle, *args, **kwargs)` >>> device.cuda_compute_capability <function PhysicalDevice.cuda_compute_capability at 0x7fbfddf5d9d0> >>> device.cuda_compute_capability() (8, 6) """ # pylint: disable=line-too-long try: return super().__getattr__(name) # type: ignore[misc] except AttributeError: if name == '_cache': raise if self._handle is None: return lambda: NA match = libnvml.VERSIONED_PATTERN.match(name) if match is not None: name = match.group('name') suffix = match.group('suffix') else: suffix = '' try: pascal_case = name.title().replace('_', '') func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix) except AttributeError: pascal_case = ''.join( part[:1].upper() + part[1:] for part in filter(None, name.split('_')) ) func = getattr(libnvml, 'nvmlDeviceGet' + pascal_case + suffix) def attribute(*args: Any, **kwargs: Any) -> Any: try: return libnvml.nvmlQuery( func, self._handle, *args, **kwargs, ignore_errors=False, ) except libnvml.NVMLError_NotSupported: return NA attribute.__name__ = name attribute.__qualname__ = f'{self.__class__.__name__}.{name}' setattr(self, name, attribute) return attribute
[docs] def __reduce__(self) -> tuple[type[Device], tuple[int | tuple[int, int]]]: """Return state information for pickling.""" return self.__class__, (self._nvml_index,)
@property def index(self) -> int | tuple[int, int]: """The NVML index of the device. Returns: Union[int, Tuple[int, int]] Returns an int for physical device and tuple of two integers for MIG device. """ return self._nvml_index @property def nvml_index(self) -> int | tuple[int, int]: """The NVML index of the device. Returns: Union[int, Tuple[int, int]] Returns an int for physical device and tuple of two integers for MIG device. """ return self._nvml_index @property def physical_index(self) -> int: """The index of the physical device. Returns: int An int for the physical device index. For MIG devices, returns the index of the parent physical device. """ return self._nvml_index # type: ignore[return-value] # will be overridden in MigDevice @property def handle(self) -> libnvml.c_nvmlDevice_t: """The NVML device handle.""" return self._handle @property def cuda_index(self) -> int: """The CUDA device index. The value will be evaluated on the first call. Raises: RuntimeError: If the current device is not visible to CUDA applications (i.e. not listed in the ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid). """ if self._cuda_index is None: visible_device_indices = self.parse_cuda_visible_devices() try: self._cuda_index = visible_device_indices.index(self.index) # type: ignore[arg-type] except ValueError as ex: raise RuntimeError( f'CUDA Error: Device(index={self.index}) is not visible to CUDA applications', ) from ex return self._cuda_index
[docs] def name(self) -> str | NaType: """The official product name of the GPU. This is an alphanumeric string. For all products. Returns: Union[str, NaType] The official product name, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=name """ if self._name is NA: self._name = libnvml.nvmlQuery('nvmlDeviceGetName', self.handle) return self._name
[docs] def uuid(self) -> str | NaType: """This value is the globally unique immutable alphanumeric identifier of the GPU. It does not correspond to any physical label on the board. Returns: Union[str, NaType] The UUID of the device, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=name """ if self._uuid is NA: self._uuid = libnvml.nvmlQuery('nvmlDeviceGetUUID', self.handle) return self._uuid
[docs] def bus_id(self) -> str | NaType: """PCI bus ID as "domain:bus:device.function", in hex. Returns: Union[str, NaType] The PCI bus ID of the device, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pci.bus_id """ if self._bus_id is NA: self._bus_id = libnvml.nvmlQuery( lambda handle: libnvml.nvmlDeviceGetPciInfo(handle).busId, self.handle, ) return self._bus_id
[docs] def serial(self) -> str | NaType: """This number matches the serial number physically printed on each board. It is a globally unique immutable alphanumeric value. Returns: Union[str, NaType] The serial number of the device, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=serial """ return libnvml.nvmlQuery('nvmlDeviceGetSerial', self.handle)
[docs] @memoize_when_activated def memory_info(self) -> MemoryInfo: # in bytes """Return a named tuple with memory information (in bytes) for the device. Returns: MemoryInfo(total, free, used) A named tuple with memory information, the item could be :const:`nvitop.NA` when not applicable. """ memory_info = libnvml.nvmlQuery('nvmlDeviceGetMemoryInfo', self.handle) if libnvml.nvmlCheckReturn(memory_info): return MemoryInfo(total=memory_info.total, free=memory_info.free, used=memory_info.used) return MemoryInfo(total=NA, free=NA, used=NA)
[docs] def memory_total(self) -> int | NaType: # in bytes """Total installed GPU memory in bytes. Returns: Union[int, NaType] Total installed GPU memory in bytes, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=memory.total """ if self._memory_total is NA: self._memory_total = self.memory_info().total return self._memory_total
[docs] def memory_used(self) -> int | NaType: # in bytes """Total memory allocated by active contexts in bytes. Returns: Union[int, NaType] Total memory allocated by active contexts in bytes, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=memory.used """ return self.memory_info().used
[docs] def memory_free(self) -> int | NaType: # in bytes """Total free memory in bytes. Returns: Union[int, NaType] Total free memory in bytes, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=memory.free """ return self.memory_info().free
[docs] def memory_total_human(self) -> str | NaType: # in human readable """Total installed GPU memory in human readable format. Returns: Union[str, NaType] Total installed GPU memory in human readable format, or :const:`nvitop.NA` when not applicable. """ if self._memory_total_human is NA: self._memory_total_human = bytes2human(self.memory_total()) return self._memory_total_human
[docs] def memory_used_human(self) -> str | NaType: # in human readable """Total memory allocated by active contexts in human readable format. Returns: Union[int, NaType] Total memory allocated by active contexts in human readable format, or :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long return bytes2human(self.memory_used())
[docs] def memory_free_human(self) -> str | NaType: # in human readable """Total free memory in human readable format. Returns: Union[int, NaType] Total free memory in human readable format, or :const:`nvitop.NA` when not applicable. """ return bytes2human(self.memory_free())
[docs] def memory_percent(self) -> float | NaType: # in percentage """The percentage of used memory over total memory (``0 <= p <= 100``). Returns: Union[float, NaType] The percentage of used memory over total memory, or :const:`nvitop.NA` when not applicable. """ memory_info = self.memory_info() if libnvml.nvmlCheckReturn(memory_info.used, int) and libnvml.nvmlCheckReturn( memory_info.total, int, ): return round(100.0 * memory_info.used / memory_info.total, 1) return NA
[docs] def memory_usage(self) -> str: # string of used memory over total memory (in human readable) """The used memory over total memory in human readable format. Returns: str The used memory over total memory in human readable format, or :const:`'N/A / N/A'` when not applicable. """ # pylint: disable=line-too-long return f'{self.memory_used_human()} / {self.memory_total_human()}'
[docs] @memoize_when_activated def bar1_memory_info(self) -> MemoryInfo: # in bytes """Return a named tuple with BAR1 memory information (in bytes) for the device. Returns: MemoryInfo(total, free, used) A named tuple with BAR1 memory information, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long memory_info = libnvml.nvmlQuery('nvmlDeviceGetBAR1MemoryInfo', self.handle) if libnvml.nvmlCheckReturn(memory_info): return MemoryInfo( total=memory_info.bar1Total, free=memory_info.bar1Free, used=memory_info.bar1Used, ) return MemoryInfo(total=NA, free=NA, used=NA)
[docs] def bar1_memory_total(self) -> int | NaType: # in bytes """Total BAR1 memory in bytes. Returns: Union[int, NaType] Total BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable. """ return self.bar1_memory_info().total
[docs] def bar1_memory_used(self) -> int | NaType: # in bytes """Total used BAR1 memory in bytes. Returns: Union[int, NaType] Total used BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable. """ return self.bar1_memory_info().used
[docs] def bar1_memory_free(self) -> int | NaType: # in bytes """Total free BAR1 memory in bytes. Returns: Union[int, NaType] Total free BAR1 memory in bytes, or :const:`nvitop.NA` when not applicable. """ return self.bar1_memory_info().free
[docs] def bar1_memory_total_human(self) -> str | NaType: # in human readable """Total BAR1 memory in human readable format. Returns: Union[int, NaType] Total BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable. """ return bytes2human(self.bar1_memory_total())
[docs] def bar1_memory_used_human(self) -> str | NaType: # in human readable """Total used BAR1 memory in human readable format. Returns: Union[int, NaType] Total used BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable. """ return bytes2human(self.bar1_memory_used())
[docs] def bar1_memory_free_human(self) -> str | NaType: # in human readable """Total free BAR1 memory in human readable format. Returns: Union[int, NaType] Total free BAR1 memory in human readable format, or :const:`nvitop.NA` when not applicable. """ return bytes2human(self.bar1_memory_free())
[docs] def bar1_memory_percent(self) -> float | NaType: # in percentage """The percentage of used BAR1 memory over total BAR1 memory (0 <= p <= 100). Returns: Union[float, NaType] The percentage of used BAR1 memory over total BAR1 memory, or :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long memory_info = self.bar1_memory_info() if libnvml.nvmlCheckReturn(memory_info.used, int) and libnvml.nvmlCheckReturn( memory_info.total, int, ): return round(100.0 * memory_info.used / memory_info.total, 1) return NA
[docs] def bar1_memory_usage(self) -> str: # in human readable """The used BAR1 memory over total BAR1 memory in human readable format. Returns: str The used BAR1 memory over total BAR1 memory in human readable format, or :const:`'N/A / N/A'` when not applicable. """ # pylint: disable=line-too-long return f'{self.bar1_memory_used_human()} / {self.bar1_memory_total_human()}'
[docs] @memoize_when_activated def utilization_rates(self) -> UtilizationRates: # in percentage """Return a named tuple with GPU utilization rates (in percentage) for the device. Returns: UtilizationRates(gpu, memory, encoder, decoder) A named tuple with GPU utilization rates (in percentage) for the device, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long gpu, memory, encoder, decoder = NA, NA, NA, NA utilization_rates = libnvml.nvmlQuery('nvmlDeviceGetUtilizationRates', self.handle) if libnvml.nvmlCheckReturn(utilization_rates): gpu, memory = utilization_rates.gpu, utilization_rates.memory encoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetEncoderUtilization', self.handle) if libnvml.nvmlCheckReturn(encoder_utilization, list) and len(encoder_utilization) > 0: encoder = encoder_utilization[0] decoder_utilization = libnvml.nvmlQuery('nvmlDeviceGetDecoderUtilization', self.handle) if libnvml.nvmlCheckReturn(decoder_utilization, list) and len(decoder_utilization) > 0: decoder = decoder_utilization[0] return UtilizationRates(gpu=gpu, memory=memory, encoder=encoder, decoder=decoder)
[docs] def gpu_utilization(self) -> int | NaType: # in percentage """Percent of time over the past sample period during which one or more kernels was executing on the GPU. The sample period may be between 1 second and 1/6 second depending on the product. Returns: Union[int, NaType] The GPU utilization rate in percentage, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=utilization.gpu """ return self.utilization_rates().gpu
gpu_percent = gpu_utilization # in percentage
[docs] def memory_utilization(self) -> int | NaType: # in percentage """Percent of time over the past sample period during which global (device) memory was being read or written. The sample period may be between 1 second and 1/6 second depending on the product. Returns: Union[int, NaType] The memory bandwidth utilization rate of the GPU in percentage, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=utilization.memory """ # pylint: disable=line-too-long return self.utilization_rates().memory
[docs] def encoder_utilization(self) -> int | NaType: # in percentage """The encoder utilization rate in percentage. Returns: Union[int, NaType] The encoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable. """ return self.utilization_rates().encoder
[docs] def decoder_utilization(self) -> int | NaType: # in percentage """The decoder utilization rate in percentage. Returns: Union[int, NaType] The decoder utilization rate in percentage, or :const:`nvitop.NA` when not applicable. """ return self.utilization_rates().decoder
[docs] @memoize_when_activated def clock_infos(self) -> ClockInfos: # in MHz """Return a named tuple with current clock speeds (in MHz) for the device. Returns: ClockInfos(graphics, sm, memory, video) A named tuple with current clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long return ClockInfos( graphics=libnvml.nvmlQuery( 'nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_GRAPHICS, ), sm=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_SM), memory=libnvml.nvmlQuery('nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_MEM), video=libnvml.nvmlQuery( 'nvmlDeviceGetClockInfo', self.handle, libnvml.NVML_CLOCK_VIDEO, ), )
clocks = clock_infos
[docs] @memoize_when_activated def max_clock_infos(self) -> ClockInfos: # in MHz """Return a named tuple with maximum clock speeds (in MHz) for the device. Returns: ClockInfos(graphics, sm, memory, video) A named tuple with maximum clock speeds (in MHz) for the device, the item could be :const:`nvitop.NA` when not applicable. """ # pylint: disable=line-too-long clock_infos = self._max_clock_infos._asdict() for name, clock in clock_infos.items(): if clock is NA: clock_type = getattr( libnvml, 'NVML_CLOCK_{}'.format(name.replace('memory', 'mem').upper()), ) clock = libnvml.nvmlQuery('nvmlDeviceGetMaxClockInfo', self.handle, clock_type) clock_infos[name] = clock self._max_clock_infos = ClockInfos(**clock_infos) return self._max_clock_infos
max_clocks = max_clock_infos
[docs] def clock_speed_infos(self) -> ClockSpeedInfos: # in MHz """Return a named tuple with the current and the maximum clock speeds (in MHz) for the device. Returns: ClockSpeedInfos(current, max) A named tuple with the current and the maximum clock speeds (in MHz) for the device. """ return ClockSpeedInfos(current=self.clock_infos(), max=self.max_clock_infos())
[docs] def graphics_clock(self) -> int | NaType: # in MHz """Current frequency of graphics (shader) clock in MHz. Returns: Union[int, NaType] The current frequency of graphics (shader) clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.graphics """ # pylint: disable=line-too-long return self.clock_infos().graphics
[docs] def sm_clock(self) -> int | NaType: # in MHz """Current frequency of SM (Streaming Multiprocessor) clock in MHz. Returns: Union[int, NaType] The current frequency of SM (Streaming Multiprocessor) clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.sm """ # pylint: disable=line-too-long return self.clock_infos().sm
[docs] def memory_clock(self) -> int | NaType: # in MHz """Current frequency of memory clock in MHz. Returns: Union[int, NaType] The current frequency of memory clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.memory """ return self.clock_infos().memory
[docs] def video_clock(self) -> int | NaType: # in MHz """Current frequency of video encoder/decoder clock in MHz. Returns: Union[int, NaType] The current frequency of video encoder/decoder clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.current.video """ # pylint: disable=line-too-long return self.clock_infos().video
[docs] def max_graphics_clock(self) -> int | NaType: # in MHz """Maximum frequency of graphics (shader) clock in MHz. Returns: Union[int, NaType] The maximum frequency of graphics (shader) clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.graphics """ # pylint: disable=line-too-long return self.max_clock_infos().graphics
[docs] def max_sm_clock(self) -> int | NaType: # in MHz """Maximum frequency of SM (Streaming Multiprocessor) clock in MHz. Returns: Union[int, NaType] The maximum frequency of SM (Streaming Multiprocessor) clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.sm """ # pylint: disable=line-too-long return self.max_clock_infos().sm
[docs] def max_memory_clock(self) -> int | NaType: # in MHz """Maximum frequency of memory clock in MHz. Returns: Union[int, NaType] The maximum frequency of memory clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.memory """ return self.max_clock_infos().memory
[docs] def max_video_clock(self) -> int | NaType: # in MHz """Maximum frequency of video encoder/decoder clock in MHz. Returns: Union[int, NaType] The maximum frequency of video encoder/decoder clock in MHz, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=clocks.max.video """ # pylint: disable=line-too-long return self.max_clock_infos().video
[docs] def fan_speed(self) -> int | NaType: # in percentage """The fan speed value is the percent of the product's maximum noise tolerance fan speed that the device's fan is currently intended to run at. This value may exceed 100% in certain cases. Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, this output will not match the actual fan speed. Many parts do not report fan speeds because they rely on cooling via fans in the surrounding enclosure. Returns: Union[int, NaType] The fan speed value in percentage, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=fan.speed """ # pylint: disable=line-too-long return libnvml.nvmlQuery('nvmlDeviceGetFanSpeed', self.handle)
[docs] def temperature(self) -> int | NaType: # in Celsius """Core GPU temperature in degrees C. Returns: Union[int, NaType] The core GPU temperature in Celsius degrees, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=temperature.gpu """ return libnvml.nvmlQuery( 'nvmlDeviceGetTemperature', self.handle, libnvml.NVML_TEMPERATURE_GPU, )
[docs] @memoize_when_activated def power_usage(self) -> int | NaType: # in milliwatts (mW) """The last measured power draw for the entire board in milliwatts. Returns: Union[int, NaType] The power draw for the entire board in milliwatts, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash $(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.draw)" * 1000 )) """ return libnvml.nvmlQuery('nvmlDeviceGetPowerUsage', self.handle)
power_draw = power_usage # in milliwatts (mW)
[docs] @memoize_when_activated def power_limit(self) -> int | NaType: # in milliwatts (mW) """The software power limit in milliwatts. Set by software like nvidia-smi. Returns: Union[int, NaType] The software power limit in milliwatts, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash $(( "$(nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=power.limit)" * 1000 )) """ return libnvml.nvmlQuery('nvmlDeviceGetPowerManagementLimit', self.handle)
[docs] def power_status(self) -> str: # string of power usage over power limit in watts (W) """The string of power usage over power limit in watts. Returns: str The string of power usage over power limit in watts, or :const:`'N/A / N/A'` when not applicable. """ # pylint: disable=line-too-long power_usage = self.power_usage() power_limit = self.power_limit() if libnvml.nvmlCheckReturn(power_usage, int): power_usage = f'{round(power_usage / 1000)}W' # type: ignore[assignment] if libnvml.nvmlCheckReturn(power_limit, int): power_limit = f'{round(power_limit / 1000)}W' # type: ignore[assignment] return f'{power_usage} / {power_limit}'
[docs] def pcie_throughput(self) -> ThroughputInfo: # in KiB/s """The current PCIe throughput in KiB/s. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: ThroughputInfo(tx, rx) A named tuple with current PCIe throughput in KiB/s, the item could be :const:`nvitop.NA` when not applicable. """ return ThroughputInfo(tx=self.pcie_tx_throughput(), rx=self.pcie_rx_throughput())
[docs] @memoize_when_activated def pcie_tx_throughput(self) -> int | NaType: # in KiB/s """The current PCIe transmit throughput in KiB/s. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[int, NaType] The current PCIe transmit throughput in KiB/s, or :const:`nvitop.NA` when not applicable. """ return libnvml.nvmlQuery( 'nvmlDeviceGetPcieThroughput', self.handle, libnvml.NVML_PCIE_UTIL_RX_BYTES, )
[docs] @memoize_when_activated def pcie_rx_throughput(self) -> int | NaType: # in KiB/s """The current PCIe receive throughput in KiB/s. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[int, NaType] The current PCIe receive throughput in KiB/s, or :const:`nvitop.NA` when not applicable. """ return libnvml.nvmlQuery( 'nvmlDeviceGetPcieThroughput', self.handle, libnvml.NVML_PCIE_UTIL_RX_BYTES, )
[docs] def pcie_tx_throughput_human(self) -> str | NaType: # in human readable """The current PCIe transmit throughput in human readable format. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[str, NaType] The current PCIe transmit throughput in human readable format, or :const:`nvitop.NA` when not applicable. """ tx = self.pcie_tx_throughput() if libnvml.nvmlCheckReturn(tx, int): return f'{bytes2human(tx * 1024)}/s' return NA
[docs] def pcie_rx_throughput_human(self) -> str | NaType: # in human readable """The current PCIe receive throughput in human readable format. This function is querying a byte counter over a 20ms interval and thus is the PCIe throughput over that interval. Returns: Union[str, NaType] The current PCIe receive throughput in human readable format, or :const:`nvitop.NA` when not applicable. """ rx = self.pcie_rx_throughput() if libnvml.nvmlCheckReturn(rx, int): return f'{bytes2human(rx * 1024)}/s' return NA
[docs] def display_active(self) -> str | NaType: """A flag that indicates whether a display is initialized on the GPU's (e.g. memory is allocated on the device for display). Display can be active even when no monitor is physically attached. "Enabled" indicates an active display. "Disabled" indicates otherwise. Returns: Union[str, NaType] - :const:`'Disabled'`: if not an active display device. - :const:`'Enabled'`: if an active display device. - :const:`nvitop.NA`: if not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_active """ # pylint: disable=line-too-long return {0: 'Disabled', 1: 'Enabled'}.get( libnvml.nvmlQuery('nvmlDeviceGetDisplayActive', self.handle), NA, )
[docs] def display_mode(self) -> str | NaType: """A flag that indicates whether a physical display (e.g. monitor) is currently connected to any of the GPU's connectors. "Enabled" indicates an attached display. "Disabled" indicates otherwise. Returns: Union[str, NaType] - :const:`'Disabled'`: if the display mode is disabled. - :const:`'Enabled'`: if the display mode is enabled. - :const:`nvitop.NA`: if not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=display_mode """ # pylint: disable=line-too-long return {0: 'Disabled', 1: 'Enabled'}.get( libnvml.nvmlQuery('nvmlDeviceGetDisplayMode', self.handle), NA, )
[docs] def current_driver_model(self) -> str | NaType: """The driver model currently in use. Always "N/A" on Linux. On Windows, the TCC (WDM) and WDDM driver models are supported. The TCC driver model is optimized for compute applications. I.E. kernel launch times will be quicker with TCC. The WDDM driver model is designed for graphics applications and is not recommended for compute applications. Linux does not support multiple driver models, and will always have the value of "N/A". Returns: Union[str, NaType] - :const:`'WDDM'`: for WDDM driver model on Windows. - :const:`'WDM'`: for TTC (WDM) driver model on Windows. - :const:`nvitop.NA`: if not applicable, e.g. on Linux. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=driver_model.current """ return {libnvml.NVML_DRIVER_WDDM: 'WDDM', libnvml.NVML_DRIVER_WDM: 'WDM'}.get( libnvml.nvmlQuery('nvmlDeviceGetCurrentDriverModel', self.handle), NA, )
driver_model = current_driver_model
[docs] def persistence_mode(self) -> str | NaType: """A flag that indicates whether persistence mode is enabled for the GPU. Value is either "Enabled" or "Disabled". When persistence mode is enabled the NVIDIA driver remains loaded even when no active clients, such as X11 or nvidia-smi, exist. This minimizes the driver load latency associated with running dependent apps, such as CUDA programs. Linux only. Returns: Union[str, NaType] - :const:`'Disabled'`: if the persistence mode is disabled. - :const:`'Enabled'`: if the persistence mode is enabled. - :const:`nvitop.NA`: if not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=persistence_mode """ # pylint: disable=line-too-long return {0: 'Disabled', 1: 'Enabled'}.get( libnvml.nvmlQuery('nvmlDeviceGetPersistenceMode', self.handle), NA, )
[docs] def performance_state(self) -> str | NaType: """The current performance state for the GPU. States range from P0 (maximum performance) to P12 (minimum performance). Returns: Union[str, NaType] The current performance state in format ``P<int>``, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=pstate """ # pylint: disable=line-too-long performance_state = libnvml.nvmlQuery('nvmlDeviceGetPerformanceState', self.handle) if libnvml.nvmlCheckReturn(performance_state, int): performance_state = 'P' + str(performance_state) return performance_state
[docs] def total_volatile_uncorrected_ecc_errors(self) -> int | NaType: """Total errors detected across entire chip. Returns: Union[int, NaType] The total number of uncorrected errors in volatile ECC memory, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=ecc.errors.uncorrected.volatile.total """ # pylint: disable=line-too-long return libnvml.nvmlQuery( 'nvmlDeviceGetTotalEccErrors', self.handle, libnvml.NVML_MEMORY_ERROR_TYPE_UNCORRECTED, libnvml.NVML_VOLATILE_ECC, )
[docs] def compute_mode(self) -> str | NaType: """The compute mode flag indicates whether individual or multiple compute applications may run on the GPU. Returns: Union[str, NaType] - :const:`'Default'`: means multiple contexts are allowed per device. - :const:`'Exclusive Thread'`: deprecated, use Exclusive Process instead - :const:`'Prohibited'`: means no contexts are allowed per device (no compute apps). - :const:`'Exclusive Process'`: means only one context is allowed per device, usable from multiple threads at a time. - :const:`nvitop.NA`: if not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_mode """ # pylint: disable=line-too-long return { libnvml.NVML_COMPUTEMODE_DEFAULT: 'Default', libnvml.NVML_COMPUTEMODE_EXCLUSIVE_THREAD: 'Exclusive Thread', libnvml.NVML_COMPUTEMODE_PROHIBITED: 'Prohibited', libnvml.NVML_COMPUTEMODE_EXCLUSIVE_PROCESS: 'Exclusive Process', }.get(libnvml.nvmlQuery('nvmlDeviceGetComputeMode', self.handle), NA)
[docs] def cuda_compute_capability(self) -> tuple[int, int] | NaType: """The CUDA compute capability for the device. Returns: Union[Tuple[int, int], NaType] The CUDA compute capability version in format ``(major, minor)``, or :const:`nvitop.NA` when not applicable. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=compute_cap """ if self._cuda_compute_capability is None: self._cuda_compute_capability = libnvml.nvmlQuery( 'nvmlDeviceGetCudaComputeCapability', self.handle, ) return self._cuda_compute_capability
[docs] def is_mig_device(self) -> bool: """Return whether or not the device is a MIG device.""" if self._is_mig_device is None: is_mig_device = libnvml.nvmlQuery( 'nvmlDeviceIsMigDeviceHandle', self.handle, default=False, ignore_function_not_found=True, ) self._is_mig_device = bool(is_mig_device) # nvmlDeviceIsMigDeviceHandle returns c_uint return self._is_mig_device
[docs] def mig_mode(self) -> str | NaType: """The MIG mode that the GPU is currently operating under. Returns: Union[str, NaType] - :const:`'Disabled'`: if the MIG mode is disabled. - :const:`'Enabled'`: if the MIG mode is enabled. - :const:`nvitop.NA`: if not applicable, e.g. the GPU does not support MIG mode. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=mig.mode.current """ if self.is_mig_device(): return NA mig_mode, *_ = libnvml.nvmlQuery( 'nvmlDeviceGetMigMode', self.handle, default=(NA, NA), ignore_function_not_found=True, ) return {0: 'Disabled', 1: 'Enabled'}.get(mig_mode, NA)
[docs] def is_mig_mode_enabled(self) -> bool: """Test whether the MIG mode is enabled on the device. Return :data:`False` if MIG mode is disabled or the device does not support MIG mode. """ return boolify(self.mig_mode())
[docs] def max_mig_device_count(self) -> int: """Return the maximum number of MIG instances the device supports. This method will return 0 if the device does not support MIG mode. """ return 0 # implemented in PhysicalDevice
[docs] def mig_devices(self) -> list[MigDevice]: """Return a list of children MIG devices of the current device. This method will return an empty list if the MIG mode is disabled or the device does not support MIG mode. """ return [] # implemented in PhysicalDevice
[docs] def is_leaf_device(self) -> bool: """Test whether the device is a physical device with MIG mode disabled or a MIG device. Return :data:`True` if the device is a physical device with MIG mode disabled or a MIG device. Otherwise, return :data:`False` if the device is a physical device with MIG mode enabled. """ return self.is_mig_device() or not self.is_mig_mode_enabled()
[docs] def to_leaf_devices( self, ) -> list[PhysicalDevice] | list[MigDevice] | list[CudaDevice] | list[CudaMigDevice]: """Return a list of leaf devices. Note that a CUDA device is always a leaf device. """ if isinstance(self, CudaDevice) or self.is_leaf_device(): return [self] # type: ignore[return-value] return self.mig_devices()
[docs] def processes(self) -> dict[int, GpuProcess]: """Return a dictionary of processes running on the GPU. Returns: Dict[int, GpuProcess] A dictionary mapping PID to GPU process instance. """ processes = {} found_na = False for type, func in ( # pylint: disable=redefined-builtin ('C', 'nvmlDeviceGetComputeRunningProcesses'), ('G', 'nvmlDeviceGetGraphicsRunningProcesses'), ): for p in libnvml.nvmlQuery(func, self.handle, default=()): if isinstance(p.usedGpuMemory, int): gpu_memory = p.usedGpuMemory else: # Used GPU memory is `N/A` on Windows Display Driver Model (WDDM) # or on MIG-enabled GPUs gpu_memory = NA # type: ignore[assignment] found_na = True proc = processes[p.pid] = self.GPU_PROCESS_CLASS( pid=p.pid, device=self, gpu_memory=gpu_memory, gpu_instance_id=getattr(p, 'gpuInstanceId', UINT_MAX), compute_instance_id=getattr(p, 'computeInstanceId', UINT_MAX), ) proc.type = proc.type + type if len(processes) > 0: samples = libnvml.nvmlQuery( 'nvmlDeviceGetProcessUtilization', self.handle, # Only utilization samples that were recorded after this timestamp will be returned. # The CPU timestamp, i.e. absolute Unix epoch timestamp (in microseconds), is used. # Here we use the timestamp 1 second ago to ensure the record buffer is not empty. time.time_ns() // 1000 - 1000_000, default=(), ) for s in sorted(samples, key=lambda s: s.timeStamp): try: processes[s.pid].set_gpu_utilization(s.smUtil, s.memUtil, s.encUtil, s.decUtil) except KeyError: # noqa: PERF203 pass if not found_na: for pid in set(processes).difference(s.pid for s in samples): processes[pid].set_gpu_utilization(0, 0, 0, 0) return processes
[docs] def as_snapshot(self) -> Snapshot: """Return a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`. """ with self.oneshot(): return Snapshot( real=self, index=self.index, physical_index=self.physical_index, **{key: getattr(self, key)() for key in self.SNAPSHOT_KEYS}, )
SNAPSHOT_KEYS: ClassVar[list[str]] = [ 'name', 'uuid', 'bus_id', 'memory_info', 'memory_used', 'memory_free', 'memory_total', 'memory_used_human', 'memory_free_human', 'memory_total_human', 'memory_percent', 'memory_usage', 'utilization_rates', 'gpu_utilization', 'memory_utilization', 'encoder_utilization', 'decoder_utilization', 'clock_infos', 'max_clock_infos', 'clock_speed_infos', 'sm_clock', 'memory_clock', 'fan_speed', 'temperature', 'power_usage', 'power_limit', 'power_status', 'pcie_throughput', 'pcie_tx_throughput', 'pcie_rx_throughput', 'pcie_tx_throughput_human', 'pcie_rx_throughput_human', 'display_active', 'display_mode', 'current_driver_model', 'persistence_mode', 'performance_state', 'total_volatile_uncorrected_ecc_errors', 'compute_mode', 'cuda_compute_capability', 'mig_mode', ] # Modified from psutil (https://github.com/giampaolo/psutil)
[docs] @contextlib.contextmanager def oneshot(self) -> Generator[None, None, None]: """A utility context manager which considerably speeds up the retrieval of multiple device information at the same time. Internally different device info (e.g. memory_info, utilization_rates, ...) may be fetched by using the same routine, but only one information is returned and the others are discarded. When using this context manager the internal routine is executed once (in the example below on memory_info()) and the other info are cached. The cache is cleared when exiting the context manager block. The advice is to use this every time you retrieve more than one information about the device. Examples: >>> from nvitop import Device >>> device = Device(0) >>> with device.oneshot(): ... device.memory_info() # collect multiple info ... device.memory_used() # return cached value ... device.memory_free_human() # return cached value ... device.memory_percent() # return cached value """ # pylint: disable=line-too-long with self._lock: # pylint: disable=no-member if hasattr(self, '_cache'): # NOOP: this covers the use case where the user enters the # context twice: # # >>> with device.oneshot(): # ... with device.oneshot(): # ... # # Also, since as_snapshot() internally uses oneshot() # I expect that the code below will be a pretty common # "mistake" that the user will make, so let's guard # against that: # # >>> with device.oneshot(): # ... device.as_snapshot() # ... yield else: try: self.memory_info.cache_activate(self) # type: ignore[attr-defined] self.bar1_memory_info.cache_activate(self) # type: ignore[attr-defined] self.utilization_rates.cache_activate(self) # type: ignore[attr-defined] self.clock_infos.cache_activate(self) # type: ignore[attr-defined] self.max_clock_infos.cache_activate(self) # type: ignore[attr-defined] self.power_usage.cache_activate(self) # type: ignore[attr-defined] self.power_limit.cache_activate(self) # type: ignore[attr-defined] yield finally: self.memory_info.cache_deactivate(self) # type: ignore[attr-defined] self.bar1_memory_info.cache_deactivate(self) # type: ignore[attr-defined] self.utilization_rates.cache_deactivate(self) # type: ignore[attr-defined] self.clock_infos.cache_deactivate(self) # type: ignore[attr-defined] self.max_clock_infos.cache_deactivate(self) # type: ignore[attr-defined] self.power_usage.cache_deactivate(self) # type: ignore[attr-defined] self.power_limit.cache_deactivate(self) # type: ignore[attr-defined]
[docs]class PhysicalDevice(Device): """Class for physical devices. This is the real GPU installed in the system. """ _nvml_index: int index: int nvml_index: int @property def physical_index(self) -> int: """Zero based index of the GPU. Can change at each boot. Command line equivalent: .. code:: bash nvidia-smi --id=<IDENTIFIER> --format=csv,noheader,nounits --query-gpu=index """ return self._nvml_index
[docs] def max_mig_device_count(self) -> int: """Return the maximum number of MIG instances the device supports. This method will return 0 if the device does not support MIG mode. """ return libnvml.nvmlQuery( 'nvmlDeviceGetMaxMigDeviceCount', self.handle, default=0, ignore_function_not_found=True, )
[docs] def mig_device(self, mig_index: int) -> MigDevice: """Return a child MIG device of the given index. Raises: libnvml.NVMLError: If the device does not support MIG mode or the given MIG device does not exist. """ with _global_physical_device(self): return MigDevice(index=(self.index, mig_index))
[docs] def mig_devices(self) -> list[MigDevice]: """Return a list of children MIG devices of the current device. This method will return an empty list if the MIG mode is disabled or the device does not support MIG mode. """ mig_devices = [] if self.is_mig_mode_enabled(): max_mig_device_count = self.max_mig_device_count() with _global_physical_device(self): for mig_index in range(max_mig_device_count): try: mig_device = MigDevice(index=(self.index, mig_index)) except libnvml.NVMLError: # noqa: PERF203 break else: mig_devices.append(mig_device) return mig_devices
[docs]class MigDevice(Device): # pylint: disable=too-many-instance-attributes """Class for MIG devices.""" _nvml_index: tuple[int, int] nvml_index: tuple[int, int]
[docs] @classmethod def count(cls) -> int: """The number of total MIG devices aggregated over all physical devices.""" return len(cls.all())
[docs] @classmethod def all(cls) -> list[MigDevice]: # type: ignore[override] """Return a list of MIG devices aggregated over all physical devices.""" mig_devices = [] for device in PhysicalDevice.all(): mig_devices.extend(device.mig_devices()) return mig_devices
[docs] @classmethod def from_indices( # type: ignore[override] # pylint: disable=signature-differs cls, indices: Iterable[tuple[int, int]], ) -> list[MigDevice]: """Return a list of MIG devices of the given indices. Args: indices (Iterable[Tuple[int, int]]): Indices of the MIG devices. Each index is a tuple of two integers. Returns: List[MigDevice] A list of :class:`MigDevice` instances of the given indices. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. """ return list(map(cls, indices))
# pylint: disable-next=super-init-not-called
[docs] def __init__( self, index: tuple[int, int] | str | None = None, *, uuid: str | None = None, ) -> None: """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. """ if isinstance(index, str) and self.UUID_PATTERN.match(index) is not None: # passed by UUID index, uuid = None, index index, uuid = (arg.encode() if isinstance(arg, str) else arg for arg in (index, uuid)) self._name: str = NA self._uuid: str = NA self._bus_id: str = NA self._memory_total: int | NaType = NA self._memory_total_human: str = NA self._gpu_instance_id: int | NaType = NA self._compute_instance_id: int | NaType = NA self._nvlink_link_count: int | None = None self._nvlink_throughput_counters: tuple[tuple[int | NaType, int]] | None = None self._is_mig_device: bool = True self._cuda_index: int | None = None self._cuda_compute_capability: tuple[int, int] | NaType | None = None if index is not None: self._nvml_index = index # type: ignore[assignment] self._handle = None parent = _get_global_physical_device() if ( parent is None or parent.handle is None or parent.physical_index != self.physical_index ): parent = PhysicalDevice(index=self.physical_index) self._parent = parent if self.parent.handle is not None: try: self._handle = libnvml.nvmlQuery( 'nvmlDeviceGetMigDeviceHandleByIndex', self.parent.handle, self.mig_index, ignore_errors=False, ) except libnvml.NVMLError_GpuIsLost: pass else: self._handle = libnvml.nvmlQuery('nvmlDeviceGetHandleByUUID', uuid, ignore_errors=False) parent_handle = libnvml.nvmlQuery( 'nvmlDeviceGetDeviceHandleFromMigDeviceHandle', self.handle, ignore_errors=False, ) parent_index = libnvml.nvmlQuery( 'nvmlDeviceGetIndex', parent_handle, ignore_errors=False, ) self._parent = PhysicalDevice(index=parent_index) for mig_device in self.parent.mig_devices(): if self.uuid() == mig_device.uuid(): self._nvml_index = mig_device.index break else: raise libnvml.NVMLError_NotFound self._max_clock_infos = ClockInfos(graphics=NA, sm=NA, memory=NA, video=NA) self._lock = threading.RLock() self._ident = (self.index, self.uuid()) self._hash = None
@property def index(self) -> tuple[int, int]: """The index of the MIG device. This is a tuple of two integers.""" return self._nvml_index @property def physical_index(self) -> int: """The index of the parent physical device.""" return self._nvml_index[0] @property def mig_index(self) -> int: """The index of the MIG device over the all MIG devices of the parent device.""" return self._nvml_index[1] @property def parent(self) -> PhysicalDevice: """The parent physical device.""" return self._parent
[docs] def gpu_instance_id(self) -> int | NaType: """The gpu instance ID of the MIG device. Returns: Union[int, NaType] The gpu instance ID of the MIG device, or :const:`nvitop.NA` when not applicable. """ if self._gpu_instance_id is NA: self._gpu_instance_id = libnvml.nvmlQuery( 'nvmlDeviceGetGpuInstanceId', self.handle, default=UINT_MAX, ) if self._gpu_instance_id == UINT_MAX: self._gpu_instance_id = NA return self._gpu_instance_id
[docs] def compute_instance_id(self) -> int | NaType: """The compute instance ID of the MIG device. Returns: Union[int, NaType] The compute instance ID of the MIG device, or :const:`nvitop.NA` when not applicable. """ if self._compute_instance_id is NA: self._compute_instance_id = libnvml.nvmlQuery( 'nvmlDeviceGetComputeInstanceId', self.handle, default=UINT_MAX, ) if self._compute_instance_id == UINT_MAX: self._compute_instance_id = NA return self._compute_instance_id
[docs] def as_snapshot(self) -> Snapshot: """Return a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`. """ snapshot = super().as_snapshot() snapshot.mig_index = self.mig_index # type: ignore[attr-defined] return snapshot
SNAPSHOT_KEYS: ClassVar[list[str]] = [ *Device.SNAPSHOT_KEYS, 'gpu_instance_id', 'compute_instance_id', ]
[docs]class CudaDevice(Device): """Class for devices enumerated over the CUDA ordinal. The order can be vary for different ``CUDA_VISIBLE_DEVICES`` environment variable. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ :meth:`CudaDevice.__new__()` returns different types depending on the given arguments. .. code-block:: python - (cuda_index: int) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES` - (uuid: str) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES` - (nvml_index: int) -> CudaDevice - (nvml_index: (int, int)) -> CudaMigDevice Examples: >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '3,2,1,0' >>> CudaDevice.count() # number of NVIDIA GPUs visible to CUDA applications 4 >>> Device.cuda.count() # use alias in class `Device` 4 >>> CudaDevice.all() # all CUDA visible devices (or `Device.cuda.all()`) [ CudaDevice(cuda_index=0, nvml_index=3, ...), CudaDevice(cuda_index=1, nvml_index=2, ...), ... ] >>> cuda0 = CudaDevice(cuda_index=0) # use CUDA ordinal (or `Device.cuda(0)`) >>> cuda1 = CudaDevice(nvml_index=2) # use NVML ordinal >>> cuda2 = CudaDevice(uuid='GPU-xxxxxx') # use UUID string >>> cuda0.memory_free() # total free memory in bytes 11550654464 >>> cuda0.memory_free_human() # total free memory in human readable format '11016MiB' >>> cuda1.as_snapshot() # takes an onetime snapshot of the device CudaDeviceSnapshot( real=CudaDevice(cuda_index=1, nvml_index=2, ...), ... ) Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the NVML index is out of range. TypeError: If the number of non-None arguments is not exactly 1. TypeError: If the given NVML index is a tuple but is not consist of two integers. RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ # pylint: disable=line-too-long _nvml_index: int index: int nvml_index: int
[docs] @classmethod def is_available(cls) -> bool: """Test whether there are any CUDA-capable devices available.""" return cls.count() > 0
[docs] @classmethod def count(cls) -> int: """The number of GPUs visible to CUDA applications.""" try: return len(super().parse_cuda_visible_devices()) except libnvml.NVMLError: return 0
[docs] @classmethod def all(cls) -> list[CudaDevice]: # type: ignore[override] """All CUDA visible devices. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. """ return cls.from_indices()
[docs] @classmethod def from_indices( # type: ignore[override] cls, indices: int | Iterable[int] | None = None, ) -> list[CudaDevice]: """Return a list of CUDA devices of the given CUDA indices. The CUDA ordinal will be enumerate from the ``CUDA_VISIBLE_DEVICES`` environment variable. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Args: cuda_indices (Iterable[int]): The indices of the GPU in CUDA ordinal, if not given, returns all visible CUDA devices. Returns: List[CudaDevice] A list of :class:`CudaDevice` of the given CUDA indices. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ return super().from_cuda_indices(indices)
[docs] def __new__( cls, cuda_index: int | None = None, *, nvml_index: int | tuple[int, int] | None = None, uuid: str | None = None, ) -> Self: """Create a new instance of CudaDevice. The type of the result is determined by the given argument. .. code-block:: python - (cuda_index: int) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES` - (uuid: str) -> Union[CudaDevice, CudaMigDevice] # depending on `CUDA_VISIBLE_DEVICES` - (nvml_index: int) -> CudaDevice - (nvml_index: (int, int)) -> CudaMigDevice Note: This method takes exact 1 non-None argument. Returns: Union[CudaDevice, CudaMigDevice] A :class:`CudaDevice` instance or a :class:`CudaMigDevice` instance. Raises: TypeError: If the number of non-None arguments is not exactly 1. TypeError: If the given NVML index is a tuple but is not consist of two integers. RuntimeError: If the index is out of range for the given ``CUDA_VISIBLE_DEVICES`` environment variable. """ if nvml_index is not None and uuid is not None: raise TypeError( f'CudaDevice(cuda_index=None, nvml_index=None, uuid=None) takes 1 non-None arguments ' f'but (cuda_index, nvml_index, uuid) = {(cuda_index, nvml_index, uuid)!r} were given', ) if cuda_index is not None and nvml_index is None and uuid is None: cuda_visible_devices = cls.parse_cuda_visible_devices() if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices): raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.') nvml_index = cuda_visible_devices[cuda_index] if cls is not CudaDevice: # Use the subclass type if the type is explicitly specified return super().__new__(cls, index=nvml_index, uuid=uuid) # Auto subclass type inference logic goes here when `cls` is `CudaDevice` (e.g., calls `CudaDevice(...)`) if (nvml_index is not None and not isinstance(nvml_index, int)) or is_mig_device_uuid(uuid): return super().__new__(CudaMigDevice, index=nvml_index, uuid=uuid) # type: ignore[return-value] return super().__new__(CudaDevice, index=nvml_index, uuid=uuid) # type: ignore[return-value]
[docs] def __init__( self, cuda_index: int | None = None, *, nvml_index: int | tuple[int, int] | None = None, uuid: str | None = None, ) -> None: """Initialize the instance created by :meth:`__new__()`. Raises: libnvml.NVMLError_LibraryNotFound: If cannot find the NVML library, usually the NVIDIA driver is not installed. libnvml.NVMLError_DriverNotLoaded: If NVIDIA driver is not loaded. libnvml.NVMLError_LibRmVersionMismatch: If RM detects a driver/library version mismatch, usually after an upgrade for NVIDIA driver without reloading the kernel module. libnvml.NVMLError_NotFound: If the device is not found for the given NVML identifier. libnvml.NVMLError_InvalidArgument: If the NVML index is out of range. RuntimeError: If the given device is not visible to CUDA applications (i.e. not listed in the ``CUDA_VISIBLE_DEVICES`` environment variable or the environment variable is invalid). """ if cuda_index is not None and nvml_index is None and uuid is None: cuda_visible_devices = self.parse_cuda_visible_devices() if not isinstance(cuda_index, int) or not 0 <= cuda_index < len(cuda_visible_devices): raise RuntimeError(f'CUDA Error: invalid device ordinal: {cuda_index!r}.') nvml_index = cuda_visible_devices[cuda_index] super().__init__(index=nvml_index, uuid=uuid) # type: ignore[arg-type] if cuda_index is None: cuda_index = super().cuda_index self._cuda_index: int = cuda_index self._ident: tuple[Hashable, str] = ((self._cuda_index, self.index), self.uuid())
[docs] def __repr__(self) -> str: """Return a string representation of the CUDA device.""" return '{}(cuda_index={}, nvml_index={}, name="{}", total_memory={})'.format( # noqa: UP032 self.__class__.__name__, self.cuda_index, self.index, self.name(), self.memory_total_human(), )
[docs] def __reduce__(self) -> tuple[type[CudaDevice], tuple[int]]: """Return state information for pickling.""" return self.__class__, (self._cuda_index,)
[docs] def as_snapshot(self) -> Snapshot: """Return a onetime snapshot of the device. The attributes are defined in :attr:`SNAPSHOT_KEYS`. """ snapshot = super().as_snapshot() snapshot.cuda_index = self.cuda_index # type: ignore[attr-defined] return snapshot
Device.cuda = CudaDevice """Shortcut for class :class:`CudaDevice`."""
[docs]class CudaMigDevice(CudaDevice, MigDevice): # type: ignore[misc] """Class for CUDA devices that are MIG devices.""" _nvml_index: tuple[int, int] # type: ignore[assignment] index: tuple[int, int] # type: ignore[assignment] nvml_index: tuple[int, int] # type: ignore[assignment]
def is_mig_device_uuid(uuid: str | None) -> bool: """Return :data:`True` if the argument is a MIG device UUID, otherwise, return :data:`False`.""" if isinstance(uuid, str): match = Device.UUID_PATTERN.match(uuid) if match is not None and match.group('MigMode') is not None: return True return False
[docs]def parse_cuda_visible_devices( cuda_visible_devices: str | None = _VALUE_OMITTED, ) -> list[int] | list[tuple[int, int]]: """Parse the given ``CUDA_VISIBLE_DEVICES`` value into a list of NVML device indices. This function is aliased by :meth:`Device.parse_cuda_visible_devices`. Note: The result could be empty if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: Union[List[int], List[Tuple[int, int]]] A list of int (physical device) or a list of tuple of two integers (MIG device) for the corresponding real device indices. Examples: >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5' >>> parse_cuda_visible_devices() # parse the `CUDA_VISIBLE_DEVICES` environment variable to NVML indices [6, 5] >>> parse_cuda_visible_devices('0,4') # pass the `CUDA_VISIBLE_DEVICES` value explicitly [0, 4] >>> parse_cuda_visible_devices('GPU-18ef14e9,GPU-849d5a8d') # accept abbreviated UUIDs [5, 6] >>> parse_cuda_visible_devices(None) # get all devices when the `CUDA_VISIBLE_DEVICES` environment variable unset [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] >>> parse_cuda_visible_devices('MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd') # MIG device support (MIG UUID) [(0, 0)] >>> parse_cuda_visible_devices('MIG-GPU-3eb79704-1571-707c-aee8-f43ce747313d/13/0') # MIG device support (GPU UUID) [(0, 1)] >>> parse_cuda_visible_devices('MIG-GPU-3eb79704/13/0') # MIG device support (abbreviated GPU UUID) [(0, 1)] >>> parse_cuda_visible_devices('') # empty string [] >>> parse_cuda_visible_devices('0,0') # invalid `CUDA_VISIBLE_DEVICES` (duplicate device ordinal) [] >>> parse_cuda_visible_devices('16') # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range) [] """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) return _parse_cuda_visible_devices(cuda_visible_devices, format='index')
[docs]def normalize_cuda_visible_devices(cuda_visible_devices: str | None = _VALUE_OMITTED) -> str: """Parse the given ``CUDA_VISIBLE_DEVICES`` value and convert it into a comma-separated string of UUIDs. This function is aliased by :meth:`Device.normalize_cuda_visible_devices`. Note: The result could be empty string if the ``CUDA_VISIBLE_DEVICES`` environment variable is invalid. See also for CUDA Device Enumeration: - `CUDA Environment Variables <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars>`_ - `CUDA Device Enumeration for MIG Device <https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#cuda-visible-devices>`_ Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: str The comma-separated string (GPU UUIDs) of the ``CUDA_VISIBLE_DEVICES`` environment variable. Examples: >>> import os >>> os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' >>> os.environ['CUDA_VISIBLE_DEVICES'] = '6,5' >>> normalize_cuda_visible_devices() # normalize the `CUDA_VISIBLE_DEVICES` environment variable to UUID strings 'GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794,GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1' >>> normalize_cuda_visible_devices('4') # pass the `CUDA_VISIBLE_DEVICES` value explicitly 'GPU-96de99c9-d68f-84c8-424c-7c75e59cc0a0' >>> normalize_cuda_visible_devices('GPU-18ef14e9,GPU-849d5a8d') # normalize abbreviated UUIDs 'GPU-18ef14e9-dec6-1d7e-1284-3010c6ce98b1,GPU-849d5a8d-610e-eeea-1fd4-81ff44a23794' >>> normalize_cuda_visible_devices(None) # get all devices when the `CUDA_VISIBLE_DEVICES` environment variable unset 'GPU-<GPU0-UUID>,GPU-<GPU1-UUID>,...' # all GPU UUIDs >>> normalize_cuda_visible_devices('MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd') # MIG device support (MIG UUID) 'MIG-d184f67c-c95f-5ef2-a935-195bd0094fbd' >>> normalize_cuda_visible_devices('MIG-GPU-3eb79704-1571-707c-aee8-f43ce747313d/13/0') # MIG device support (GPU UUID) 'MIG-37b51284-1df4-5451-979d-3231ccb0822e' >>> normalize_cuda_visible_devices('MIG-GPU-3eb79704/13/0') # MIG device support (abbreviated GPU UUID) 'MIG-37b51284-1df4-5451-979d-3231ccb0822e' >>> normalize_cuda_visible_devices('') # empty string '' >>> normalize_cuda_visible_devices('0,0') # invalid `CUDA_VISIBLE_DEVICES` (duplicate device ordinal) '' >>> normalize_cuda_visible_devices('16') # invalid `CUDA_VISIBLE_DEVICES` (device ordinal out of range) '' """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) return ','.join(_parse_cuda_visible_devices(cuda_visible_devices, format='uuid'))
# Helper functions ################################################################################# class _PhysicalDeviceAttrs(NamedTuple): index: int # type: ignore[assignment] name: str uuid: str support_mig_mode: bool _PHYSICAL_DEVICE_ATTRS: OrderedDict[str, _PhysicalDeviceAttrs] | None = None _GLOBAL_PHYSICAL_DEVICE: PhysicalDevice | None = None _GLOBAL_PHYSICAL_DEVICE_LOCK: threading.RLock = threading.RLock() def _get_all_physical_device_attrs() -> OrderedDict[str, _PhysicalDeviceAttrs]: global _PHYSICAL_DEVICE_ATTRS # pylint: disable=global-statement if _PHYSICAL_DEVICE_ATTRS is not None: return _PHYSICAL_DEVICE_ATTRS with _GLOBAL_PHYSICAL_DEVICE_LOCK: if _PHYSICAL_DEVICE_ATTRS is None: _PHYSICAL_DEVICE_ATTRS = OrderedDict( [ ( device.uuid(), _PhysicalDeviceAttrs( device.index, device.name(), device.uuid(), libnvml.nvmlCheckReturn(device.mig_mode()), ), ) for device in PhysicalDevice.all() ], ) return _PHYSICAL_DEVICE_ATTRS def _does_any_device_support_mig_mode(uuids: Iterable[str] | None = None) -> bool: physical_device_attrs = _get_all_physical_device_attrs() uuids = uuids or physical_device_attrs.keys() return any(physical_device_attrs[uuid].support_mig_mode for uuid in uuids) @contextlib.contextmanager def _global_physical_device(device: PhysicalDevice) -> Generator[PhysicalDevice, None, None]: global _GLOBAL_PHYSICAL_DEVICE # pylint: disable=global-statement with _GLOBAL_PHYSICAL_DEVICE_LOCK: try: _GLOBAL_PHYSICAL_DEVICE = device yield _GLOBAL_PHYSICAL_DEVICE finally: _GLOBAL_PHYSICAL_DEVICE = None def _get_global_physical_device() -> PhysicalDevice: with _GLOBAL_PHYSICAL_DEVICE_LOCK: return _GLOBAL_PHYSICAL_DEVICE # type: ignore[return-value] @overload def _parse_cuda_visible_devices( cuda_visible_devices: str | None, format: Literal['index'], # pylint: disable=redefined-builtin ) -> list[int] | list[tuple[int, int]]: ... @overload def _parse_cuda_visible_devices( cuda_visible_devices: str | None, format: Literal['uuid'], # pylint: disable=redefined-builtin ) -> list[str]: ... @functools.lru_cache() def _parse_cuda_visible_devices( # pylint: disable=too-many-branches,too-many-statements cuda_visible_devices: str | None = None, format: Literal['index', 'uuid'] = 'index', # pylint: disable=redefined-builtin ) -> list[int] | list[tuple[int, int]] | list[str]: """The underlining implementation for :meth:`parse_cuda_visible_devices`. The result will be cached.""" assert format in {'index', 'uuid'} try: physical_device_attrs = _get_all_physical_device_attrs() except libnvml.NVMLError: return [] gpu_uuids = set(physical_device_attrs) try: raw_uuids = ( subprocess.check_output( [ # noqa: S603 sys.executable, '-c', textwrap.dedent( f""" import nvitop.api.device print( ','.join( nvitop.api.device._parse_cuda_visible_devices_to_uuids( {cuda_visible_devices!r}, verbose=False, ), ), ) """, ), ], ) .decode('utf-8') .strip() .split(',') ) except subprocess.CalledProcessError: pass else: uuids = [ uuid if uuid in gpu_uuids else uuid.replace('GPU', 'MIG', 1) for uuid in map('GPU-{}'.format, raw_uuids) ] if gpu_uuids.issuperset(uuids) and not _does_any_device_support_mig_mode(uuids): if format == 'uuid': return uuids return [physical_device_attrs[uuid].index for uuid in uuids] cuda_visible_devices = ','.join(uuids) if cuda_visible_devices is None: cuda_visible_devices = ','.join(physical_device_attrs.keys()) devices: list[Device] = [] presented: set[str] = set() use_integer_identifiers: bool | None = None def from_index_or_uuid(index_or_uuid: int | str) -> Device: nonlocal use_integer_identifiers if isinstance(index_or_uuid, str): if index_or_uuid.isdigit(): index_or_uuid = int(index_or_uuid) elif Device.UUID_PATTERN.match(index_or_uuid) is None: raise libnvml.NVMLError_NotFound if use_integer_identifiers is None: use_integer_identifiers = isinstance(index_or_uuid, int) if isinstance(index_or_uuid, int) and use_integer_identifiers: return Device(index=index_or_uuid) if isinstance(index_or_uuid, str) and not use_integer_identifiers: return Device(uuid=index_or_uuid) raise ValueError('invalid identifier') def strip_identifier(identifier: str) -> str: identifier = identifier.strip() if len(identifier) > 0 and ( identifier[0].isdigit() or (len(identifier) > 1 and identifier[0] in {'+', '-'} and identifier[1].isdigit()) ): offset = 1 if identifier[0] in {'+', '-'} else 0 while offset < len(identifier) and identifier[offset].isdigit(): offset += 1 identifier = identifier[:offset] return identifier for identifier in map(strip_identifier, cuda_visible_devices.split(',')): if identifier in presented: return [] # duplicate identifiers found try: device = from_index_or_uuid(identifier) except (ValueError, libnvml.NVMLError): break devices.append(device) presented.add(identifier) mig_devices = [device for device in devices if device.is_mig_device()] if len(mig_devices) > 0: # Got MIG devices enumerated, use the first one devices = mig_devices[:1] # at most one MIG device is visible else: # All devices in `CUDA_VISIBLE_DEVICES` are physical devices # Check if any GPU that enables MIG mode devices_backup = devices.copy() devices = [] for device in devices_backup: if device.is_mig_mode_enabled(): # Got available MIG devices, use the first MIG device and ignore all non-MIG GPUs try: devices = [device.mig_device(mig_index=0)] # at most one MIG device is visible except libnvml.NVMLError: continue # no MIG device available on the GPU else: break # got one MIG device else: devices.append(device) # non-MIG device if format == 'uuid': return [device.uuid() for device in devices] return [device.index for device in devices] # type: ignore[return-value] def _parse_cuda_visible_devices_to_uuids( cuda_visible_devices: str | None = _VALUE_OMITTED, verbose: bool = True, ) -> list[str]: """Parse the given ``CUDA_VISIBLE_DEVICES`` environment variable in a separate process and return a list of device UUIDs. The UUIDs do not have a prefix ``GPU-`` or ``MIG-``. Args: cuda_visible_devices (Optional[str]): The value of the ``CUDA_VISIBLE_DEVICES`` variable. If not given, the value from the environment will be used. If explicitly given by :data:`None`, the ``CUDA_VISIBLE_DEVICES`` environment variable will be unset before parsing. Returns: List[str] A list of device UUIDs without ``GPU-`` or ``MIG-`` prefixes. Raises: libcuda.CUDAError_NotInitialized: If cannot found the CUDA driver libraries. libcuda.CUDAError: If failed to parse the ``CUDA_VISIBLE_DEVICES`` environment variable. """ # pylint: disable=line-too-long if cuda_visible_devices is _VALUE_OMITTED: cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES', default=None) # Do not inherit file descriptors and handles from the parent process # The `fork` start method should be considered unsafe as it can lead to crashes of the subprocess ctx = mp.get_context('spawn') queue = ctx.SimpleQueue() try: parser = ctx.Process( target=_cuda_visible_devices_parser, args=(cuda_visible_devices, queue, verbose), name='`CUDA_VISIBLE_DEVICES` parser', daemon=True, ) parser.start() parser.join() finally: parser.kill() result = queue.get() if isinstance(result, Exception): raise result return result def _cuda_visible_devices_parser( cuda_visible_devices: str | None, queue: mp.SimpleQueue, verbose: bool = True, ) -> None: try: if cuda_visible_devices is not None: os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices else: os.environ.pop('CUDA_VISIBLE_DEVICES', None) # pylint: disable=no-member try: libcuda.cuInit() except ( libcuda.CUDAError_NoDevice, libcuda.CUDAError_InvalidDevice, libcuda.CUDAError_SystemDriverMismatch, libcuda.CUDAError_CompatNotSupportedOnDevice, ): queue.put([]) raise count = libcuda.cuDeviceGetCount() uuids = [libcuda.cuDeviceGetUuid(libcuda.cuDeviceGet(i)) for i in range(count)] except Exception as ex: # noqa: BLE001 # pylint: disable=broad-except queue.put(ex) if verbose: raise else: queue.put(uuids) return finally: # Ensure non-empty queue queue.put(libcuda.CUDAError_NotInitialized()) # pylint: disable=no-member