import collections
import ctypes
from typing import Any, Union

import torch
from torch._utils import _dummy_type
from torch.types import Device

from . import _get_device_index, _is_compiled, _lazy_init, is_initialized


if not _is_compiled():
    # Define dummy base classes
    torch._C.__dict__["_xpu_XPUAllocator"] = _dummy_type("_xpu_XPUAllocator")

_device_t = Union[Device, str, int, None]


def empty_cache() -> None:
    r"""Release all unoccupied cached memory currently held by the caching
    allocator so that it can be used by other XPU applications.

    .. note::
        :func:`~torch.xpu.empty_cache` doesn't increase the amount of XPU
        memory available for PyTorch. However, it may help reduce fragmentation
        of XPU memory in certain cases.
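
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> x = torch.empty(1024, 1024, device="xpu")
        >>> del x  # the freed block stays cached by the allocator
        >>> torch.xpu.empty_cache()  # release the cached block back to the system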
"""
if is_initialized():
torch._C._xpu_emptyCache()


def reset_peak_memory_stats(device: _device_t = None) -> None:
    r"""Reset the "peak" stats tracked by the XPU memory allocator.

    See :func:`~torch.xpu.memory_stats` for details. Peak stats correspond to the
    `"peak"` key in each individual stat dict.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
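
    Example (a minimal sketch; ``loader`` and ``train_step`` are hypothetical)::

        >>> # xdoctest: +SKIP
        >>> for batch in loader:
        ...     torch.xpu.reset_peak_memory_stats()
        ...     train_step(batch)
        ...     peak = torch.xpu.max_memory_allocated()  # peak of this iteration only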
"""
device = _get_device_index(device, optional=True)
return torch._C._xpu_resetPeakMemoryStats(device)


def reset_accumulated_memory_stats(device: _device_t = None) -> None:
    r"""Reset the "accumulated" (historical) stats tracked by the XPU memory allocator.

    See :func:`~torch.xpu.memory_stats` for details. Accumulated stats correspond to
    the `"allocated"` and `"freed"` keys in each individual stat dict.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
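
    Example (a minimal sketch; ``fn`` is a hypothetical callable)::

        >>> # xdoctest: +SKIP
        >>> torch.xpu.reset_accumulated_memory_stats()
        >>> fn()
        >>> torch.xpu.memory_stats()["allocated_bytes.all.allocated"]  # bytes allocated by fn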
"""
device = _get_device_index(device, optional=True)
return torch._C._xpu_resetAccumulatedMemoryStats(device)


def memory_stats_as_nested_dict(device: _device_t = None) -> dict[str, Any]:
    r"""Return the result of :func:`~torch.xpu.memory_stats` as a nested dictionary."""
    if not is_initialized():
        return {}
    device = _get_device_index(device, optional=True)
    return torch._C._xpu_memoryStats(device)


def memory_stats(device: _device_t = None) -> dict[str, Any]:
    r"""Return a dictionary of XPU memory allocator statistics for a given device.

    The return value of this function is a dictionary of statistics, each of
    which is a non-negative integer.

    Core statistics:

    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of allocated memory.
    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of reserved memory.
    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      amount of active memory.
    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
      memory requested by client code; compare this with ``allocated_bytes`` to check
      whether allocation rounding adds too much overhead.

    For these core statistics, values are broken down as follows.

    Pool type:

    - ``all``: combined statistics across all memory pools.
    - ``large_pool``: statistics for the large allocation pool (for allocations >= 1MB).
    - ``small_pool``: statistics for the small allocation pool (for allocations < 1MB).

    Metric type:

    - ``current``: current value of this metric.
    - ``peak``: maximum value of this metric.
    - ``allocated``: historical total increase in this metric.
    - ``freed``: historical total decrease in this metric.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
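
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> stats = torch.xpu.memory_stats()
        >>> for name, value in stats.items():
        ...     print(name, value)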
"""
result = []
def _recurse_add_to_result(prefix: str, obj: Any) -> None:
if isinstance(obj, dict):
if len(prefix) > 0:
prefix += "."
for k, v in obj.items():
_recurse_add_to_result(prefix + k, v)
else:
result.append((prefix, obj))
stats = memory_stats_as_nested_dict(device=device)
_recurse_add_to_result("", stats)
result.sort()
return collections.OrderedDict(result)


def memory_allocated(device: _device_t = None) -> int:
    r"""Return the current XPU memory occupied by tensors in bytes for a given device.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).

    .. note::
        This is likely less than the amount shown in `xpu-smi` since some
        unused memory can be held by the caching allocator and some context
        needs to be created on the device.
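
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> x = torch.zeros(1024, 1024, device="xpu")  # ~4 MiB of float32 data
        >>> torch.xpu.memory_allocated() >= x.numel() * x.element_size()
        True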
"""
return memory_stats(device=device).get("allocated_bytes.all.current", 0)


def max_memory_allocated(device: _device_t = None) -> int:
    r"""Return the maximum XPU memory occupied by tensors in bytes for a given device.

    By default, this returns the peak allocated memory since the beginning of
    this program. :func:`~torch.xpu.reset_peak_memory_stats` can be used to
    reset the starting point in tracking this metric. For example, these two
    functions can measure the peak allocated memory usage of each iteration in a
    training loop.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
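
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> torch.xpu.reset_peak_memory_stats()
        >>> x = torch.empty(4096, 4096, device="xpu")
        >>> del x
        >>> torch.xpu.max_memory_allocated()  # the peak persists after the tensor is freed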
"""
return memory_stats(device=device).get("allocated_bytes.all.peak", 0)


def memory_reserved(device: _device_t = None) -> int:
    r"""Return the current XPU memory managed by the caching allocator in bytes for a given device.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
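
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> x = torch.empty(1024, device="xpu")
        >>> # reserved >= allocated, since the allocator caches whole blocks
        >>> torch.xpu.memory_reserved() >= torch.xpu.memory_allocated()
        True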
"""
return memory_stats(device=device).get("reserved_bytes.all.current", 0)


def max_memory_reserved(device: _device_t = None) -> int:
    r"""Return the maximum XPU memory managed by the caching allocator in bytes for a given device.

    By default, this returns the peak cached memory since the beginning of this
    program. :func:`~torch.xpu.reset_peak_memory_stats` can be used to reset
    the starting point in tracking this metric. For example, these two functions
    can measure the peak cached memory amount of each iteration in a training
    loop.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).
    """
    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)


def mem_get_info(device: _device_t = None) -> tuple[int, int]:
    r"""Return the global free and total XPU memory for a given device.

    Args:
        device (torch.device or int or str, optional): selected device. Returns
            statistics for the current device, given by :func:`~torch.xpu.current_device`,
            if :attr:`device` is ``None`` (default).

    Returns:
        int: the memory available on the device in units of bytes.
        int: the total memory on the device in units of bytes.
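
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> free, total = torch.xpu.mem_get_info()
        >>> print(f"{free / total:.1%} of device memory is free")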
"""
_lazy_init()
device = _get_device_index(device, optional=True)
return torch._C._xpu_getMemoryInfo(device)


def get_per_process_memory_fraction(device: _device_t = None) -> float:
    r"""
    Retrieve the memory fraction currently set for a process on a given XPU device.

    This fraction represents the portion of the total device memory that
    the caching allocator is allowed to use. The allowed memory is calculated as:

    .. math:: \text{allowed\_memory} = \text{total\_memory} \times \text{fraction}

    Args:
        device (torch.device or int or str, optional): selected device. It uses the current device,
            given by :func:`~torch.xpu.current_device`, if :attr:`device` is ``None`` (default).

    Returns:
        float: The memory fraction in the range 0.0 to 1.0.
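
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> frac = torch.xpu.get_per_process_memory_fraction()
        >>> assert 0.0 <= frac <= 1.0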
"""
_lazy_init()
device = _get_device_index(device, optional=True)
return torch._C._xpu_getMemoryFraction(device)


def set_per_process_memory_fraction(fraction: float, device: _device_t = None) -> None:
    r"""
    Set the memory fraction for a single process on an XPU device.

    This function limits the amount of memory that the caching allocator can allocate
    on the specified XPU device. The allowed memory is computed as:

    .. math:: \text{allowed\_memory} = \text{total\_memory} \times \text{fraction}

    If the process attempts to allocate more than this allowed memory,
    an out-of-memory error will be raised by the allocator.

    Arguments:
        fraction (float): value in the range 0.0 to 1.0. The allowed memory equals
            ``total_memory * fraction``.
        device (torch.device or int or str, optional): selected device. It uses the current device,
            given by :func:`~torch.xpu.current_device`, if :attr:`device` is ``None`` (default).

    .. note:: In general, the total available free memory is less than the total capacity.
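
    Example (a minimal sketch; assumes an XPU device is available)::

        >>> # xdoctest: +SKIP
        >>> # cap this process at half of the device's total memory
        >>> torch.xpu.set_per_process_memory_fraction(0.5)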
"""
_lazy_init()
device = _get_device_index(device, optional=True)
if not isinstance(fraction, float):
raise TypeError("Invalid type for fraction argument, must be `float`")
# pyrefly: ignore [missing-attribute]
torch._C._xpu_setMemoryFraction(fraction, device)


class _XPUAllocator:
    r"""Wrapper over internal XPU memory allocators."""

    def __init__(self, allocator: torch._C._xpu_XPUAllocator):
        self._allocator = allocator

    def allocator(self):
        return self._allocator


class XPUPluggableAllocator(_XPUAllocator):
    r"""XPU memory allocator loaded from a shared library."""

    def __init__(self, path_to_lib_file: str, alloc_fn_name: str, free_fn_name: str):
        r"""XPU memory allocator loaded dynamically from a shared library.

        This lets users provide custom allocation and free functions implemented
        in a separate shared library. The allocator is registered through
        ``torch._C._xpu_customAllocator`` and becomes available for use via
        ``torch.xpu.memory.change_current_allocator``.

        Arguments:
            path_to_lib_file (str):
                Filesystem path to the shared library file containing the allocation
                and free functions.
            alloc_fn_name (str):
                Name of the allocation function exported from the shared library.
                The function must have the signature:
                ``void* alloc_fn(size_t size, int device, sycl::queue* queue);``
            free_fn_name (str):
                Name of the free function exported from the shared library.
                The function must have the signature:
                ``void free_fn(void* ptr, size_t size, sycl::queue* queue);``
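
        Example (a minimal sketch; ``alloc.so``, ``my_alloc``, and ``my_free`` are
        hypothetical names)::

            >>> # xdoctest: +SKIP
            >>> new_alloc = torch.xpu.memory.XPUPluggableAllocator(
            ...     "alloc.so", "my_alloc", "my_free"
            ... )
            >>> torch.xpu.memory.change_current_allocator(new_alloc)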
"""
allocator_lib = ctypes.CDLL(path_to_lib_file)
alloc_fn_ptr = getattr(allocator_lib, alloc_fn_name)
free_fn_ptr = getattr(allocator_lib, free_fn_name)
alloc_fn_addr = ctypes.cast(alloc_fn_ptr, ctypes.c_void_p).value
free_fn_addr = ctypes.cast(free_fn_ptr, ctypes.c_void_p).value
if alloc_fn_addr is None or free_fn_addr is None:
raise RuntimeError(
"Failed to load allocator symbols from the shared library."
)
self._allocator = torch._C._xpu_customAllocator(alloc_fn_addr, free_fn_addr)


def change_current_allocator(allocator: _XPUAllocator) -> None:
    r"""Change the currently used memory allocator to be the one provided.

    .. note::
        If the current allocator has already been used/initialized, this function will error.

    Arguments:
        allocator (torch.xpu.memory._XPUAllocator): allocator to be set as the active one.
    """
    torch._C._xpu_changeCurrentAllocator(allocator.allocator())


def _get_current_allocator() -> _XPUAllocator:
    r"""Return the allocator being currently used.

    Returns:
        _XPUAllocator: the allocator being currently used.
    """
    return _XPUAllocator(torch._C._xpu_getAllocator())


__all__ = [
    "XPUPluggableAllocator",
    "change_current_allocator",
    "empty_cache",
    "get_per_process_memory_fraction",
    "max_memory_allocated",
    "max_memory_reserved",
    "mem_get_info",
    "memory_allocated",
    "memory_reserved",
    "memory_stats",
    "memory_stats_as_nested_dict",
    "reset_accumulated_memory_stats",
    "reset_peak_memory_stats",
    "set_per_process_memory_fraction",
]