import torch
from torch._C import dtype


__all__ = ["GPULimits"]


class GPULimits:
    r"""Utility class that provides the theoretical limits of Nvidia GPU devices.

    The limits do not account for thermal throttling (they assume the GPU runs at
    its peak rated frequency), because the user's hardware configuration may
    influence power behavior.
    """

    def __init__(self, target_device: torch.device):
        # The device properties object is obtained by calling the
        # 'cudaGetDeviceProperties' CUDA runtime function. We need the total memory
        # bus width and the memory clock rate to calculate the memory bandwidth.
        self.device_properties = torch.cuda.get_device_properties(target_device)

        # The compute capability is needed to determine the number of FLOPs per
        # cycle per SM.
        self.compute_capability = int(
            f"{self.device_properties.major}{self.device_properties.minor}"
        )
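        # For example, a Hopper (sm_90) device reports major=9 and minor=0, so
        # compute_capability becomes 90.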

    # FLOPs per cycle information derived from Table 2 in:
    # https://resources.nvidia.com/en-us-hopper-architecture/nvidia-h100-tensor-c

    # Returns the number of FMA instructions retired per cycle per SM for a given
    # data type, when tensor cores are NOT used.
    def get_fma_per_cycle_per_sm_cuda_cores(self, data_type: dtype) -> int:
        hardcoded_device_values = {
            # Ampere Architecture
            "fp16_80": 256,
            "fp32_80": 64,
            "fp64_80": 32,
            # Hopper Architecture
            "fp16_90": 64,
            "fp32_90": 128,
            "fp64_90": 64,
            # Blackwell Architecture
            "fp16_100": 256,
            "fp32_100": 128,
            "fp64_100": 64,
        }
        dict_key = ""
        if data_type is torch.float16:
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.float32:
            dict_key = f"fp32_{self.compute_capability}"
        elif data_type is torch.float64:
            dict_key = f"fp64_{self.compute_capability}"
        else:
            dict_key = "unknown"

        if dict_key not in hardcoded_device_values:
            raise RuntimeError(
                f"No data for sm_{self.compute_capability} and {data_type}."
            )
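
        # For example, an sm_80 (Ampere) device queried with torch.float32 maps to
        # "fp32_80" above and returns 64 FMA instructions per cycle per SM.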
        return hardcoded_device_values[dict_key]

    # Returns the number of FMA instructions retired per cycle per SM for a given
    # data type, when tensor cores ARE used.
    def get_fma_per_cycle_per_sm_tensor_cores(self, data_type: dtype) -> int:
        hardcoded_device_values = {
            # Ampere Architecture
            "int8_80": 2048,
            "fp16_80": 1024,
            "fp32_80": 512,
            "fp64_80": 64,
            # Hopper Architecture
            "int8_90": 4096,
            "fp8_90": 4096,
            "fp16_90": 2048,
            "fp32_90": 1024,
            "fp64_90": 128,
            # Blackwell Architecture
            "int8_100": 8192,
            "fp8_100": 8192,
            "fp16_100": 4096,
            "fp32_100": 2048,
        }
        dict_key = ""
        if data_type is torch.float16:
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.bfloat16:
            # FP16 and BF16 are equivalent in terms of FLOPs per cycle per SM.
            dict_key = f"fp16_{self.compute_capability}"
        elif data_type is torch.float32:
            dict_key = f"fp32_{self.compute_capability}"
        elif data_type is torch.int8:
            dict_key = f"int8_{self.compute_capability}"
        elif data_type is torch.float64:
            dict_key = f"fp64_{self.compute_capability}"
        else:
            dict_key = "unknown"

        if dict_key not in hardcoded_device_values:
            raise RuntimeError(
                f"No data for sm_{self.compute_capability} and {data_type}."
            )
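
        # For example, an sm_90 (Hopper) device queried with torch.bfloat16 maps to
        # "fp16_90" above and returns 2048 tensor-core FMA instructions per cycle
        # per SM.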
        return hardcoded_device_values[dict_key]

    def get_tflops_per_second(
        self, data_type: dtype, use_tensor_cores: bool = True
    ) -> float:
        num_sms = self.device_properties.multi_processor_count
        clock_rate = self.device_properties.clock_rate  # kHz

        fma_per_cycle = 0
        if use_tensor_cores:
            fma_per_cycle = self.get_fma_per_cycle_per_sm_tensor_cores(data_type)
        else:
            fma_per_cycle = self.get_fma_per_cycle_per_sm_cuda_cores(data_type)

        # One FMA counts as 2 floating-point operations, and the clock rate is in
        # kHz, so dividing by 1e9 yields TFLOP/s.
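        # As an illustrative check with assumed A100 (sm_80) figures of 108 SMs and
        # a 1_410_000 kHz boost clock, FP16 on tensor cores gives
        # 108 * 1024 * 2 * 1_410_000 / 1e9, which is about 312 TFLOP/s.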
        tflops_per_second = num_sms * fma_per_cycle * 2 * clock_rate / 1e9
        return tflops_per_second

    def get_memory_bandwidth_Bps(self) -> int:
        # DRAM devices are double data rate (DDR): they transfer data on both edges
        # of the memory clock, hence the factor of 2 below.
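        # As an illustrative check with assumed A100-40GB figures of a 5120-bit bus
        # and a 1_215_000 kHz reported memory clock: 2 * 5120 / 8 = 1280 bytes per
        # cycle, and 1280 * 1.215e9 Hz is roughly 1.56e12 B/s, i.e. about 1.6 TB/s.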
        bus_bytes_per_cycle = int(2 * self.device_properties.memory_bus_width / 8)
        mem_clock_rate_Hz = self.device_properties.memory_clock_rate * 1000
        # The DDR factor of 2 is already included in bus_bytes_per_cycle.
        bytes_per_second = bus_bytes_per_cycle * mem_clock_rate_Hz
        return bytes_per_second

    def get_shared_memory_bandwidth_Bps(self) -> int:
        # Each warp can LD or ST 32 lanes x 4 bytes = 128 bytes per cycle from
        # shared memory. To get the device-wide throughput, multiply by the SM clock
        # frequency and the number of SMs.
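        # As an illustrative check with assumed A100 figures of 108 SMs at
        # 1_410_000 kHz: 108 * 128 bytes * 1.41e9 Hz is roughly 1.95e13 B/s,
        # i.e. about 19.5 TB/s of aggregate shared-memory bandwidth.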
        num_sms = self.device_properties.multi_processor_count
        bytes_per_cycle_per_sm = 128
        bytes_per_cycle_per_device = num_sms * bytes_per_cycle_per_sm
        bytes_per_second = (
            bytes_per_cycle_per_device * self.device_properties.clock_rate * 1000
        )
        return bytes_per_second
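

# A minimal usage sketch, not part of the class above: it assumes a CUDA-capable GPU
# is visible to PyTorch and that its compute capability appears in the hard-coded
# tables (sm_80, sm_90 or sm_100); otherwise the FLOPs lookups raise RuntimeError.
if __name__ == "__main__":
    if torch.cuda.is_available():
        limits = GPULimits(torch.device("cuda"))
        fp16_tc = limits.get_tflops_per_second(torch.float16)
        fp32_cuda = limits.get_tflops_per_second(torch.float32, use_tensor_cores=False)
        print(f"FP16 tensor-core peak: {fp16_tc:.1f} TFLOP/s")
        print(f"FP32 CUDA-core peak: {fp32_cuda:.1f} TFLOP/s")
        print(f"Memory bandwidth: {limits.get_memory_bandwidth_Bps() / 1e9:.0f} GB/s")
        print(f"Shared memory bandwidth: {limits.get_shared_memory_bandwidth_Bps() / 1e12:.1f} TB/s")
    else:
        print("No CUDA device is available.")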