import numpy
from .. import registry
from ..compat import cublas, cupy, cupyx
from ..types import DeviceTypes
from ..util import (
    is_cupy_array,
    is_mxnet_gpu_array,
    is_tensorflow_gpu_array,
    is_torch_cuda_array,
    mxnet2xp,
    tensorflow2xp,
    torch2xp,
)
from . import _custom_kernels
from .numpy_ops import NumpyOps
from .ops import Ops


@registry.ops("CupyOps")
class CupyOps(Ops):
    name = "cupy"
    xp = cupy
    _xp2 = cupyx

    def __init__(
        self, device_type: DeviceTypes = "gpu", device_id: int = 0, **kwargs
    ) -> None:
        self.device_type = device_type
        self.device_id = device_id

    def to_numpy(self, data, *, byte_order=None):
        if not isinstance(data, numpy.ndarray):
            data = data.get()
        if byte_order:
            dtype = data.dtype.newbyteorder(byte_order)
            data = numpy.asarray(data, dtype=dtype)
        return data

    def gather_add(self, table, indices):
        if table.dtype in ("float32", "float64"):
            return _custom_kernels.gather_add(table, indices)
        else:
            return super().gather_add(table, indices)

    def dish(self, X, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.dish(X, inplace=inplace)
        else:
            return super().dish(X, inplace=inplace)

    def backprop_dish(self, dY, X, inplace=False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_dish(dY, X, inplace=inplace)
        else:
            return super().backprop_dish(dY, X, inplace=inplace)

    def gelu(self, X, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)
        else:
            return super().gelu(X, inplace=inplace)

    def backprop_gelu(self, dY, X, inplace=False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_gelu(dY, X, inplace=inplace, threshold=6.0)
        else:
            return super().backprop_gelu(dY, X, inplace=inplace)

    def gemm(self, x, y, out=None, trans1=False, trans2=False):
        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
            raise ValueError(
                "Encountered a numpy array when processing with cupy. "
                "Did you call model.ops.asarray on your data?"
            )
        if trans1:
            x = x.T
        if trans2:
            y = y.T
        if out is None:
            return self.xp.dot(x, y)
        else:
            self.xp.dot(x, y, out=out)
            return out

    def asarray(self, data, dtype=None):
        # We'll try to perform a zero-copy conversion if possible.
        if is_cupy_array(data):
            array = self.xp.asarray(data, dtype=dtype)
        elif is_torch_cuda_array(data):
            array = torch2xp(data)
        elif is_tensorflow_gpu_array(data):
            array = tensorflow2xp(data)
        elif is_mxnet_gpu_array(data):
            array = mxnet2xp(data)
        else:
            array = self.xp.array(data, dtype=dtype)
        if dtype is not None:
            array = array.astype(dtype=dtype, copy=False)
        return array
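
    # Illustrative usage (a sketch, not part of the API; `torch_cuda_tensor`
    # is a hypothetical tensor already living on the GPU):
    #
    #     ops = CupyOps()
    #     X = ops.asarray([[1.0, 2.0]], dtype="float32")  # host data -> cupy array
    #     Y = ops.asarray(torch_cuda_tensor)               # zero-copy view when possible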

    def pad(self, seqs, round_to=1):
        """Perform padding on a list of arrays so that they each have the same
        length, by taking the maximum dimension across each axis. This only
        works on non-empty sequences with the same `ndim` and `dtype`.
        """
        # TODO: This should be generalized to handle different ranks
        if not seqs:
            raise ValueError("Cannot pad empty sequence")
        if len(set(seq.ndim for seq in seqs)) != 1:
            raise ValueError("Cannot pad sequences with different ndims")
        if len(set(seq.dtype for seq in seqs)) != 1:
            raise ValueError("Cannot pad sequences with different dtypes")
        if len(set(seq.shape[1:] for seq in seqs)) != 1:
            raise ValueError("Cannot pad sequences that differ on other dimensions")
        # Our CUDA kernel can currently only handle C contiguous arrays.
        if not all(seq.flags["C_CONTIGUOUS"] for seq in seqs) or seqs[0].dtype not in (
            "float32",
            "float64",
            "int32",
            "int64",
        ):
            return super().pad(seqs, round_to)
        return _custom_kernels.pad(seqs, round_to)
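
    # Example (sketch): padding two C-contiguous float32 sequences of shape
    # (2, 3) and (4, 3) with round_to=1 produces a single (2, 4, 3) array in
    # which the shorter sequence is zero-padded along its first axis; with
    # round_to=8 both sequences would instead be padded out to length 8.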

    def maxout(self, X):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.maxout(X)
        else:
            return super().maxout(X)

    def backprop_maxout(self, dY, which, P):
        if dY.dtype in ("float32", "float64") and which.dtype == "int32":
            return _custom_kernels.backprop_maxout(dY, which, P)
        else:
            return super().backprop_maxout(dY, which, P)

    def relu(self, X, inplace=False):
        if not inplace:
            return X * (X > 0)
        else:
            X *= X > 0
            return X

    def backprop_relu(self, dY, Y, inplace=False):
        if not inplace:
            return dY * (Y > 0)
        dY *= Y > 0
        return dY
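
    # The clipped_linear family below computes
    # y = min(max(X * slope + offset, min_val), max_val); hard-sigmoid and
    # hard-tanh style activations follow from the choice of constants.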

    def clipped_linear(
        self,
        X,
        slope: float = 1.0,
        offset: float = 0.0,
        min_val: float = 0.0,
        max_val: float = 1.0,
        inplace: bool = False,
    ):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.clipped_linear(
                X,
                inplace=inplace,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
            )
        else:
            return super().clipped_linear(
                X,
                inplace=inplace,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
            )

    def backprop_clipped_linear(
        self,
        dY,
        X,
        slope: float = 1.0,
        offset: float = 0.0,
        min_val: float = 0.0,
        max_val: float = 1.0,
        inplace: bool = False,
    ):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_clipped_linear(
                dY,
                X,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
                inplace=inplace,
            )
        else:
            return super().backprop_clipped_linear(
                dY=dY,
                X=X,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
                inplace=inplace,
            )

    def backprop_hard_swish(self, dY, X, inplace: bool = False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_hard_swish(dY, X, inplace=inplace)
        else:
            return super().backprop_hard_swish(dY, X, inplace=inplace)

    def backprop_hard_swish_mobilenet(self, dY, X, inplace: bool = False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_hard_swish_mobilenet(dY, X, inplace=inplace)
        else:
            return super().backprop_hard_swish_mobilenet(dY, X, inplace=inplace)

    def mish(self, X, threshold=20.0, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.mish(X, inplace=inplace, threshold=threshold)
        else:
            return super().mish(X, threshold, inplace)

    def backprop_mish(self, dY, X, threshold=20.0, inplace=False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_mish(
                dY, X, inplace=inplace, threshold=threshold
            )
        else:
            return super().backprop_mish(dY, X, threshold, inplace)

    def swish(self, X, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.swish(X, inplace=inplace, threshold=17.0)
        else:
            return super().swish(X, inplace=inplace)

    def backprop_swish(self, dY, X, Y, inplace=False):
        if X.dtype == dY.dtype == Y.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_swish(
                dY, X, Y, inplace=inplace, threshold=17.0
            )
        else:
            return super().backprop_swish(dY, X, Y, inplace=inplace)

    def clip_gradient(self, gradient, threshold):
        # We don't use CuPy's linalg.norm, since it relies on scalar
        # reductions run on a single CUDA block, which is a lot slower than
        # the cuBLAS implementation.
        def frobenius_norm(X):
            X_vec = X.reshape(-1)
            return cublas.nrm2(X_vec)

        grad_norm = cupy.maximum(frobenius_norm(gradient), 1e-12)
        gradient *= cupy.minimum(threshold, grad_norm) / grad_norm
        return gradient
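
    # Worked example (sketch): with threshold=1.0 and a gradient whose
    # Frobenius norm is 4.0, the gradient is scaled by min(1.0, 4.0) / 4.0 =
    # 0.25; when the norm is already <= threshold, the factor is
    # norm / norm = 1 and the gradient is left unchanged.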

    def seq2col(self, seq, nW, *, lengths=None):
        """Given an (M, N) sequence of vectors, return an (M, N*(nW*2+1))
        sequence. The new sequence is constructed by concatenating nW
        preceding and succeeding vectors onto each column in the sequence, to
        extract a window of features.
        """
        if seq.dtype in ("float32", "float64") and (
            lengths is None or lengths.dtype == "int32"
        ):
            return _custom_kernels.seq2col(seq, nW, lengths=lengths)
        else:
            return super().seq2col(seq, nW, lengths=lengths)
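
    # Example (sketch): with nW=1, each output row is the concatenation of the
    # previous, current and next input rows, so an (M, N) input becomes an
    # (M, 3 * N) output; positions whose neighbours fall outside the sequence
    # (or outside the corresponding entry of `lengths`) are filled with zeros.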

    def backprop_seq2col(self, dY, nW, *, lengths=None):
        if dY.dtype in ("float32", "float64") and (
            lengths is None or lengths.dtype == "int32"
        ):
            return _custom_kernels.backprop_seq2col(dY, nW, lengths=lengths)
        else:
            return super().backprop_seq2col(dY, nW, lengths=lengths)

    def reduce_mean(self, X, lengths):
        if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.reduce_mean(X, lengths=lengths)
        else:
            return super().reduce_mean(X, lengths)

    def backprop_reduce_mean(self, d_means, lengths):
        if d_means.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.backprop_reduce_mean(d_means, lengths)
        else:
            return super().backprop_reduce_mean(d_means, lengths)

    def reduce_max(self, X, lengths):
        if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.reduce_max(X, lengths)
        else:
            return super().reduce_max(X, lengths)

    def backprop_reduce_max(self, d_maxes, which, lengths):
        if (
            d_maxes.dtype in ("float32", "float64")
            and which.dtype == "int32"
            and lengths.dtype == "int32"
        ):
            return _custom_kernels.backprop_reduce_max(d_maxes, which, lengths)
        else:
            return super().backprop_reduce_max(d_maxes, which, lengths)

    def reduce_sum(self, X, lengths):
        if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.reduce_sum(X, lengths)
        else:
            return super().reduce_sum(X, lengths)

    def backprop_reduce_sum(self, d_sums, lengths):
        if d_sums.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.backprop_reduce_sum(d_sums, lengths)
        else:
            return super().backprop_reduce_sum(d_sums, lengths)

    def hash(self, ids, seed):
        return _custom_kernels.hash(ids, seed)

    def scatter_add(self, table, indices, values):
        self._xp2.scatter_add(table, indices, values)

    def adam(
        self, weights, gradient, mom1, mom2, beta1, beta2, eps, learn_rate, mod_rate=1.0
    ):
        _check_compatible_shape(weights, gradient)
        _check_compatible_shape(weights, mom1)
        _check_compatible_shape(weights, mom2)
        adam_kernel(
            gradient, learn_rate, 1 - beta1, 1 - beta2, eps, weights, mom1, mom2
        )
        gradient.fill(0)
        return weights, gradient, mom1, mom2

    def position_encode(self, N, D, period=10000, out=None):
        positions = NumpyOps().position_encode(N, D, period=period, out=out)
        return self.asarray(positions)
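

# `adam_kernel` fuses the Adam moment updates and the parameter step into a
# single elementwise pass over the arrays; in conventional notation it
# computes (without bias correction, using `learn_rate` as given):
#
#     m = beta1 * m + (1 - beta1) * grad
#     v = beta2 * v + (1 - beta2) * grad ** 2
#     param -= learn_rate * m / (sqrt(v) + eps)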
if cupy is not None:
    adam_kernel = cupy.ElementwiseKernel(
        "T grad, T lr, T one_minus_beta1, T one_minus_beta2, T eps",
        "T param, T m, T v",
        """m += one_minus_beta1 * (grad - m);
        v += one_minus_beta2 * (grad * grad - v);
        param -= lr * m / (sqrt(v) + eps);""",
        "adam",
    )
else:
    adam_kernel = None


def _check_compatible_shape(u, v):
    if u.shape != v.shape:
        msg = f"arrays have incompatible shapes: {u.shape} and {v.shape}"
        raise ValueError(msg)