# codon/stdlib/gpu.codon
# Copyright (C) 2022-2025 Exaloop Inc. <https://exaloop.io>
from internal.gc import sizeof as _sizeof
@tuple
class Device:
    _device: i32

    def __new__(device: int):
        from C import seq_nvptx_device(int) -> i32
        return Device(seq_nvptx_device(device))

    @staticmethod
    def count():
        from C import seq_nvptx_device_count() -> int
        return seq_nvptx_device_count()

    def __str__(self):
        from C import seq_nvptx_device_name(i32) -> str
        return seq_nvptx_device_name(self._device)

    def __index__(self):
        return int(self._device)

    def __bool__(self):
        return True

    @property
    def compute_capability(self):
        from C import seq_nvptx_device_capability(i32) -> int
        c = seq_nvptx_device_capability(self._device)
        return (c >> 32, c & 0xffffffff)
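# Usage sketch (assuming this module is imported as `gpu` in user code):
# enumerate the visible devices and print their names together with the
# compute capability, which is presumably returned as a (major, minor) pair.
# The printing format below is illustrative only.
#
#     import gpu
#
#     for i in range(gpu.Device.count()):
#         dev = gpu.Device(i)
#         major, minor = dev.compute_capability
#         print(f'{dev}: sm_{major}{minor}')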
@tuple
class Memory[T]:
    _ptr: Ptr[byte]

    def _alloc(n: int, T: type):
        from C import seq_nvptx_device_alloc(int) -> Ptr[byte]
        return Memory[T](seq_nvptx_device_alloc(n * _sizeof(T)))

    def _read(self, p: Ptr[T], n: int):
        from C import seq_nvptx_memcpy_d2h(Ptr[byte], Ptr[byte], int)
        seq_nvptx_memcpy_d2h(p.as_byte(), self._ptr, n * _sizeof(T))

    def _write(self, p: Ptr[T], n: int):
        from C import seq_nvptx_memcpy_h2d(Ptr[byte], Ptr[byte], int)
        seq_nvptx_memcpy_h2d(self._ptr, p.as_byte(), n * _sizeof(T))

    def _free(self):
        from C import seq_nvptx_device_free(Ptr[byte])
        seq_nvptx_device_free(self._ptr)

@llvm
def syncthreads() -> None:
    declare void @llvm.nvvm.barrier0()
    call void @llvm.nvvm.barrier0()
    ret {} {}
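# Usage sketch: `syncthreads()` lowers to the NVVM `barrier0` intrinsic and
# acts as a block-wide barrier inside device code. A hypothetical kernel that
# stages data in two phases would call it between the phases (names below are
# illustrative, not part of this module):
#
#     @gpu.kernel
#     def step(xs):
#         i = gpu.thread.x
#         # ... write phase ...
#         gpu.syncthreads()   # wait for every thread in the block
#         # ... read phase ...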
@tuple
class Dim3:
    _x: u32
    _y: u32
    _z: u32

    def __new__(x: int, y: int, z: int):
        return Dim3(u32(x), u32(y), u32(z))

    @property
    def x(self):
        return int(self._x)

    @property
    def y(self):
        return int(self._y)

    @property
    def z(self):
        return int(self._z)
@tuple
class Thread:
    @property
    def x(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
            ret i32 %res
        return int(get_x())

    @property
    def y(self):
        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
            ret i32 %res
        return int(get_y())

    @property
    def z(self):
        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
            ret i32 %res
        return int(get_z())
@tuple
class Block:
    @property
    def x(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
            ret i32 %res
        return int(get_x())

    @property
    def y(self):
        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
            ret i32 %res
        return int(get_y())

    @property
    def z(self):
        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
            ret i32 %res
        return int(get_z())

    @property
    def dim(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
            ret i32 %res

        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
            ret i32 %res

        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
            ret i32 %res

        return Dim3(get_x(), get_y(), get_z())
@tuple
class Grid:
    @property
    def dim(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
            ret i32 %res

        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
            ret i32 %res

        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
            ret i32 %res

        return Dim3(get_x(), get_y(), get_z())
@tuple
class Warp:
    def __len__(self):
        @pure
        @llvm
        def get_warpsize() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
            ret i32 %res
        return int(get_warpsize())
thread = Thread()
block = Block()
grid = Grid()
warp = Warp()

def _catch():
    return (thread, block, grid, warp)

_catch()
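# The `thread`, `block`, `grid` and `warp` singletons expose the NVVM special
# registers inside device code. A common pattern (a sketch, not part of this
# file) is deriving a global 1-D index from them, mirroring what
# `_gpu_loop_outline_template` below does for `@par(gpu=True)` loops:
#
#     idx = gpu.block.dim.x * gpu.block.x + gpu.thread.x
#     if idx < n:
#         ...  # process element idx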
@tuple
class AllocCache:
    v: List[Ptr[byte]]

    def add(self, p: Ptr[byte]):
        self.v.append(p)

    def free(self):
        for p in self.v:
            Memory[byte](p)._free()

def _tuple_from_gpu(args, gpu_args):
    if staticlen(args) > 0:
        a = args[0]
        g = gpu_args[0]
        a.__from_gpu__(g)
        _tuple_from_gpu(args[1:], gpu_args[1:])
def kernel(fn):
    from C import seq_nvptx_function(str) -> cobj
    from C import seq_nvptx_invoke(cobj, u32, u32, u32, u32, u32, u32, u32, cobj)

    def canonical_dim(dim):
        if isinstance(dim, NoneType):
            return (1, 1, 1)
        elif isinstance(dim, int):
            return (dim, 1, 1)
        elif isinstance(dim, Tuple[int,int]):
            return (dim[0], dim[1], 1)
        elif isinstance(dim, Tuple[int,int,int]):
            return dim
        elif isinstance(dim, Dim3):
            return (dim.x, dim.y, dim.z)
        else:
            compile_error("bad dimension argument")

    def offsets(t):
        @pure
        @llvm
        def offsetof(t: T, i: Static[int], T: type, S: type) -> int:
            %p = getelementptr {=T}, ptr null, i64 0, i32 {=i}
            %s = ptrtoint ptr %p to i64
            ret i64 %s

        if staticlen(t) == 0:
            return ()
        else:
            T = type(t)
            S = type(t[-1])
            return (*offsets(t[:-1]), offsetof(t, staticlen(t) - 1, T, S))

    def wrapper(*args, grid, block):
        grid = canonical_dim(grid)
        block = canonical_dim(block)
        cache = AllocCache([])
        shared_mem = 0
        gpu_args = tuple(arg.__to_gpu__(cache) for arg in args)
        kernel_ptr = seq_nvptx_function(__realized__(fn, gpu_args).__llvm_name__)
        p = __ptr__(gpu_args).as_byte()
        arg_ptrs = tuple((p + offset) for offset in offsets(gpu_args))
        seq_nvptx_invoke(kernel_ptr, u32(grid[0]), u32(grid[1]), u32(grid[2]), u32(block[0]),
                         u32(block[1]), u32(block[2]), u32(shared_mem), __ptr__(arg_ptrs).as_byte())
        _tuple_from_gpu(args, gpu_args)
        cache.free()

    return wrapper
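# Usage sketch for the `kernel` decorator (assuming the module is imported as
# `gpu`): the decorated function is compiled for the device, and the wrapper is
# invoked with keyword `grid` and `block` dimensions, which may each be an int,
# a 2- or 3-tuple, or a Dim3. Arguments are converted with `__to_gpu__` before
# the launch and synchronized back with `__from_gpu__` afterwards. The names
# below (`vec_add`, `a`, `b`, `c`) are illustrative only:
#
#     import gpu
#
#     @gpu.kernel
#     def vec_add(a, b, c):
#         i = gpu.thread.x
#         c[i] = a[i] + b[i]
#
#     a = [i for i in range(16)]
#     b = [2 * i for i in range(16)]
#     c = [0 for _ in range(16)]
#     vec_add(a, b, c, grid=1, block=16)   # c is copied back after the launch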
def _ptr_to_gpu(p: Ptr[T], n: int, cache: AllocCache, index_filter = lambda i: True, T: type):
    from internal.gc import atomic
    if not atomic(T):
        tmp = Ptr[T](n)
        for i in range(n):
            if index_filter(i):
                tmp[i] = p[i].__to_gpu__(cache)
        p = tmp

    mem = Memory._alloc(n, T)
    cache.add(mem._ptr)
    mem._write(p, n)
    return Ptr[T](mem._ptr)

def _ptr_from_gpu(p: Ptr[T], q: Ptr[T], n: int, index_filter = lambda i: True, T: type):
    from internal.gc import atomic
    mem = Memory[T](q.as_byte())
    if not atomic(T):
        tmp = Ptr[T](n)
        mem._read(tmp, n)
        for i in range(n):
            if index_filter(i):
                p[i] = T.__from_gpu_new__(tmp[i])
    else:
        mem._read(p, n)
@pure
@llvm
def _ptr_to_type(p: cobj, T: type) -> T:
    ret ptr %p

def _object_to_gpu(obj: T, cache: AllocCache, T: type):
    s = tuple(obj)
    gpu_mem = Memory._alloc(1, type(s))
    cache.add(gpu_mem._ptr)
    gpu_mem._write(__ptr__(s), 1)
    return _ptr_to_type(gpu_mem._ptr, T)

def _object_from_gpu(obj):
    T = type(obj)
    S = type(tuple(obj))
    tmp = T.__new__()
    p = Ptr[S](tmp.__raw__())
    q = Ptr[S](obj.__raw__())
    mem = Memory[S](q.as_byte())
    mem._read(p, 1)
    return tmp
@tuple
class Pointer[T]:
    _ptr: Ptr[T]
    _len: int

    def __to_gpu__(self, cache: AllocCache):
        return _ptr_to_gpu(self._ptr, self._len, cache)

    def __from_gpu__(self, other: Ptr[T]):
        _ptr_from_gpu(self._ptr, other, self._len)

    def __from_gpu_new__(other: Ptr[T]):
        return other

def raw(v: List[T], T: type):
    return Pointer(v.arr.ptr, len(v))
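# Usage sketch for `raw` (illustrative names): wrapping a list with `gpu.raw`
# passes only the underlying element buffer and its length to the kernel, so
# the element data is still copied to and from the device, but no separate
# device object is allocated for the list header and the host list's length is
# left untouched:
#
#     xs = [0.0 for _ in range(1024)]
#     some_kernel(gpu.raw(xs), grid=1, block=1024)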
@extend
class Ptr:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: Ptr[T]):
        pass

    def __from_gpu_new__(other: Ptr[T]):
        return other

@extend
class NoneType:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: NoneType):
        pass

    def __from_gpu_new__(other: NoneType):
        return other

@extend
class int:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: int):
        pass

    def __from_gpu_new__(other: int):
        return other

@extend
class float:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: float):
        pass

    def __from_gpu_new__(other: float):
        return other

@extend
class float32:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: float32):
        pass

    def __from_gpu_new__(other: float32):
        return other

@extend
class bool:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: bool):
        pass

    def __from_gpu_new__(other: bool):
        return other

@extend
class byte:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: byte):
        pass

    def __from_gpu_new__(other: byte):
        return other

@extend
class Int:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: Int[N]):
        pass

    def __from_gpu_new__(other: Int[N]):
        return other

@extend
class UInt:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: UInt[N]):
        pass

    def __from_gpu_new__(other: UInt[N]):
        return other

@extend
class str:
    def __to_gpu__(self, cache: AllocCache):
        n = self.len
        return str(_ptr_to_gpu(self.ptr, n, cache), n)

    def __from_gpu__(self, other: str):
        pass

    def __from_gpu_new__(other: str):
        n = other.len
        p = Ptr[byte](n)
        _ptr_from_gpu(p, other.ptr, n)
        return str(p, n)
@extend
class List:
    @inline
    def __to_gpu__(self, cache: AllocCache):
        mem = List[T].__new__()
        n = self.len
        gpu_ptr = _ptr_to_gpu(self.arr.ptr, n, cache)
        mem.arr = Array[T](gpu_ptr, n)
        mem.len = n
        return _object_to_gpu(mem, cache)

    @inline
    def __from_gpu__(self, other: List[T]):
        mem = _object_from_gpu(other)
        my_cap = self.arr.len
        other_cap = mem.arr.len
        if other_cap > my_cap:
            self._resize(other_cap)
        _ptr_from_gpu(self.arr.ptr, mem.arr.ptr, mem.len)
        self.len = mem.len

    @inline
    def __from_gpu_new__(other: List[T]):
        mem = _object_from_gpu(other)
        arr = Array[T](mem.arr.len)
        _ptr_from_gpu(arr.ptr, mem.arr.ptr, arr.len)
        mem.arr = arr
        return mem

@extend
class DynamicTuple:
    @inline
    def __to_gpu__(self, cache: AllocCache):
        n = self._len
        gpu_ptr = _ptr_to_gpu(self._ptr, n, cache)
        return DynamicTuple(gpu_ptr, n)

    @inline
    def __from_gpu__(self, other: DynamicTuple[T]):
        _ptr_from_gpu(self._ptr, other._ptr, self._len)

    @inline
    def __from_gpu_new__(other: DynamicTuple[T]):
        n = other._len
        p = Ptr[T](n)
        _ptr_from_gpu(p, other._ptr, n)
        return DynamicTuple(p, n)
@extend
class Dict:
    def __to_gpu__(self, cache: AllocCache):
        from internal.khash import __ac_fsize
        mem = Dict[K,V].__new__()
        n = self._n_buckets
        f = __ac_fsize(n) if n else 0
        mem._n_buckets = n
        mem._size = self._size
        mem._n_occupied = self._n_occupied
        mem._upper_bound = self._upper_bound
        mem._flags = _ptr_to_gpu(self._flags, f, cache)
        mem._keys = _ptr_to_gpu(self._keys, n, cache, lambda i: self._kh_exist(i))
        mem._vals = _ptr_to_gpu(self._vals, n, cache, lambda i: self._kh_exist(i))
        return _object_to_gpu(mem, cache)

    def __from_gpu__(self, other: Dict[K,V]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        my_n = self._n_buckets
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        if my_n != n:
            self._flags = Ptr[u32](f)
            self._keys = Ptr[K](n)
            self._vals = Ptr[V](n)
        _ptr_from_gpu(self._flags, mem._flags, f)
        _ptr_from_gpu(self._keys, mem._keys, n, lambda i: self._kh_exist(i))
        _ptr_from_gpu(self._vals, mem._vals, n, lambda i: self._kh_exist(i))
        self._n_buckets = n
        self._size = mem._size
        self._n_occupied = mem._n_occupied
        self._upper_bound = mem._upper_bound

    def __from_gpu_new__(other: Dict[K,V]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        flags = Ptr[u32](f)
        keys = Ptr[K](n)
        vals = Ptr[V](n)
        _ptr_from_gpu(flags, mem._flags, f)
        mem._flags = flags
        _ptr_from_gpu(keys, mem._keys, n, lambda i: mem._kh_exist(i))
        mem._keys = keys
        _ptr_from_gpu(vals, mem._vals, n, lambda i: mem._kh_exist(i))
        mem._vals = vals
        return mem
@extend
class Set:
    def __to_gpu__(self, cache: AllocCache):
        from internal.khash import __ac_fsize
        mem = Set[K].__new__()
        n = self._n_buckets
        f = __ac_fsize(n) if n else 0
        mem._n_buckets = n
        mem._size = self._size
        mem._n_occupied = self._n_occupied
        mem._upper_bound = self._upper_bound
        mem._flags = _ptr_to_gpu(self._flags, f, cache)
        mem._keys = _ptr_to_gpu(self._keys, n, cache, lambda i: self._kh_exist(i))
        return _object_to_gpu(mem, cache)

    def __from_gpu__(self, other: Set[K]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        my_n = self._n_buckets
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        if my_n != n:
            self._flags = Ptr[u32](f)
            self._keys = Ptr[K](n)
        _ptr_from_gpu(self._flags, mem._flags, f)
        _ptr_from_gpu(self._keys, mem._keys, n, lambda i: self._kh_exist(i))
        self._n_buckets = n
        self._size = mem._size
        self._n_occupied = mem._n_occupied
        self._upper_bound = mem._upper_bound

    def __from_gpu_new__(other: Set[K]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        flags = Ptr[u32](f)
        keys = Ptr[K](n)
        _ptr_from_gpu(flags, mem._flags, f)
        mem._flags = flags
        _ptr_from_gpu(keys, mem._keys, n, lambda i: mem._kh_exist(i))
        mem._keys = keys
        return mem
@extend
class Optional:
    def __to_gpu__(self, cache: AllocCache):
        if self is None:
            return self
        else:
            return Optional[T](self.__val__().__to_gpu__(cache))

    def __from_gpu__(self, other: Optional[T]):
        if self is not None and other is not None:
            self.__val__().__from_gpu__(other.__val__())

    def __from_gpu_new__(other: Optional[T]):
        if other is None:
            return Optional[T]()
        else:
            return Optional[T](T.__from_gpu_new__(other.__val__()))
@extend
class __internal__:
    def class_to_gpu(obj, cache: AllocCache):
        if isinstance(obj, Tuple):
            return tuple(a.__to_gpu__(cache) for a in obj)
        elif isinstance(obj, ByVal):
            T = type(obj)
            return T(*tuple(a.__to_gpu__(cache) for a in tuple(obj)))
        else:
            T = type(obj)
            S = type(tuple(obj))
            mem = T.__new__()
            Ptr[S](mem.__raw__())[0] = tuple(obj).__to_gpu__(cache)
            return _object_to_gpu(mem, cache)

    def class_from_gpu(obj, other):
        if isinstance(obj, Tuple):
            _tuple_from_gpu(obj, other)
        elif isinstance(obj, ByVal):
            _tuple_from_gpu(tuple(obj), tuple(other))
        else:
            S = type(tuple(obj))
            Ptr[S](obj.__raw__())[0] = S.__from_gpu_new__(tuple(_object_from_gpu(other)))

    def class_from_gpu_new(other):
        if isinstance(other, Tuple):
            return tuple(type(a).__from_gpu_new__(a) for a in other)
        elif isinstance(other, ByVal):
            T = type(other)
            return T(*tuple(type(a).__from_gpu_new__(a) for a in tuple(other)))
        else:
            S = type(tuple(other))
            mem = _object_from_gpu(other)
            Ptr[S](mem.__raw__())[0] = S.__from_gpu_new__(tuple(mem))
            return mem
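# The `__to_gpu__` / `__from_gpu__` / `__from_gpu_new__` methods defined above
# form the host-device conversion protocol used by the `kernel` wrapper, and
# the `__internal__` hooks supply a default conversion for tuples, by-value
# records and reference classes. In principle this means a plain user-defined
# class can be passed to a kernel without any GPU-specific code. A hedged
# sketch under that assumption (class, field and kernel names are hypothetical):
#
#     class Params:
#         scale: float
#         offset: float
#
#     @gpu.kernel
#     def apply(p, xs):
#         i = gpu.thread.x
#         xs[i] = xs[i] * p.scale + p.offset
#
#     xs = [float(i) for i in range(64)]
#     apply(Params(2.0, 1.0), xs, grid=1, block=64)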
# @par(gpu=True) support

@pure
@llvm
def _gpu_thread_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    ret i32 %res

@pure
@llvm
def _gpu_block_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    ret i32 %res

@pure
@llvm
def _gpu_block_dim_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    ret i32 %res
def _gpu_loop_outline_template(start, stop, args, instance: Static[int]):
    @nonpure
    def _loop_step():
        return 1

    @kernel
    def _kernel_stub(start: int, count: int, args):
        @nonpure
        def _gpu_loop_body_stub(idx, args):
            pass

        @nonpure
        def _dummy_use(n):
            pass

        _dummy_use(instance)
        idx = (int(_gpu_block_dim_x()) * int(_gpu_block_x())) + int(_gpu_thread_x())
        step = _loop_step()
        if idx < count:
            _gpu_loop_body_stub(start + (idx * step), args)

    step = _loop_step()
    loop = range(start, stop, step)
    MAX_BLOCK = 1024
    MAX_GRID = 2147483647
    G = MAX_BLOCK * MAX_GRID
    n = len(loop)

    if n == 0:
        return
    elif n > G:
        raise ValueError(f'loop exceeds GPU iteration limit of {G}')

    block = n
    grid = 1
    if n > MAX_BLOCK:
        block = MAX_BLOCK
        grid = (n // MAX_BLOCK) + (0 if n % MAX_BLOCK == 0 else 1)

    _kernel_stub(start, n, args, grid=grid, block=block)
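# `_gpu_loop_outline_template` is the outlining template behind `@par(gpu=True)`
# loops: the compiler presumably substitutes the loop body for
# `_gpu_loop_body_stub`, each GPU thread handles one iteration, and the launch
# dimensions are derived from the trip count (at most 1024 threads per block,
# per MAX_BLOCK above). A hedged usage sketch with illustrative names:
#
#     a = [i for i in range(1000000)]
#
#     @par(gpu=True)
#     for i in range(len(a)):
#         a[i] += 1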