# Copyright (C) 2022-2024 Exaloop Inc.

from internal.gc import sizeof as _sizeof

@tuple
class Device:
    _device: i32

    def __new__(device: int):
        from C import seq_nvptx_device(int) -> i32
        return Device(seq_nvptx_device(device))

    @staticmethod
    def count():
        from C import seq_nvptx_device_count() -> int
        return seq_nvptx_device_count()

    def __str__(self):
        from C import seq_nvptx_device_name(i32) -> str
        return seq_nvptx_device_name(self._device)

    def __index__(self):
        return int(self._device)

    def __bool__(self):
        return True

    @property
    def compute_capability(self):
        from C import seq_nvptx_device_capability(i32) -> int
        c = seq_nvptx_device_capability(self._device)
        return (c >> 32, c & 0xffffffff)

@tuple
class Memory[T]:
    _ptr: Ptr[byte]

    def _alloc(n: int, T: type):
        from C import seq_nvptx_device_alloc(int) -> Ptr[byte]
        return Memory[T](seq_nvptx_device_alloc(n * _sizeof(T)))

    def _read(self, p: Ptr[T], n: int):
        from C import seq_nvptx_memcpy_d2h(Ptr[byte], Ptr[byte], int)
        seq_nvptx_memcpy_d2h(p.as_byte(), self._ptr, n * _sizeof(T))

    def _write(self, p: Ptr[T], n: int):
        from C import seq_nvptx_memcpy_h2d(Ptr[byte], Ptr[byte], int)
        seq_nvptx_memcpy_h2d(self._ptr, p.as_byte(), n * _sizeof(T))

    def _free(self):
        from C import seq_nvptx_device_free(Ptr[byte])
        seq_nvptx_device_free(self._ptr)

@llvm
def syncthreads() -> None:
    declare void @llvm.nvvm.barrier0()
    call void @llvm.nvvm.barrier0()
    ret {} {}

@tuple
class Dim3:
    _x: u32
    _y: u32
    _z: u32

    def __new__(x: int, y: int, z: int):
        return Dim3(u32(x), u32(y), u32(z))

    @property
    def x(self):
        return int(self._x)

    @property
    def y(self):
        return int(self._y)

    @property
    def z(self):
        return int(self._z)

@tuple
class Thread:
    @property
    def x(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
            ret i32 %res
        return int(get_x())

    @property
    def y(self):
        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
            ret i32 %res
        return int(get_y())

    @property
    def z(self):
        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
            ret i32 %res
        return int(get_z())

@tuple
class Block:
    @property
    def x(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
            ret i32 %res
        return int(get_x())

    @property
    def y(self):
        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
            ret i32 %res
        return int(get_y())

    @property
    def z(self):
        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
            ret i32 %res
        return int(get_z())

    @property
    def dim(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
            ret i32 %res

        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
            ret i32 %res

        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
            ret i32 %res

        return Dim3(get_x(), get_y(), get_z())
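# Illustrative note (not part of the original module): inside a kernel body,
# the `thread` and `block` singletons defined below expose the NVVM special
# registers read above, so a global 1-D index is typically computed as
#
#     i = block.x * block.dim.x + thread.x
#
# `grid.dim` (below) exposes the launch's grid dimensions in the same way.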
@tuple
class Grid:
    @property
    def dim(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
            ret i32 %res

        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
            ret i32 %res

        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
            ret i32 %res

        return Dim3(get_x(), get_y(), get_z())

@tuple
class Warp:
    def __len__(self):
        @pure
        @llvm
        def get_warpsize() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
            ret i32 %res
        return int(get_warpsize())

thread = Thread()
block = Block()
grid = Grid()
warp = Warp()

def _catch():
    return (thread, block, grid, warp)

_catch()

@tuple
class AllocCache:
    v: List[Ptr[byte]]

    def add(self, p: Ptr[byte]):
        self.v.append(p)

    def free(self):
        for p in self.v:
            Memory[byte](p)._free()

def _tuple_from_gpu(args, gpu_args):
    if staticlen(args) > 0:
        a = args[0]
        g = gpu_args[0]
        a.__from_gpu__(g)
        _tuple_from_gpu(args[1:], gpu_args[1:])

def kernel(fn):
    from C import seq_nvptx_function(str) -> cobj
    from C import seq_nvptx_invoke(cobj, u32, u32, u32, u32, u32, u32, u32, cobj)

    def canonical_dim(dim):
        if isinstance(dim, NoneType):
            return (1, 1, 1)
        elif isinstance(dim, int):
            return (dim, 1, 1)
        elif isinstance(dim, Tuple[int,int]):
            return (dim[0], dim[1], 1)
        elif isinstance(dim, Tuple[int,int,int]):
            return dim
        elif isinstance(dim, Dim3):
            return (dim.x, dim.y, dim.z)
        else:
            compile_error("bad dimension argument")

    def offsets(t):
        @pure
        @llvm
        def offsetof(t: T, i: Static[int], T: type, S: type) -> int:
            %p = getelementptr {=T}, ptr null, i64 0, i32 {=i}
            %s = ptrtoint ptr %p to i64
            ret i64 %s

        if staticlen(t) == 0:
            return ()
        else:
            T = type(t)
            S = type(t[-1])
            return (*offsets(t[:-1]), offsetof(t, staticlen(t) - 1, T, S))

    def wrapper(*args, grid, block):
        grid = canonical_dim(grid)
        block = canonical_dim(block)
        cache = AllocCache([])
        shared_mem = 0
        gpu_args = tuple(arg.__to_gpu__(cache) for arg in args)
        kernel_ptr = seq_nvptx_function(__realized__(fn, gpu_args).__llvm_name__)
        p = __ptr__(gpu_args).as_byte()
        arg_ptrs = tuple((p + offset) for offset in offsets(gpu_args))
        seq_nvptx_invoke(kernel_ptr, u32(grid[0]), u32(grid[1]), u32(grid[2]),
                         u32(block[0]), u32(block[1]), u32(block[2]),
                         u32(shared_mem), __ptr__(arg_ptrs).as_byte())
        _tuple_from_gpu(args, gpu_args)
        cache.free()

    return wrapper

def _ptr_to_gpu(p: Ptr[T], n: int, cache: AllocCache, index_filter = lambda i: True, T: type):
    from internal.gc import atomic
    if not atomic(T):
        tmp = Ptr[T](n)
        for i in range(n):
            if index_filter(i):
                tmp[i] = p[i].__to_gpu__(cache)
        p = tmp

    mem = Memory._alloc(n, T)
    cache.add(mem._ptr)
    mem._write(p, n)
    return Ptr[T](mem._ptr)

def _ptr_from_gpu(p: Ptr[T], q: Ptr[T], n: int, index_filter = lambda i: True, T: type):
    from internal.gc import atomic
    mem = Memory[T](q.as_byte())
    if not atomic(T):
        tmp = Ptr[T](n)
        mem._read(tmp, n)
        for i in range(n):
            if index_filter(i):
                p[i] = T.__from_gpu_new__(tmp[i])
    else:
        mem._read(p, n)

@pure
@llvm
def _ptr_to_type(p: cobj, T: type) -> T:
    ret ptr %p

def _object_to_gpu(obj: T, cache: AllocCache, T: type):
    s = tuple(obj)
    gpu_mem = Memory._alloc(1, type(s))
    cache.add(gpu_mem._ptr)
    gpu_mem._write(__ptr__(s), 1)
    return _ptr_to_type(gpu_mem._ptr, T)

def _object_from_gpu(obj):
    T = type(obj)
    S = type(tuple(obj))
    tmp = T.__new__()
    p = Ptr[S](tmp.__raw__())
    q = Ptr[S](obj.__raw__())
    mem = Memory[S](q.as_byte())
    mem._read(p, 1)
    return tmp
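# Usage sketch (assumes this module is imported as `gpu` and that a CUDA
# device is available; the names `vector_add`, `a`, `b`, `c` are illustrative
# only, not part of this module):
#
#     import gpu
#
#     @gpu.kernel
#     def vector_add(a, b, c):
#         i = gpu.block.x * gpu.block.dim.x + gpu.thread.x
#         c[i] = a[i] + b[i]
#
#     a = [float(i) for i in range(16)]
#     b = [2.0 * i for i in range(16)]
#     c = [0.0 for _ in range(16)]
#     vector_add(a, b, c, grid=1, block=16)
#
# The `kernel` wrapper above converts each argument with `__to_gpu__`, launches
# the realized kernel through `seq_nvptx_invoke`, then copies results back with
# `__from_gpu__` and frees the cached device allocations.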
@tuple
class Pointer[T]:
    _ptr: Ptr[T]
    _len: int

    def __to_gpu__(self, cache: AllocCache):
        return _ptr_to_gpu(self._ptr, self._len, cache)

    def __from_gpu__(self, other: Ptr[T]):
        _ptr_from_gpu(self._ptr, other, self._len)

    def __from_gpu_new__(other: Ptr[T]):
        return other

def raw(v: List[T], T: type):
    return Pointer(v.arr.ptr, len(v))

@extend
class Ptr:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: Ptr[T]):
        pass

    def __from_gpu_new__(other: Ptr[T]):
        return other

@extend
class NoneType:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: NoneType):
        pass

    def __from_gpu_new__(other: NoneType):
        return other

@extend
class int:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: int):
        pass

    def __from_gpu_new__(other: int):
        return other

@extend
class float:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: float):
        pass

    def __from_gpu_new__(other: float):
        return other

@extend
class float32:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: float32):
        pass

    def __from_gpu_new__(other: float32):
        return other

@extend
class bool:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: bool):
        pass

    def __from_gpu_new__(other: bool):
        return other

@extend
class byte:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: byte):
        pass

    def __from_gpu_new__(other: byte):
        return other

@extend
class Int:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: Int[N]):
        pass

    def __from_gpu_new__(other: Int[N]):
        return other

@extend
class UInt:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: UInt[N]):
        pass

    def __from_gpu_new__(other: UInt[N]):
        return other

@extend
class str:
    def __to_gpu__(self, cache: AllocCache):
        n = self.len
        return str(_ptr_to_gpu(self.ptr, n, cache), n)

    def __from_gpu__(self, other: str):
        pass

    def __from_gpu_new__(other: str):
        n = other.len
        p = Ptr[byte](n)
        _ptr_from_gpu(p, other.ptr, n)
        return str(p, n)

@extend
class List:
    @inline
    def __to_gpu__(self, cache: AllocCache):
        mem = List[T].__new__()
        n = self.len
        gpu_ptr = _ptr_to_gpu(self.arr.ptr, n, cache)
        mem.arr = Array[T](gpu_ptr, n)
        mem.len = n
        return _object_to_gpu(mem, cache)

    @inline
    def __from_gpu__(self, other: List[T]):
        mem = _object_from_gpu(other)
        my_cap = self.arr.len
        other_cap = mem.arr.len
        if other_cap > my_cap:
            self._resize(other_cap)
        _ptr_from_gpu(self.arr.ptr, mem.arr.ptr, mem.len)
        self.len = mem.len

    @inline
    def __from_gpu_new__(other: List[T]):
        mem = _object_from_gpu(other)
        arr = Array[T](mem.arr.len)
        _ptr_from_gpu(arr.ptr, mem.arr.ptr, arr.len)
        mem.arr = arr
        return mem

@extend
class DynamicTuple:
    @inline
    def __to_gpu__(self, cache: AllocCache):
        n = self._len
        gpu_ptr = _ptr_to_gpu(self._ptr, n, cache)
        return DynamicTuple(gpu_ptr, n)

    @inline
    def __from_gpu__(self, other: DynamicTuple[T]):
        _ptr_from_gpu(self._ptr, other._ptr, self._len)

    @inline
    def __from_gpu_new__(other: DynamicTuple[T]):
        n = other._len
        p = Ptr[T](n)
        _ptr_from_gpu(p, other._ptr, n)
        return DynamicTuple(p, n)
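# Conversion-protocol sketch (hypothetical `MyType`, for illustration only):
# a type participates in kernel argument passing by providing the same three
# hooks that the extensions in this file implement:
#
#     @extend
#     class MyType:
#         def __to_gpu__(self, cache: AllocCache):
#             # return a device-resident copy, registering allocations in `cache`
#             ...
#
#         def __from_gpu__(self, other):
#             # copy device-side state from `other` back into `self`
#             ...
#
#         def __from_gpu_new__(other):
#             # build a fresh host-side value from device-side state
#             ...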
@extend
class Dict:
    def __to_gpu__(self, cache: AllocCache):
        from internal.khash import __ac_fsize
        mem = Dict[K,V].__new__()
        n = self._n_buckets
        f = __ac_fsize(n) if n else 0
        mem._n_buckets = n
        mem._size = self._size
        mem._n_occupied = self._n_occupied
        mem._upper_bound = self._upper_bound
        mem._flags = _ptr_to_gpu(self._flags, f, cache)
        mem._keys = _ptr_to_gpu(self._keys, n, cache, lambda i: self._kh_exist(i))
        mem._vals = _ptr_to_gpu(self._vals, n, cache, lambda i: self._kh_exist(i))
        return _object_to_gpu(mem, cache)

    def __from_gpu__(self, other: Dict[K,V]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        my_n = self._n_buckets
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        if my_n != n:
            self._flags = Ptr[u32](f)
            self._keys = Ptr[K](n)
            self._vals = Ptr[V](n)
        _ptr_from_gpu(self._flags, mem._flags, f)
        _ptr_from_gpu(self._keys, mem._keys, n, lambda i: self._kh_exist(i))
        _ptr_from_gpu(self._vals, mem._vals, n, lambda i: self._kh_exist(i))
        self._n_buckets = n
        self._size = mem._size
        self._n_occupied = mem._n_occupied
        self._upper_bound = mem._upper_bound

    def __from_gpu_new__(other: Dict[K,V]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        flags = Ptr[u32](f)
        keys = Ptr[K](n)
        vals = Ptr[V](n)
        _ptr_from_gpu(flags, mem._flags, f)
        mem._flags = flags
        _ptr_from_gpu(keys, mem._keys, n, lambda i: mem._kh_exist(i))
        mem._keys = keys
        _ptr_from_gpu(vals, mem._vals, n, lambda i: mem._kh_exist(i))
        mem._vals = vals
        return mem

@extend
class Set:
    def __to_gpu__(self, cache: AllocCache):
        from internal.khash import __ac_fsize
        mem = Set[K].__new__()
        n = self._n_buckets
        f = __ac_fsize(n) if n else 0
        mem._n_buckets = n
        mem._size = self._size
        mem._n_occupied = self._n_occupied
        mem._upper_bound = self._upper_bound
        mem._flags = _ptr_to_gpu(self._flags, f, cache)
        mem._keys = _ptr_to_gpu(self._keys, n, cache, lambda i: self._kh_exist(i))
        return _object_to_gpu(mem, cache)

    def __from_gpu__(self, other: Set[K]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        my_n = self._n_buckets
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        if my_n != n:
            self._flags = Ptr[u32](f)
            self._keys = Ptr[K](n)
        _ptr_from_gpu(self._flags, mem._flags, f)
        _ptr_from_gpu(self._keys, mem._keys, n, lambda i: self._kh_exist(i))
        self._n_buckets = n
        self._size = mem._size
        self._n_occupied = mem._n_occupied
        self._upper_bound = mem._upper_bound

    def __from_gpu_new__(other: Set[K]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        flags = Ptr[u32](f)
        keys = Ptr[K](n)
        _ptr_from_gpu(flags, mem._flags, f)
        mem._flags = flags
        _ptr_from_gpu(keys, mem._keys, n, lambda i: mem._kh_exist(i))
        mem._keys = keys
        return mem

@extend
class Optional:
    def __to_gpu__(self, cache: AllocCache):
        if self is None:
            return self
        else:
            return Optional[T](self.__val__().__to_gpu__(cache))

    def __from_gpu__(self, other: Optional[T]):
        if self is not None and other is not None:
            self.__val__().__from_gpu__(other.__val__())

    def __from_gpu_new__(other: Optional[T]):
        if other is None:
            return Optional[T]()
        else:
            return Optional[T](T.__from_gpu_new__(other.__val__()))
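# Note on the generic fallback below (descriptive, based on the code that
# follows): classes without a dedicated extension are handled by
# `__internal__.class_to_gpu` / `class_from_gpu` / `class_from_gpu_new`, which
# view an object as the tuple of its fields, convert each field recursively,
# and move the resulting tuple through `_object_to_gpu` / `_object_from_gpu`.
# A hypothetical
#
#     class Point:
#         x: float
#         y: float
#
# is therefore transferred by converting `(p.x, p.y)` and writing that tuple
# into a device-side copy of the object.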
@extend
class __internal__:
    def class_to_gpu(obj, cache: AllocCache):
        if isinstance(obj, Tuple):
            return tuple(a.__to_gpu__(cache) for a in obj)
        elif isinstance(obj, ByVal):
            T = type(obj)
            return T(*tuple(a.__to_gpu__(cache) for a in tuple(obj)))
        else:
            T = type(obj)
            S = type(tuple(obj))
            mem = T.__new__()
            Ptr[S](mem.__raw__())[0] = tuple(obj).__to_gpu__(cache)
            return _object_to_gpu(mem, cache)

    def class_from_gpu(obj, other):
        if isinstance(obj, Tuple):
            _tuple_from_gpu(obj, other)
        elif isinstance(obj, ByVal):
            _tuple_from_gpu(tuple(obj), tuple(other))
        else:
            S = type(tuple(obj))
            Ptr[S](obj.__raw__())[0] = S.__from_gpu_new__(tuple(_object_from_gpu(other)))

    def class_from_gpu_new(other):
        if isinstance(other, Tuple):
            return tuple(type(a).__from_gpu_new__(a) for a in other)
        elif isinstance(other, ByVal):
            T = type(other)
            return T(*tuple(type(a).__from_gpu_new__(a) for a in tuple(other)))
        else:
            S = type(tuple(other))
            mem = _object_from_gpu(other)
            Ptr[S](mem.__raw__())[0] = S.__from_gpu_new__(tuple(mem))
            return mem

# @par(gpu=True) support

@pure
@llvm
def _gpu_thread_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    ret i32 %res

@pure
@llvm
def _gpu_block_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    ret i32 %res

@pure
@llvm
def _gpu_block_dim_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    ret i32 %res

def _gpu_loop_outline_template(start, stop, args, instance: Static[int]):
    @nonpure
    def _loop_step():
        return 1

    @kernel
    def _kernel_stub(start: int, count: int, args):
        @nonpure
        def _gpu_loop_body_stub(idx, args):
            pass

        @nonpure
        def _dummy_use(n):
            pass

        _dummy_use(instance)
        idx = (int(_gpu_block_dim_x()) * int(_gpu_block_x())) + int(_gpu_thread_x())
        step = _loop_step()
        if idx < count:
            _gpu_loop_body_stub(start + (idx * step), args)

    step = _loop_step()
    loop = range(start, stop, step)
    MAX_BLOCK = 1024
    MAX_GRID = 2147483647
    G = MAX_BLOCK * MAX_GRID
    n = len(loop)

    if n == 0:
        return
    elif n > G:
        raise ValueError(f'loop exceeds GPU iteration limit of {G}')

    block = n
    grid = 1
    if n > MAX_BLOCK:
        block = MAX_BLOCK
        grid = (n // MAX_BLOCK) + (0 if n % MAX_BLOCK == 0 else 1)

    _kernel_stub(start, n, args, grid=grid, block=block)
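# Usage sketch for the loop outline above (not executed here; `a`, `b`, `c`
# are illustrative lists): a `@par(gpu=True)` loop is outlined by the compiler
# into `_gpu_loop_body_stub` and launched through `_kernel_stub` with the
# grid/block split computed above, e.g.
#
#     @par(gpu=True)
#     for i in range(16):
#         c[i] = a[i] + b[i]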