# codon/stdlib/gpu.codon (mirror of https://github.com/exaloop/codon.git)
# Copyright (C) 2022 Exaloop Inc. <https://exaloop.io>

from internal.gc import sizeof as _sizeof

@tuple
class Device:
    _device: i32

    def __new__(device: int):
        from C import seq_nvptx_device(int) -> i32
        return Device(seq_nvptx_device(device))

    @staticmethod
    def count():
        from C import seq_nvptx_device_count() -> int
        return seq_nvptx_device_count()

    def __str__(self):
        from C import seq_nvptx_device_name(i32) -> str
        return seq_nvptx_device_name(self._device)

    def __index__(self):
        return int(self._device)

    def __bool__(self):
        return True

    @property
    def compute_capability(self):
        from C import seq_nvptx_device_capability(i32) -> int
        c = seq_nvptx_device_capability(self._device)
        return (c >> 32, c & 0xffffffff)
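# Usage sketch (illustrative only; assumes at least one CUDA device is visible):
#
#     import gpu
#     if gpu.Device.count() > 0:
#         dev = gpu.Device(0)
#         print(dev, dev.compute_capability)  # device name and (major, minor)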
@tuple
class Memory[T]:
    _ptr: Ptr[byte]

    def _alloc(n: int, T: type):
        from C import seq_nvptx_device_alloc(int) -> Ptr[byte]
        return Memory[T](seq_nvptx_device_alloc(n * _sizeof(T)))

    def _read(self, p: Ptr[T], n: int):
        from C import seq_nvptx_memcpy_d2h(Ptr[byte], Ptr[byte], int)
        seq_nvptx_memcpy_d2h(p.as_byte(), self._ptr, n * _sizeof(T))

    def _write(self, p: Ptr[T], n: int):
        from C import seq_nvptx_memcpy_h2d(Ptr[byte], Ptr[byte], int)
        seq_nvptx_memcpy_h2d(self._ptr, p.as_byte(), n * _sizeof(T))

    def _free(self):
        from C import seq_nvptx_device_free(Ptr[byte])
        seq_nvptx_device_free(self._ptr)
@llvm
def syncthreads() -> None:
    declare void @llvm.nvvm.barrier0()
    call void @llvm.nvvm.barrier0()
    ret {} {}
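# Typical use (sketch): a block-wide barrier inside a @gpu.kernel body, so that
# values written by one thread are visible to the other threads of the same block:
#
#     # ... each thread writes its partial result ...
#     gpu.syncthreads()
#     # ... all threads of the block can now read their peers' results ...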
@tuple
class Dim3:
    _x: u32
    _y: u32
    _z: u32

    def __new__(x: int, y: int, z: int):
        return Dim3(u32(x), u32(y), u32(z))

    @property
    def x(self):
        return int(self._x)

    @property
    def y(self):
        return int(self._y)

    @property
    def z(self):
        return int(self._z)
@tuple
class Thread:
    @property
    def x(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
            ret i32 %res
        return int(get_x())

    @property
    def y(self):
        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
            ret i32 %res
        return int(get_y())

    @property
    def z(self):
        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
            ret i32 %res
        return int(get_z())
@tuple
class Block:
    @property
    def x(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
            ret i32 %res
        return int(get_x())

    @property
    def y(self):
        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
            ret i32 %res
        return int(get_y())

    @property
    def z(self):
        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
            ret i32 %res
        return int(get_z())

    @property
    def dim(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
            ret i32 %res

        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
            ret i32 %res

        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
            ret i32 %res

        return Dim3(get_x(), get_y(), get_z())
@tuple
class Grid:
    @property
    def dim(self):
        @pure
        @llvm
        def get_x() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
            ret i32 %res

        @pure
        @llvm
        def get_y() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
            ret i32 %res

        @pure
        @llvm
        def get_z() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
            ret i32 %res

        return Dim3(get_x(), get_y(), get_z())
@tuple
class Warp:
    def __len__(self):
        @pure
        @llvm
        def get_warpsize() -> u32:
            declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
            %res = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
            ret i32 %res
        return int(get_warpsize())

thread = Thread()
block = Block()
grid = Grid()
warp = Warp()

def _catch():
    return (thread, block, grid, warp)

_catch()
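# Inside a kernel, the conventional global (linear) thread index can be built from
# these singletons (sketch, mirroring the computation in _gpu_loop_outline_template below):
#
#     i = gpu.block.dim.x * gpu.block.x + gpu.thread.x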
@tuple
class AllocCache:
    v: List[Ptr[byte]]

    def add(self, p: Ptr[byte]):
        self.v.append(p)

    def free(self):
        for p in self.v:
            Memory[byte](p)._free()

def _tuple_from_gpu(args, gpu_args):
    if staticlen(args) > 0:
        a = args[0]
        g = gpu_args[0]
        a.__from_gpu__(g)
        _tuple_from_gpu(args[1:], gpu_args[1:])
def kernel(fn):
    from C import seq_nvptx_function(str) -> cobj
    from C import seq_nvptx_invoke(cobj, u32, u32, u32, u32, u32, u32, u32, cobj)

    def canonical_dim(dim):
        if isinstance(dim, NoneType):
            return (1, 1, 1)
        elif isinstance(dim, int):
            return (dim, 1, 1)
        elif isinstance(dim, Tuple[int,int]):
            return (dim[0], dim[1], 1)
        elif isinstance(dim, Tuple[int,int,int]):
            return dim
        elif isinstance(dim, Dim3):
            return (dim.x, dim.y, dim.z)
        else:
            compile_error("bad dimension argument")

    def offsets(t):
        @pure
        @llvm
        def offsetof(t: T, i: Static[int], T: type, S: type) -> int:
            %p = getelementptr {=T}, ptr null, i64 0, i32 {=i}
            %s = ptrtoint ptr %p to i64
            ret i64 %s

        if staticlen(t) == 0:
            return ()
        else:
            T = type(t)
            S = type(t[-1])
            return (*offsets(t[:-1]), offsetof(t, staticlen(t) - 1, T, S))

    def wrapper(*args, grid, block):
        grid = canonical_dim(grid)
        block = canonical_dim(block)
        cache = AllocCache([])
        shared_mem = 0
        gpu_args = tuple(arg.__to_gpu__(cache) for arg in args)
        kernel_ptr = seq_nvptx_function(__realized__(fn, gpu_args).__llvm_name__)
        p = __ptr__(gpu_args).as_byte()
        arg_ptrs = tuple((p + offset) for offset in offsets(gpu_args))
        seq_nvptx_invoke(kernel_ptr, u32(grid[0]), u32(grid[1]), u32(grid[2]), u32(block[0]),
                         u32(block[1]), u32(block[2]), u32(shared_mem), __ptr__(arg_ptrs).as_byte())
        _tuple_from_gpu(args, gpu_args)
        cache.free()

    return wrapper
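# Usage sketch for the @kernel decorator (illustrative; requires a CUDA-capable GPU):
#
#     import gpu
#
#     @gpu.kernel
#     def vec_add(a, b, c):
#         i = gpu.thread.x
#         c[i] = a[i] + b[i]
#
#     a = [i for i in range(16)]
#     b = [2 * i for i in range(16)]
#     c = [0 for _ in range(16)]
#     vec_add(a, b, c, grid=1, block=16)  # grid/block accept int, 2-/3-tuples, or Dim3
#     print(c)
#
# Arguments are copied to the device via __to_gpu__, the kernel is launched through
# seq_nvptx_invoke, and results are copied back via __from_gpu__ before the cached
# device allocations are freed.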
def _ptr_to_gpu(p: Ptr[T], n: int, cache: AllocCache, index_filter = lambda i: True, T: type):
    from internal.gc import atomic
    if not atomic(T):
        tmp = Ptr[T](n)
        for i in range(n):
            if index_filter(i):
                tmp[i] = p[i].__to_gpu__(cache)
        p = tmp
    mem = Memory._alloc(n, T)
    cache.add(mem._ptr)
    mem._write(p, n)
    return Ptr[T](mem._ptr)

def _ptr_from_gpu(p: Ptr[T], q: Ptr[T], n: int, index_filter = lambda i: True, T: type):
    from internal.gc import atomic
    mem = Memory[T](q.as_byte())
    if not atomic(T):
        tmp = Ptr[T](n)
        mem._read(tmp, n)
        for i in range(n):
            if index_filter(i):
                p[i] = T.__from_gpu_new__(tmp[i])
    else:
        mem._read(p, n)

@pure
@llvm
def _ptr_to_type(p: cobj, T: type) -> T:
    ret ptr %p

def _object_to_gpu(obj: T, cache: AllocCache, T: type):
    s = tuple(obj)
    gpu_mem = Memory._alloc(1, type(s))
    cache.add(gpu_mem._ptr)
    gpu_mem._write(__ptr__(s), 1)
    return _ptr_to_type(gpu_mem._ptr, T)

def _object_from_gpu(obj):
    T = type(obj)
    S = type(tuple(obj))
    tmp = T.__new__()
    p = Ptr[S](tmp.__raw__())
    q = Ptr[S](obj.__raw__())
    mem = Memory[S](q.as_byte())
    mem._read(p, 1)
    return tmp
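# Conversion protocol used throughout the rest of this module: a type participates
# in kernel argument passing by providing
#   __to_gpu__(self, cache)    -- return a device-side copy, registering allocations in `cache`
#   __from_gpu__(self, other)  -- copy device-side state in `other` back into `self`
#   __from_gpu_new__(other)    -- build a fresh host-side value from a device-side copy
# Scalar types pass through unchanged; reference types are copied element by element.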
@tuple
class Pointer[T]:
    _ptr: Ptr[T]
    _len: int

    def __to_gpu__(self, cache: AllocCache):
        return _ptr_to_gpu(self._ptr, self._len, cache)

    def __from_gpu__(self, other: Ptr[T]):
        _ptr_from_gpu(self._ptr, other, self._len)

    def __from_gpu_new__(other: Ptr[T]):
        return other

def raw(v: List[T], T: type):
    return Pointer(v.arr.ptr, len(v))
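# Usage sketch: gpu.raw passes a list's underlying buffer to a kernel as a raw
# pointer instead of a device-side List object; the element data is still copied
# to the device and back in place (illustrative only):
#
#     x = [1, 2, 3, 4]
#     # some_kernel(gpu.raw(x), ..., grid=..., block=...)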
@extend
class Ptr:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: Ptr[T]):
        pass

    def __from_gpu_new__(other: Ptr[T]):
        return other
@extend
class NoneType:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: NoneType):
        pass

    def __from_gpu_new__(other: NoneType):
        return other

@extend
class int:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: int):
        pass

    def __from_gpu_new__(other: int):
        return other

@extend
class float:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: float):
        pass

    def __from_gpu_new__(other: float):
        return other

@extend
class float32:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: float32):
        pass

    def __from_gpu_new__(other: float32):
        return other

@extend
class bool:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: bool):
        pass

    def __from_gpu_new__(other: bool):
        return other

@extend
class byte:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: byte):
        pass

    def __from_gpu_new__(other: byte):
        return other

@extend
class Int:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: Int[N]):
        pass

    def __from_gpu_new__(other: Int[N]):
        return other

@extend
class UInt:
    def __to_gpu__(self, cache: AllocCache):
        return self

    def __from_gpu__(self, other: UInt[N]):
        pass

    def __from_gpu_new__(other: UInt[N]):
        return other
@extend
class str:
    def __to_gpu__(self, cache: AllocCache):
        n = self.len
        return str(_ptr_to_gpu(self.ptr, n, cache), n)

    def __from_gpu__(self, other: str):
        pass

    def __from_gpu_new__(other: str):
        n = other.len
        p = Ptr[byte](n)
        _ptr_from_gpu(p, other.ptr, n)
        return str(p, n)
@extend
class List:
    @inline
    def __to_gpu__(self, cache: AllocCache):
        mem = List[T].__new__()
        n = self.len
        gpu_ptr = _ptr_to_gpu(self.arr.ptr, n, cache)
        mem.arr = Array[T](gpu_ptr, n)
        mem.len = n
        return _object_to_gpu(mem, cache)

    @inline
    def __from_gpu__(self, other: List[T]):
        mem = _object_from_gpu(other)
        my_cap = self.arr.len
        other_cap = mem.arr.len
        if other_cap > my_cap:
            self._resize(other_cap)
        _ptr_from_gpu(self.arr.ptr, mem.arr.ptr, mem.len)
        self.len = mem.len

    @inline
    def __from_gpu_new__(other: List[T]):
        mem = _object_from_gpu(other)
        arr = Array[T](mem.arr.len)
        _ptr_from_gpu(arr.ptr, mem.arr.ptr, arr.len)
        mem.arr = arr
        return mem
@extend
class DynamicTuple:
    @inline
    def __to_gpu__(self, cache: AllocCache):
        n = self._len
        gpu_ptr = _ptr_to_gpu(self._ptr, n, cache)
        return DynamicTuple(gpu_ptr, n)

    @inline
    def __from_gpu__(self, other: DynamicTuple[T]):
        _ptr_from_gpu(self._ptr, other._ptr, self._len)

    @inline
    def __from_gpu_new__(other: DynamicTuple[T]):
        n = other._len
        p = Ptr[T](n)
        _ptr_from_gpu(p, other._ptr, n)
        return DynamicTuple(p, n)
@extend
class Dict:
    def __to_gpu__(self, cache: AllocCache):
        from internal.khash import __ac_fsize
        mem = Dict[K,V].__new__()
        n = self._n_buckets
        f = __ac_fsize(n) if n else 0
        mem._n_buckets = n
        mem._size = self._size
        mem._n_occupied = self._n_occupied
        mem._upper_bound = self._upper_bound
        mem._flags = _ptr_to_gpu(self._flags, f, cache)
        mem._keys = _ptr_to_gpu(self._keys, n, cache, lambda i: self._kh_exist(i))
        mem._vals = _ptr_to_gpu(self._vals, n, cache, lambda i: self._kh_exist(i))
        return _object_to_gpu(mem, cache)

    def __from_gpu__(self, other: Dict[K,V]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        my_n = self._n_buckets
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        if my_n != n:
            self._flags = Ptr[u32](f)
            self._keys = Ptr[K](n)
            self._vals = Ptr[V](n)
        _ptr_from_gpu(self._flags, mem._flags, f)
        _ptr_from_gpu(self._keys, mem._keys, n, lambda i: self._kh_exist(i))
        _ptr_from_gpu(self._vals, mem._vals, n, lambda i: self._kh_exist(i))
        self._n_buckets = n
        self._size = mem._size
        self._n_occupied = mem._n_occupied
        self._upper_bound = mem._upper_bound

    def __from_gpu_new__(other: Dict[K,V]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        flags = Ptr[u32](f)
        keys = Ptr[K](n)
        vals = Ptr[V](n)
        _ptr_from_gpu(flags, mem._flags, f)
        mem._flags = flags
        _ptr_from_gpu(keys, mem._keys, n, lambda i: mem._kh_exist(i))
        mem._keys = keys
        _ptr_from_gpu(vals, mem._vals, n, lambda i: mem._kh_exist(i))
        mem._vals = vals
        return mem
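# Note on Dict above (and Set below): the khash-style internals (_flags, _keys, _vals)
# are copied wholesale, while the _kh_exist index filter ensures that only occupied
# buckets go through per-element __to_gpu__/__from_gpu_new__ conversion.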
@extend
class Set:
    def __to_gpu__(self, cache: AllocCache):
        from internal.khash import __ac_fsize
        mem = Set[K].__new__()
        n = self._n_buckets
        f = __ac_fsize(n) if n else 0
        mem._n_buckets = n
        mem._size = self._size
        mem._n_occupied = self._n_occupied
        mem._upper_bound = self._upper_bound
        mem._flags = _ptr_to_gpu(self._flags, f, cache)
        mem._keys = _ptr_to_gpu(self._keys, n, cache, lambda i: self._kh_exist(i))
        return _object_to_gpu(mem, cache)

    def __from_gpu__(self, other: Set[K]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        my_n = self._n_buckets
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        if my_n != n:
            self._flags = Ptr[u32](f)
            self._keys = Ptr[K](n)
        _ptr_from_gpu(self._flags, mem._flags, f)
        _ptr_from_gpu(self._keys, mem._keys, n, lambda i: self._kh_exist(i))
        self._n_buckets = n
        self._size = mem._size
        self._n_occupied = mem._n_occupied
        self._upper_bound = mem._upper_bound

    def __from_gpu_new__(other: Set[K]):
        from internal.khash import __ac_fsize
        mem = _object_from_gpu(other)
        n = mem._n_buckets
        f = __ac_fsize(n) if n else 0
        flags = Ptr[u32](f)
        keys = Ptr[K](n)
        _ptr_from_gpu(flags, mem._flags, f)
        mem._flags = flags
        _ptr_from_gpu(keys, mem._keys, n, lambda i: mem._kh_exist(i))
        mem._keys = keys
        return mem
@extend
class Optional:
    def __to_gpu__(self, cache: AllocCache):
        if self is None:
            return self
        else:
            return Optional[T](self.__val__().__to_gpu__(cache))

    def __from_gpu__(self, other: Optional[T]):
        if self is not None and other is not None:
            self.__val__().__from_gpu__(other.__val__())

    def __from_gpu_new__(other: Optional[T]):
        if other is None:
            return Optional[T]()
        else:
            return Optional[T](T.__from_gpu_new__(other.__val__()))
@extend
class __internal__:
    def class_to_gpu(obj, cache: AllocCache):
        if isinstance(obj, Tuple):
            return tuple(a.__to_gpu__(cache) for a in obj)
        elif isinstance(obj, ByVal):
            T = type(obj)
            return T(*tuple(a.__to_gpu__(cache) for a in tuple(obj)))
        else:
            T = type(obj)
            S = type(tuple(obj))
            mem = T.__new__()
            Ptr[S](mem.__raw__())[0] = tuple(obj).__to_gpu__(cache)
            return _object_to_gpu(mem, cache)

    def class_from_gpu(obj, other):
        if isinstance(obj, Tuple):
            _tuple_from_gpu(obj, other)
        elif isinstance(obj, ByVal):
            _tuple_from_gpu(tuple(obj), tuple(other))
        else:
            S = type(tuple(obj))
            Ptr[S](obj.__raw__())[0] = S.__from_gpu_new__(tuple(_object_from_gpu(other)))

    def class_from_gpu_new(other):
        if isinstance(other, Tuple):
            return tuple(type(a).__from_gpu_new__(a) for a in other)
        elif isinstance(other, ByVal):
            T = type(other)
            return T(*tuple(type(a).__from_gpu_new__(a) for a in tuple(other)))
        else:
            S = type(tuple(other))
            mem = _object_from_gpu(other)
            Ptr[S](mem.__raw__())[0] = S.__from_gpu_new__(tuple(mem))
            return mem
# @par(gpu=True) support

@pure
@llvm
def _gpu_thread_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
    ret i32 %res

@pure
@llvm
def _gpu_block_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    ret i32 %res

@pure
@llvm
def _gpu_block_dim_x() -> u32:
    declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    %res = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    ret i32 %res
def _gpu_loop_outline_template(start, stop, args, instance: Static[int]):
    @nonpure
    def _loop_step():
        return 1

    @kernel
    def _kernel_stub(start: int, count: int, args):
        @nonpure
        def _gpu_loop_body_stub(idx, args):
            pass

        @nonpure
        def _dummy_use(n):
            pass

        _dummy_use(instance)
        idx = (int(_gpu_block_dim_x()) * int(_gpu_block_x())) + int(_gpu_thread_x())
        step = _loop_step()
        if idx < count:
            _gpu_loop_body_stub(start + (idx * step), args)

    step = _loop_step()
    loop = range(start, stop, step)
    MAX_BLOCK = 1024
    MAX_GRID = 2147483647
    G = MAX_BLOCK * MAX_GRID
    n = len(loop)

    if n == 0:
        return
    elif n > G:
        raise ValueError(f'loop exceeds GPU iteration limit of {G}')

    block = n
    grid = 1
    if n > MAX_BLOCK:
        block = MAX_BLOCK
        grid = (n // MAX_BLOCK) + (0 if n % MAX_BLOCK == 0 else 1)

    _kernel_stub(start, n, args, grid=grid, block=block)
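# The outline template above backs @par(gpu=True) loops. Usage sketch (illustrative;
# x and y stand for any GPU-transferable containers):
#
#     @par(gpu=True)
#     for i in range(16):
#         x[i] += y[i]
#
# Each iteration maps to one GPU thread: the loop body is outlined into
# _gpu_loop_body_stub and launched with up to MAX_BLOCK (1024) threads per block
# and as many blocks as needed to cover the iteration count.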