codon/test/transform/kernels.codon

import gpu
@test
def test_hello_world():
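    # one thread per element: compute c[i] = a[i] + b[i] on the GPU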
    @gpu.kernel
    def kernel(a, b, c):
        i = gpu.thread.x
        c[i] = a[i] + b[i]

    a = [i for i in range(16)]
    b = [2*i for i in range(16)]
    c = [0 for _ in range(16)]
    kernel(a, b, c, grid=1, block=16)
    assert c == [3*i for i in range(16)]

@test
def test_raw():
    @gpu.kernel
    def kernel(a, b, c):
        i = gpu.thread.x
        c[i] = a[i] + b[i]

    a = [i for i in range(16)]
    b = [2*i for i in range(16)]
    c = [0 for _ in range(16)]
    kernel(gpu.raw(a), gpu.raw(b), gpu.raw(c), grid=1, block=16)
    assert c == [3*i for i in range(16)]

@test
def test_conversions():
    @gpu.kernel
    def kernel(x, v):
        v[0] = x

    def check(x):
        T = type(x)
        v = [T()]
        kernel(x, v, grid=1, block=1)
        return v == [x]

    assert check(None)
    assert check(42)
    assert check(3.14)
    assert check(f32(2.718))
    assert check(byte(99))
    assert check(Int[128](123123))
    assert check(UInt[128](321321))
    assert check(Optional[int]())
    assert check(Optional(111))
    assert check((1, 2, 3))
    assert check(([1], [2], [3]))
    # assert check(())  # TODO: PTX can't handle this; why?
    assert check(DynamicTuple((1, 2, 3)))
    assert check(DynamicTuple(([1], [2], [3])))
    assert check(DynamicTuple[int]())
    assert check(DynamicTuple[List[List[List[str]]]]())
    assert check('hello world')
    assert check([1, 2, 3])
    assert check([[1], [2], [3]])
    assert check({1: [1.1], 2: [2.2]})
    assert check({'a', 'b', 'c'})
    assert check(Optional([1, 2, 3]))

@test
def test_user_classes():
    @dataclass(gpu=True, eq=True)
    class A:
        x: int
        y: List[int]

    @tuple
    class B:
        x: int
        y: List[int]

    @gpu.kernel
    def kernel(a, b, c):
        a.x += b.x + c[0]
        c[1][0][0] = 9999
        a.y[0] = c[0] + 1
        b.y[0] = c[0] + 2

    a = A(42, [-1])
    b = B(100, [-2])
    c = (1000, [[-1]])
    kernel(a, b, c, grid=1, block=1)
    assert a == A(1142, [1001])
    assert b == B(100, [1002])
    assert c == (1000, [[9999]])

    @gpu.kernel
    def kernel2(a, b, c):
        a[0].x += b[0].x + c[0][0]
        c[0][1][0][0] = 9999
        a[0].y[0] = c[0][0] + 1
        b[0].y[0] = c[0][0] + 2

    a = [A(42, [-1])]
    b = [B(100, [-2])]
    c = [(1000, [[-1]])]
    kernel2(a, b, c, grid=1, block=1)
    assert a == [A(1142, [1001])]
    assert b == [B(100, [1002])]
    assert c == [(1000, [[9999]])]

@test
def test_intrinsics():
    @gpu.kernel
    def kernel(v):
        block_id = (gpu.block.x + gpu.block.y*gpu.grid.dim.x +
                    gpu.block.z*gpu.grid.dim.x*gpu.grid.dim.y)
        thread_id = (block_id*gpu.block.dim.x*gpu.block.dim.y*gpu.block.dim.z +
                     gpu.thread.z*gpu.block.dim.x*gpu.block.dim.y +
                     gpu.thread.y*gpu.block.dim.x +
                     gpu.thread.x)
        v[thread_id] = thread_id
        gpu.syncthreads()

    grid = gpu.Dim3(3, 4, 5)
    block = gpu.Dim3(6, 7, 8)
    N = grid.x * grid.y * grid.z * block.x * block.y * block.z
    v = [0 for _ in range(N)]
    kernel(v, grid=grid, block=block)
    assert v == list(range(N))

@test
def test_matmul():
    A = [[12, 7, 3],
         [4, 5, 6],
         [7, 8, 9]]

    B = [[5, 8, 1, 2],
         [6, 7, 3, 0],
         [4, 5, 9, 1]]

    def mmz(A, B):
        return [[0]*len(B[0]) for _ in range(len(A))]

    def matmul(A, B):
        result = mmz(A, B)
        for i in range(len(A)):
            for j in range(len(B[0])):
                for k in range(len(B)):
                    result[i][j] += A[i][k] * B[k][j]
        return result

    expected = matmul(A, B)

    @gpu.kernel
    def kernel(A, B, result):
        i = gpu.thread.x
        j = gpu.thread.y
        result[i][j] = sum(A[i][k]*B[k][j] for k in range(len(A[0])))

    result = mmz(A, B)
    kernel(A, B, result, grid=1, block=(len(result), len(result[0])))
    assert result == expected

MAX = 1000 # maximum Mandelbrot iterations
N = 256 # width and height of image
@test
def test_mandelbrot():
    pixels = [0 for _ in range(N * N)]

    def scale(x, a, b):
        return a + (x/N)*(b - a)

    expected = [0 for _ in range(N * N)]
    for i in range(N):
        for j in range(N):
            c = complex(scale(j, -2.00, 0.47), scale(i, -1.12, 1.12))
            z = 0j
            iteration = 0

            while abs(z) <= 2 and iteration < MAX:
                z = z**2 + c
                iteration += 1

            expected[N*i + j] = int(255 * iteration/MAX)

    @gpu.kernel
    def kernel(pixels):
        idx = (gpu.block.x * gpu.block.dim.x) + gpu.thread.x
        i, j = divmod(idx, N)
        c = complex(scale(j, -2.00, 0.47), scale(i, -1.12, 1.12))
        z = 0j
        iteration = 0

        while abs(z) <= 2 and iteration < MAX:
            z = z**2 + c
            iteration += 1

        pixels[idx] = int(255 * iteration/MAX)

    kernel(pixels, grid=(N*N)//1024, block=1024)
    assert pixels == expected

@test
def test_kitchen_sink():
    @gpu.kernel
    def kernel(x):
        i = gpu.thread.x
        d = {1: 2.1, 2: 3.5, 3: 4.2}
        s = {4, 5, 6}
        z = sum(
            d.get(x[i], j) + (j if i in s else -j)
            for j in range(i)
        )
        x[i] = int(z)

    x = [i for i in range(16)]
    kernel(x, grid=1, block=16)
    assert x == [0, 2, 6, 9, 12, 20, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0]

@test
def test_auto_par():
    a = [i for i in range(16)]
    b = [2*i for i in range(16)]
    c = [0 for _ in range(16)]

    @par(gpu=True)
    for i in range(16):
        c[i] = a[i] + b[i]

    assert c == [3*i for i in range(16)]

    @par(gpu=True)
    for i in range(16):
        c[i] += a[i] + b[i]

    assert c == [6*i for i in range(16)]

    N = 200
    Z = 42
    x = [0] * (N*N)
    y = [0] * (N*N)

    for i in range(2, N - 1, 3):
        for j in range(3, N, 2):
            x[i*N + j] = i + j + Z

    @par(gpu=True, collapse=2)
    for i in range(2, N - 1, 3):
        for j in range(3, N, 2):
            y[i*N + j] = i + j + Z

    assert x == y

    @par(gpu=True)
    for i in range(1):
        pass

test_hello_world()
test_raw()
test_conversions()
test_user_classes()
test_intrinsics()
test_matmul()
test_mandelbrot()
test_kitchen_sink()
test_auto_par()