codon/stdlib/experimental/simd.codon

311 lines
12 KiB
Python
Raw Normal View History

GPU and other updates (#52) * Add nvptx pass * Fix spaces * Don't change name * Add runtime support * Add init call * Add more runtime functions * Add launch function * Add intrinsics * Fix codegen * Run GPU pass between general opt passes * Set data layout * Create context * Link libdevice * Add function remapping * Fix linkage * Fix libdevice link * Fix linking * Fix personality * Fix linking * Fix linking * Fix linking * Add internalize pass * Add more math conversions * Add more re-mappings * Fix conversions * Fix __str__ * Add decorator attribute for any decorator * Update kernel decorator * Fix kernel decorator * Fix kernel decorator * Fix kernel decorator * Fix kernel decorator * Remove old decorator * Fix pointer calc * Fix fill-in codegen * Fix linkage * Add comment * Update list conversion * Add more conversions * Add dict and set conversions * Add float32 type to IR/LLVM * Add float32 * Add float32 stdlib * Keep required global values in PTX module * Fix PTX module pruning * Fix malloc * Set will-return * Fix name cleanup * Fix access * Fix name cleanup * Fix function renaming * Update dimension API * Fix args * Clean up API * Move GPU transformations to end of opt pipeline * Fix alloc replacements * Fix naming * Target PTX 4.2 * Fix global renaming * Fix early return in static blocks; Add __realized__ function * Format * Add __llvm_name__ for functions * Add vector type to IR * SIMD support [wip] * Update kernel naming * Fix early returns; Fix SIMD calls * Fix kernel naming * Fix IR matcher * Remove module print * Update realloc * Add overloads for 32-bit float math ops * Add gpu.Pointer type for working with raw pointers * Add float32 conversion * Add to_gpu and from_gpu * clang-format * Add f32 reduction support to OpenMP * Fix automatic GPU class conversions * Fix conversion functions * Fix conversions * Rename self * Fix tuple conversion * Fix conversions * Fix conversions * Update PTX filename * Fix filename * Add raw function * Add GPU docs * Allow nested object conversions * Add tests (WIP) * Update SIMD * Add staticrange and statictuple loop support * SIMD updates * Add new Vec constructors * Fix UInt conversion * Fix size-0 allocs * Add more tests * Add matmul test * Rename gpu test file * Add more tests * Add alloc cache * Fix object_to_gpu * Fix frees * Fix str conversion * Fix set conversion * Fix conversions * Fix class conversion * Fix str conversion * Fix byte conversion * Fix list conversion * Fix pointer conversions * Fix conversions * Fix conversions * Update tests * Fix conversions * Fix tuple conversion * Fix tuple conversion * Fix auto conversions * Fix conversion * Fix magics * Update tests * Support GPU in JIT mode * Fix GPU+JIT * Fix kernel filename in JIT mode * Add __static_print__; Add earlyDefines; Various domination bugfixes; SimplifyContext RAII base handling * Fix global static handling * Fix float32 tests * FIx gpu module * Support OpenMP "collapse" option * Add more collapse tests * Capture generics and statics * TraitVar handling * Python exceptions / isinstance [wip; no_ci] * clang-format * Add list comparison operators * Support empty raise in IR * Add dict 'or' operator * Fix repr * Add copy module * Fix spacing * Use sm_30 * Python exceptions * TypeTrait support; Fix defaultDict * Fix earlyDefines * Add defaultdict * clang-format * Fix invalid canonicalizations * Fix empty raise * Fix copyright * Add Python numerics option * Support py-numerics in math module * Update docs * Add static Python division / modulus * Add static py numerics tests * Fix staticrange/tuple; Add KwTuple.__getitem__ * clang-format * Add gpu parameter to par * Fix globals * Don't init loop vars on loop collapse * Add par-gpu tests * Update gpu docs * Fix isinstance check * Remove invalid test * Add -libdevice to set custom path [skip ci] * Add release notes; bump version [skip ci] * Add libdevice docs [skip ci] Co-authored-by: Ibrahim Numanagić <ibrahimpasa@gmail.com>
2022-09-16 03:40:00 +08:00
@tuple
class Vec[T, N: Static[int]]:
ZERO_16x8i = Vec[u8,16](u8(0))
FF_16x8i = Vec[u8,16](u8(0xff))
ZERO_32x8i = Vec[u8,32](u8(0))
FF_32x8i = Vec[u8,32](u8(0xff))
@llvm
def _mm_set1_epi8(val: u8) -> Vec[u8, 16]:
%0 = insertelement <16 x i8> undef, i8 %val, i32 0
%1 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %1
@llvm
def _mm256_set1_epi8(val: u8) -> Vec[u8, 32]:
%0 = insertelement <32 x i8> undef, i8 %val, i32 0
%1 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer
ret <32 x i8> %1
@llvm
def _mm_loadu_si128(data) -> Vec[u8, 16]:
%0 = bitcast i8* %data to <16 x i8>*
%1 = load <16 x i8>, <16 x i8>* %0, align 1
ret <16 x i8> %1
@llvm
def _mm256_loadu_si256(data) -> Vec[u8, 32]:
%0 = bitcast i8* %data to <32 x i8>*
%1 = load <32 x i8>, <32 x i8>* %0, align 1
ret <32 x i8> %1
@llvm
def _mm256_set1_ps(val: f32) -> Vec[f32, 8]:
%0 = insertelement <8 x float> undef, float %val, i32 0
%1 = shufflevector <8 x float> %0, <8 x float> undef, <8 x i32> zeroinitializer
ret <8 x float> %1
@llvm
def _mm512_set1_ps(val: f32) -> Vec[f32, 16]:
%0 = insertelement <16 x float> undef, float %val, i32 0
%1 = shufflevector <16 x float> %0, <16 x float> undef, <16 x i32> zeroinitializer
ret <16 x float> %1
@llvm
def _mm256_loadu_ps(data: Ptr[f32]) -> Vec[f32, 8]:
%0 = bitcast float* %data to <8 x float>*
%1 = load <8 x float>, <8 x float>* %0
ret <8 x float> %1
@llvm
def _mm512_loadu_ps(data: Ptr[f32]) -> Vec[f32, 16]:
%0 = bitcast float* %data to <16 x float>*
%1 = load <16 x float>, <16 x float>* %0
ret <16 x float> %1
@llvm
def _mm256_cvtepi8_epi32(vec: Vec[u8, 16]) -> Vec[u32, 8]:
%0 = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%1 = sext <8 x i8> %0 to <8 x i32>
ret <8 x i32> %1
@llvm
def _mm512_cvtepi8_epi64(vec: Vec[u8, 32]) -> Vec[u32, 16]:
%0 = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%1 = sext <16 x i8> %0 to <16 x i32>
ret <16 x i32> %1
@llvm
def _mm256_castsi256_ps(vec: Vec[u32, 8]) -> Vec[f32, 8]:
%0 = bitcast <8 x i32> %vec to <8 x float>
ret <8 x float> %0
@llvm
def _mm512_castsi512_ps(vec: Vec[u32, 16]) -> Vec[f32, 16]:
%0 = bitcast <16 x i32> %vec to <16 x float>
ret <16 x float> %0
def __new__(x, T: type, N: Static[int]) -> Vec[T, N]:
if isinstance(T, u8) and N == 16:
if isinstance(x, u8) or isinstance(x, byte): # TODO: u8<->byte
return Vec._mm_set1_epi8(x)
if isinstance(x, Ptr[u8]) or isinstance(x, Ptr[byte]):
return Vec._mm_loadu_si128(x)
if isinstance(x, str):
return Vec._mm_loadu_si128(x.ptr)
if isinstance(T, u8) and N == 32:
if isinstance(x, u8) or isinstance(x, byte): # TODO: u8<->byte
return Vec._mm256_set1_epi8(x)
if isinstance(x, Ptr[u8]) or isinstance(x, Ptr[byte]):
return Vec._mm256_loadu_si256(x)
if isinstance(x, str):
return Vec._mm256_loadu_si256(x.ptr)
if isinstance(T, f32) and N == 8:
if isinstance(x, f32):
return Vec._mm256_set1_ps(x)
if isinstance(x, Ptr[f32]): # TODO: multi-elif does NOT work with statics [why?!]
return Vec._mm256_loadu_ps(x)
if isinstance(x, List[f32]):
return Vec._mm256_loadu_ps(x.arr.ptr)
if isinstance(x, Vec[u8, 16]):
return Vec._mm256_castsi256_ps(Vec._mm256_cvtepi8_epi32(x))
if isinstance(T, f32) and N == 16:
if isinstance(x, f32):
return Vec._mm512_set1_ps(x)
if isinstance(x, Ptr[f32]): # TODO: multi-elif does NOT work with statics [why?!]
return Vec._mm512_loadu_ps(x)
if isinstance(x, List[f32]):
return Vec._mm512_loadu_ps(x.arr.ptr)
if isinstance(x, Vec[u8, 32]):
return Vec._mm512_castsi512_ps(Vec._mm512_cvtepi8_epi64(x))
compile_error("invalid SIMD vector constructor")
def __new__(x: str, offset: int = 0) -> Vec[u8, N]:
return Vec(x.ptr + offset, u8, N)
def __new__(x: List[T], offset: int = 0) -> Vec[T, N]:
return Vec(x.arr.ptr + offset, T, N)
def __new__(x) -> Vec[T, N]:
return Vec(x, T, N)
@llvm
def _mm_cmpeq_epi8(x: Vec[u8, 16], y: Vec[u8, 16]) -> Vec[u8, 16]:
%0 = icmp eq <16 x i8> %x, %y
%1 = sext <16 x i1> %0 to <16 x i8>
ret <16 x i8> %1
def __eq__(self: Vec[u8, 16], other: Vec[u8, 16]) -> Vec[u8, 16]:
return Vec._mm_cmpeq_epi8(self, other)
@llvm
def _mm256_cmpeq_epi8(x: Vec[u8, 32], y: Vec[u8, 32]) -> Vec[u8, 32]:
%0 = icmp eq <32 x i8> %x, %y
%1 = sext <32 x i1> %0 to <32 x i8>
ret <32 x i8> %1
def __eq__(self: Vec[u8, 32], other: Vec[u8, 32]) -> Vec[u8, 32]:
return Vec._mm256_cmpeq_epi8(self, other)
@llvm
def _mm_andnot_si128(x: Vec[u8, 16], y: Vec[u8, 16]) -> Vec[u8, 16]:
%0 = xor <16 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%1 = and <16 x i8> %y, %0
ret <16 x i8> %1
def __ne__(self: Vec[u8, 16], other: Vec[u8, 16]) -> Vec[u8, 16]:
return Vec._mm_andnot_si128((self == other), Vec.FF_16x8i)
@llvm
def _mm256_andnot_si256(x: Vec[u8, 32], y: Vec[u8, 32]) -> Vec[u8, 32]:
%0 = xor <32 x i8> %x, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%1 = and <32 x i8> %y, %0
ret <32 x i8> %1
def __ne__(self: Vec[u8, 32], other: Vec[u8, 32]) -> Vec[u8, 32]:
return Vec._mm256_andnot_si256((self == other), Vec.FF_32x8i)
def __eq__(self: Vec[u8, 16], other: bool) -> Vec[u8, 16]:
if not other:
return Vec._mm_andnot_si128(self, Vec.FF_16x8i)
else:
return Vec._mm_andnot_si128(self, Vec.ZERO_16x8i)
def __eq__(self: Vec[u8, 32], other: bool) -> Vec[u8, 32]:
if not other:
return Vec._mm256_andnot_si256(self, Vec.FF_32x8i)
else:
return Vec._mm256_andnot_si256(self, Vec.ZERO_32x8i)
@llvm
def _mm_and_si128(x: Vec[u8, 16], y: Vec[u8, 16]) -> Vec[u8, 16]:
%0 = and <16 x i8> %x, %y
ret <16 x i8> %0
def __and__(self: Vec[u8, 16], other: Vec[u8, 16]) -> Vec[u8, 16]:
return Vec._mm_and_si128(self, other)
@llvm
def _mm_and_si256(x: Vec[u8, 32], y: Vec[u8, 32]) -> Vec[u8, 32]:
%0 = and <32 x i8> %x, %y
ret <32 x i8> %0
def __and__(self: Vec[u8, 32], other: Vec[u8, 32]) -> Vec[u8, 32]:
return Vec._mm_and_si256(self, other)
@llvm
def _mm256_and_ps(x: Vec[f32, 8], y: Vec[f32, 8]) -> Vec[f32, 8]:
%0 = bitcast <8 x float> %x to <8 x i32>
%1 = bitcast <8 x float> %y to <8 x i32>
%2 = and <8 x i32> %0, %1
%3 = bitcast <8 x i32> %2 to <8 x float>
ret <8 x float> %3
def __and__(self: Vec[f32, 8], other: Vec[f32, 8]) -> Vec[f32, 8]:
return Vec._mm256_and_ps(self, other)
@llvm
def _mm512_and_ps(x: Vec[f32, 16], y: Vec[f32, 16]) -> Vec[f32, 16]:
%0 = bitcast <16 x float> %x to <16 x i32>
%1 = bitcast <16 x float> %y to <16 x i32>
%2 = and <16 x i32> %0, %1
%3 = bitcast <16 x i32> %2 to <16 x float>
ret <16 x float> %3
def __and__(self: Vec[f32, 16], other: Vec[f32, 16]) -> Vec[f32, 16]:
return Vec._mm512_and_ps(self, other)
@llvm
def _mm_or_si128(x: Vec[u8, 16], y: Vec[u8, 16]) -> Vec[u8, 16]:
%0 = or <16 x i8> %x, %y
ret <16 x i8> %0
def __or__(self: Vec[u8, 16], other: Vec[u8, 16]) -> Vec[u8, 16]:
return Vec._mm_or_si128(self, other)
@llvm
def _mm_or_si256(x: Vec[u8, 32], y: Vec[u8, 32]) -> Vec[u8, 32]:
%0 = or <32 x i8> %x, %y
ret <32 x i8> %0
def __or__(self: Vec[u8, 32], other: Vec[u8, 32]) -> Vec[u8, 32]:
return Vec._mm_or_si256(self, other)
@llvm
def _mm256_or_ps(x: Vec[f32, 8], y: Vec[f32, 8]) -> Vec[f32, 8]:
%0 = bitcast <8 x float> %x to <8 x i32>
%1 = bitcast <8 x float> %y to <8 x i32>
%2 = or <8 x i32> %0, %1
%3 = bitcast <8 x i32> %2 to <8 x float>
ret <8 x float> %3
def __or__(self: Vec[f32, 8], other: Vec[f32, 8]) -> Vec[f32, 8]:
return Vec._mm256_or_ps(self, other)
@llvm
def _mm512_or_ps(x: Vec[f32, 16], y: Vec[f32, 16]) -> Vec[f32, 16]:
%0 = bitcast <16 x float> %x to <16 x i32>
%1 = bitcast <16 x float> %y to <16 x i32>
%2 = or <16 x i32> %0, %1
%3 = bitcast <16 x i32> %2 to <16 x float>
ret <16 x float> %3
def __or__(self: Vec[f32, 16], other: Vec[f32, 16]) -> Vec[f32, 16]:
return Vec._mm512_or_ps(self, other)
@llvm
def _mm_bsrli_si128_8(vec: Vec[u8, 16]) -> Vec[u8, 16]:
%0 = shufflevector <16 x i8> %vec, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
ret <16 x i8> %0
@llvm
def _mm256_add_ps(x: Vec[f32, 8], y: Vec[f32, 8]) -> Vec[f32, 8]:
%0 = fadd <8 x float> %x, %y
ret <8 x float> %0
def __add__(self: Vec[f32, 8], other: Vec[f32, 8]) -> Vec[f32, 8]:
return Vec._mm256_add_ps(self, other)
def __rshift__(self: Vec[u8, 16], shift: Static[int]) -> Vec[u8, 16]:
if shift == 0:
return self
elif shift == 8:
return Vec._mm_bsrli_si128_8(self)
else:
compile_error("invalid bitshift")
@llvm
def _mm_bsrli_256(vec: Vec[u8, 32]) -> Vec[u8, 32]:
%0 = shufflevector <32 x i8> %vec, <32 x i8> zeroinitializer, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
ret <32 x i8> %0
def __rshift__(self: Vec[u8, 32], shift: Static[int]) -> Vec[u8, 32]:
if shift == 0:
return self
elif shift == 16:
return Vec._mm_bsrli_256(self)
else:
compile_error("invalid bitshift")
# @llvm # https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
# def sum(self: Vec[f32, 8]) -> f32:
# %0 = shufflevector <8 x float> %self, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
# %1 = shufflevector <8 x float> %self, <8 x float> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
# %2 = fadd <4 x float> %0, %1
# %3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
# %4 = fadd <4 x float> %2, %3
# %5 = shufflevector <4 x float> %4, <4 x float> poison, <4 x i32> <i32 2, i32 undef, i32 undef, i32 undef>
# %6 = fadd <4 x float> %4, %5
# %7 = extractelement <4 x float> %6, i32 0
# ret float %7
def sum(self: Vec[f32, 8], x: f32 = f32(0.0)) -> f32:
return x + self[0] + self[1] + self[2] + self[3] + self[4] + self[5] + self[6] + self[7]
@llvm
def __getitem__(self, n: Static[int]) -> T:
%0 = extractelement <{=N} x {=T}> %self, i32 {=n}
ret {=T} %0
def __repr__(self):
if N == 8:
return f"<{self[0]}, {self[1]}, {self[2]}, {self[3]}, {self[4]}, {self[5]}, {self[6]}, {self[7]}>"
elif N == 16:
return f"<{self[0]}, {self[1]}, {self[2]}, {self[3]}, {self[4]}, {self[5]}, {self[6]}, {self[7]}, {self[8]}, {self[9]}, {self[10]}, {self[11]}, {self[12]}, {self[13]}, {self[14]}, {self[15]}>"
else:
return "?"
u8x16 = Vec[u8, 16]
u8x32 = Vec[u8, 32]
f32x8 = Vec[f32, 8]