# Copyright (C) 2022-2023 Exaloop Inc. <https://exaloop.io>
# OpenMP interface
# Ref: https://github.com/llvm/llvm-project/tree/main/openmp

Routine = Function[[i32, cobj], i32]
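# Flags for the "flags" field of Ident below; they mirror the KMP_IDENT_*
# constants defined by the LLVM OpenMP runtime (kmp.h).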
_KMP_IDENT_IMB = 0x01
_KMP_IDENT_KMPC = 0x02
_KMP_IDENT_AUTOPAR = 0x08
_KMP_IDENT_ATOMIC_REDUCE = 0x10
_KMP_IDENT_BARRIER_EXPL = 0x20
_KMP_IDENT_BARRIER_IMPL = 0x0040
_KMP_IDENT_BARRIER_IMPL_MASK = 0x01C0
_KMP_IDENT_BARRIER_IMPL_FOR = 0x0040
_KMP_IDENT_BARRIER_IMPL_SECTIONS = 0x00C0
_KMP_IDENT_BARRIER_IMPL_SINGLE = 0x0140
_KMP_IDENT_BARRIER_IMPL_WORKSHARE = 0x01C0
_KMP_IDENT_WORK_LOOP = 0x200
_KMP_IDENT_WORK_SECTIONS = 0x400
_KMP_IDENT_WORK_DISTRIBUTE = 0x800
_KMP_IDENT_ATOMIC_HINT_MASK = 0xFF0000
_KMP_IDENT_ATOMIC_HINT_UNCONTENDED = 0x010000
_KMP_IDENT_ATOMIC_HINT_CONTENDED = 0x020000
_KMP_IDENT_ATOMIC_HINT_NONSPECULATIVE = 0x040000
_KMP_IDENT_ATOMIC_HINT_SPECULATIVE = 0x080000
_KMP_IDENT_OPENMP_SPEC_VERSION_MASK = 0xFF000000
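# Lock is a 32-byte opaque lock object (the runtime's kmp_critical_name is an
# array of 8 x i32); it is passed by pointer to __kmpc_critical /
# __kmpc_end_critical and to the reduce entry points further below.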
@tuple
class Lock:
    a1: i32
    a2: i32
    a3: i32
    a4: i32
    a5: i32
    a6: i32
    a7: i32
    a8: i32

    def __new__() -> Lock:
        z = i32(0)
        return Lock(z, z, z, z, z, z, z, z)
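# Ident mirrors the runtime's ident_t source-location descriptor that every
# __kmpc_* entry point takes as its first argument; "psource" points to a
# ";file;func;line;col;;"-style string. Only the KMPC flag is required here.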
@tuple
class Ident:
    reserved_1: i32
    flags: i32
    reserved_2: i32
    reserved_3: i32
    psource: cobj

    def __new__(flags: int = 0, source: str = ";unknown;unknown;0;0;;") -> Ident:
        return Ident(i32(0), i32(flags | _KMP_IDENT_KMPC), i32(0), i32(0), source.ptr)
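# Task descriptors. Task mirrors the fixed header of the runtime's kmp_task_t
# (shareds pointer, entry routine, plus reserved fields, named flags/x/y here as
# this module's own shorthand); TaskWithPrivates[T] lays the task's private data
# T out directly after that header.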
@tuple
class LRData:
    routine: Routine

@tuple
class Task:
    shareds: cobj
    routine: Routine
    flags: i32
    x: LRData
    y: LRData

@tuple
class TaskWithPrivates:
    task: Task
    data: T
    T: type

@tuple
class TaskReductionInput:
    reduce_shar: cobj
    reduce_orig: cobj
    reduce_size: int
    reduce_init: cobj
    reduce_fini: cobj
    reduce_comb: cobj
    flags: u32

    def __new__(reduce_shar, reduce_orig, reduce_size: int,
                reduce_init: cobj, reduce_comb: cobj):
        return TaskReductionInput(reduce_shar.as_byte(), reduce_orig.as_byte(), reduce_size,
                                  reduce_init, cobj(), reduce_comb, u32(0))

@tuple
class TaskReductionInputArray:
    len: int
    ptr: Ptr[TaskReductionInput]

    def __setitem__(self, idx: int, x: TaskReductionInput):
        self.ptr[idx] = x
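# Preconstructed ident_t globals for the common call sites; the helpers below
# return pointers to them and are each invoked once at module load, presumably
# to ensure the globals are instantiated before any parallel code runs.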
_DEFAULT_IDENT = Ident()
_STATIC_LOOP_IDENT = Ident(_KMP_IDENT_WORK_LOOP)
_REDUCTION_IDENT = Ident(_KMP_IDENT_ATOMIC_REDUCE)

def _default_loc():
    return __ptr__(_DEFAULT_IDENT)

_default_loc()

def _static_loop_loc():
    return __ptr__(_STATIC_LOOP_IDENT)

_static_loop_loc()

def _reduction_loc():
    return __ptr__(_REDUCTION_IDENT)

_reduction_loc()
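# Thin wrappers over the __kmpc_* runtime entry points for critical sections,
# single/master regions, ordered regions and task synchronization. Each takes
# the ident_t location pointer and the caller's global thread ID (gtid).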
def _critical_begin(loc_ref: Ptr[Ident], gtid: int, lck: Ptr[Lock]):
    from C import __kmpc_critical(Ptr[Ident], i32, Ptr[Lock])
    __kmpc_critical(loc_ref, i32(gtid), lck)

def _critical_end(loc_ref: Ptr[Ident], gtid: int, lck: Ptr[Lock]):
    from C import __kmpc_end_critical(Ptr[Ident], i32, Ptr[Lock])
    __kmpc_end_critical(loc_ref, i32(gtid), lck)

def _single_begin(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_single(Ptr[Ident], i32) -> i32
    return int(__kmpc_single(loc_ref, i32(gtid)))

def _single_end(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_end_single(Ptr[Ident], i32)
    __kmpc_end_single(loc_ref, i32(gtid))

def _master_begin(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_master(Ptr[Ident], i32) -> i32
    return int(__kmpc_master(loc_ref, i32(gtid)))

def _master_end(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_end_master(Ptr[Ident], i32)
    __kmpc_end_master(loc_ref, i32(gtid))

def _ordered_begin(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_ordered(Ptr[Ident], i32) -> i32
    return int(__kmpc_ordered(loc_ref, i32(gtid)))

def _ordered_end(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_end_ordered(Ptr[Ident], i32)
    __kmpc_end_ordered(loc_ref, i32(gtid))

def _taskwait(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_omp_taskwait(Ptr[Ident], i32)
    __kmpc_omp_taskwait(loc_ref, i32(gtid))

def _taskgroup_begin(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_taskgroup(Ptr[Ident], i32)
    __kmpc_taskgroup(loc_ref, i32(gtid))

def _taskgroup_end(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_end_taskgroup(Ptr[Ident], i32)
    __kmpc_end_taskgroup(loc_ref, i32(gtid))
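# Task allocation and submission. Note: __kmpc_omp_task_alloc_size and the extra
# trailing "taskdata" argument to __kmpc_omp_task_alloc are not part of the stock
# LLVM OpenMP interface; they rely on Codon's patched runtime, which lets the
# task buffer be allocated here with the GC allocator.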
def _task_alloc_size(
    size_of_task: int,
    size_of_shareds: int,
):
    from C import __kmpc_omp_task_alloc_size(int, int) -> int
    return __kmpc_omp_task_alloc_size(size_of_task, size_of_shareds)

def _task_alloc(
    loc_ref: Ptr[Ident],
    gtid: int,
    flags: int,
    size_of_task: int,
    size_of_shareds: int,
    task_entry: Routine,
):
    from internal.gc import alloc
    from C import __kmpc_omp_task_alloc(Ptr[Ident], i32, i32, int, int, Routine, cobj) -> cobj

    taskdata = alloc(_task_alloc_size(size_of_task, size_of_shareds))
    return __kmpc_omp_task_alloc(
        loc_ref, i32(gtid), i32(flags), size_of_task, size_of_shareds, task_entry, taskdata
    )

def _task_run(loc_ref: Ptr[Ident], gtid: int, new_task: cobj):
    from C import __kmpc_omp_task(Ptr[Ident], i32, cobj) -> i32
    return int(__kmpc_omp_task(loc_ref, i32(gtid), new_task))

def _barrier(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_barrier(Ptr[Ident], i32)
    __kmpc_barrier(loc_ref, i32(gtid))

def _flush(loc_ref: Ptr[Ident]):
    from C import __kmpc_flush(Ptr[Ident])
    __kmpc_flush(loc_ref)

def flush():
    _flush(_default_loc())
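# Statically scheduled worksharing loops. _static_init wraps
# __kmpc_for_static_init_8 and returns (is_last, sub-range assigned to this
# thread, stride); bounds are expressed in iteration space [0, len(loop)) and
# mapped back to loop values with range._get.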
def _static_init(
    loc_ref: Ptr[Ident], gtid: int, schedtype: int, loop: range, incr: int, chunk: int
):
    from C import __kmpc_for_static_init_8(Ptr[Ident], i32, i32, Ptr[i32], Ptr[int], Ptr[int], Ptr[int], int, int)
    last = i32(0)
    lower = 0
    upper = len(loop) - 1
    stride = 1
    __kmpc_for_static_init_8(
        loc_ref,
        i32(gtid),
        i32(schedtype),
        __ptr__(last),
        __ptr__(lower),
        __ptr__(upper),
        __ptr__(stride),
        incr,
        chunk,
    )
    return bool(last), range(loop._get(lower), loop._get(upper + 1), loop.step), stride

def _static_fini(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_for_static_fini(Ptr[Ident], i32)
    __kmpc_for_static_fini(loc_ref, i32(gtid))
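# Dynamically scheduled loops follow the dispatch protocol: __kmpc_dispatch_init_8
# once, then __kmpc_dispatch_next_8 repeatedly until it returns 0; each call hands
# back the next chunk as inclusive bounds, again mapped through range._get.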
def _dynamic_init(
    loc_ref: Ptr[Ident], gtid: int, schedtype: int, loop: range, chunk: int
):
    from C import __kmpc_dispatch_init_8(Ptr[Ident], i32, i32, int, int, int, int)
    lower = 0
    upper = len(loop) - 1
    stride = 1
    __kmpc_dispatch_init_8(
        loc_ref, i32(gtid), i32(schedtype), lower, upper, stride, chunk
    )

def _dynamic_next(loc_ref: Ptr[Ident], gtid: int, loop: range):
    from C import __kmpc_dispatch_next_8(Ptr[Ident], i32, Ptr[i32], Ptr[int], Ptr[int], Ptr[int]) -> i32
    last = i32(0)
    lower = 0
    upper = 0
    stride = 0
    more = __kmpc_dispatch_next_8(
        loc_ref,
        i32(gtid),
        __ptr__(last),
        __ptr__(lower),
        __ptr__(upper),
        __ptr__(stride),
    )
    return (
        bool(more),
        bool(last),
        range(loop._get(lower), loop._get(upper + 1), loop.step),
    )

def _dynamic_fini(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_dispatch_fini_8(Ptr[Ident], i32)
    __kmpc_dispatch_fini_8(loc_ref, i32(gtid))
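# Parallel reductions. __kmpc_reduce(_nowait) returns 1 when this thread should
# combine the partial results directly (and must finish with _end_reduce*), 2
# when the atomic fallback path should be used, and 0 when there is nothing to do.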
def _reduce(
    loc_ref: Ptr[Ident],
    gtid: int,
    reduce_data: T,
    reduce_func: cobj,
    lck: cobj,
    T: type,
):
    from internal.gc import sizeof

    from C import __kmpc_reduce(Ptr[Ident], i32, i32, int, cobj, cobj, cobj) -> i32
    num_vars = staticlen(reduce_data)
    reduce_size = sizeof(T)
    return int(
        __kmpc_reduce(
            loc_ref,
            i32(gtid),
            i32(num_vars),
            reduce_size,
            __ptr__(reduce_data).as_byte(),
            reduce_func,
            lck,
        )
    )

def _end_reduce(loc_ref: Ptr[Ident], gtid: int, lck: cobj):
    from C import __kmpc_end_reduce(Ptr[Ident], i32, cobj)
    __kmpc_end_reduce(loc_ref, i32(gtid), lck)

def _reduce_nowait(
    loc_ref: Ptr[Ident],
    gtid: int,
    reduce_data: T,
    reduce_func: cobj,
    lck: Ptr[Lock],
    T: type,
):
    from internal.gc import sizeof

    from C import __kmpc_reduce_nowait(Ptr[Ident], i32, i32, int, cobj, cobj, Ptr[Lock]) -> i32
    num_vars = staticlen(reduce_data)
    reduce_size = sizeof(T)
    return int(
        __kmpc_reduce_nowait(
            loc_ref,
            i32(gtid),
            i32(num_vars),
            reduce_size,
            __ptr__(reduce_data).as_byte(),
            reduce_func,
            lck,
        )
    )

def _end_reduce_nowait(loc_ref: Ptr[Ident], gtid: int, lck: Ptr[Lock]):
    from C import __kmpc_end_reduce_nowait(Ptr[Ident], i32, Ptr[Lock])
    __kmpc_end_reduce_nowait(loc_ref, i32(gtid), lck)

def _taskred_init(loc_ref: Ptr[Ident], gtid: int, num: int, data):
    from C import __kmpc_taskred_modifier_init(Ptr[Ident], i32, i32, i32, cobj) -> cobj
    return __kmpc_taskred_modifier_init(loc_ref, i32(gtid), i32(0), i32(num), data.as_byte())

def _taskred_fini(loc_ref: Ptr[Ident], gtid: int):
    from C import __kmpc_task_reduction_modifier_fini(Ptr[Ident], i32, i32)
    __kmpc_task_reduction_modifier_fini(loc_ref, i32(gtid), i32(0))

# add tskgrp arg?
def _taskred_data(gtid: int, data):
    from C import __kmpc_task_reduction_get_th_data(i32, cobj, cobj) -> cobj
    T = type(data)
    return T(__kmpc_task_reduction_get_th_data(i32(gtid), cobj(), data.as_byte()))
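# _fork_call hands the outlined parallel-region body (microtask) to the runtime,
# which invokes it on every thread of the team; the packed argument tuple is
# forwarded as a single pointer.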
def _fork_call(microtask: cobj, args):
    from C import __kmpc_fork_call(Ptr[Ident], i32, cobj, ...)
    loc_ref = _default_loc()  # TODO: pass real loc?
    __kmpc_fork_call(loc_ref, i32(1), microtask, __ptr__(args))
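# Loop outline templates. The @nonpure stubs below (_loop_step, _loop_body_stub,
# _loop_schedule, _loop_shared_updates, _loop_reductions, ...) are placeholders
# that Codon's OpenMP lowering pass fills in with the actual loop body, step,
# schedule and reduction/lastprivate logic when it clones a template for a
# specific parallel loop.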
def _static_loop_outline_template(gtid_ptr: Ptr[i32], btid_ptr: Ptr[i32], args):
    @nonpure
    def _loop_step():
        return 1

    @nonpure
    def _loop_loc_and_gtid(
        loc_ref: Ptr[Ident], reduction_loc_ref: Ptr[Ident], gtid: int
    ):
        pass

    @nonpure
    def _loop_body_stub(i, args):
        pass

    @nonpure
    def _loop_schedule():
        return (1 << 30) | 35  # nonmonotonic, dynamic chunked

    @nonpure
    def _loop_shared_updates(args):
        pass

    @nonpure
    def _loop_reductions(args):
        pass

    chunk, start, stop, extra = args[0]
    step = _loop_step()
    gtid = int(gtid_ptr[0])
    loc_ref = _default_loc()
    static_loop_loc_ref = _static_loop_loc()
    reduction_loc_ref = _reduction_loc()
    _loop_loc_and_gtid(loc_ref, reduction_loc_ref, gtid)
    loop = range(start, stop, step)
    schedule = _loop_schedule()

    last, subloop, stride = _static_init(
        static_loop_loc_ref, gtid, schedtype=schedule, loop=loop, incr=1, chunk=1
    )
    i = subloop.start
    stop = min(subloop.stop, loop.stop) if step >= 0 else max(subloop.stop, loop.stop)

    while (step >= 0 and i < stop) or (step < 0 and i > stop):
        _loop_body_stub(i, extra)
        i += step
    _static_fini(static_loop_loc_ref, gtid)

    if last:
        _loop_shared_updates(extra)

    _loop_reductions(extra)
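# Same idea, but for a user-specified chunk size: each thread iterates over its
# current chunk, then strides ahead by stride * step to its next chunk until the
# loop bound is reached.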
def _static_chunked_loop_outline_template(gtid_ptr: Ptr[i32], btid_ptr: Ptr[i32], args):
    @nonpure
    def _loop_step():
        return 1

    @nonpure
    def _loop_loc_and_gtid(
        loc_ref: Ptr[Ident], reduction_loc_ref: Ptr[Ident], gtid: int
    ):
        pass

    @nonpure
    def _loop_body_stub(i, args):
        pass

    @nonpure
    def _loop_schedule():
        return (1 << 30) | 35  # nonmonotonic, dynamic chunked

    @nonpure
    def _loop_shared_updates(args):
        pass

    @nonpure
    def _loop_reductions(args):
        pass

    chunk, start, stop, extra = args[0]
    step = _loop_step()
    gtid = int(gtid_ptr[0])
    loc_ref = _default_loc()
    static_loop_loc_ref = _static_loop_loc()
    reduction_loc_ref = _reduction_loc()
    _loop_loc_and_gtid(loc_ref, reduction_loc_ref, gtid)
    loop = range(start, stop, step)
    schedule = _loop_schedule()

    last, subloop, stride = _static_init(
        static_loop_loc_ref, gtid, schedtype=schedule, loop=loop, incr=1, chunk=chunk
    )
    start = subloop.start
    stop = min(subloop.stop, loop.stop) if step >= 0 else max(subloop.stop, loop.stop)

    while (step >= 0 and start < loop.stop) or (step < 0 and start > loop.stop):
        i = start
        while (step >= 0 and i < stop) or (step < 0 and i > stop):
            _loop_body_stub(i, extra)
            i += step

        start += stride * step
        stop += stride * step
        stop = min(stop, loop.stop) if step >= 0 else max(stop, loop.stop)
    _static_fini(static_loop_loc_ref, gtid)

    if last:
        _loop_shared_updates(extra)

    _loop_reductions(extra)
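# Dynamically scheduled variant: chunks are requested from the runtime one at a
# time via the dispatch protocol; __kmpc_dispatch_fini_8 is only needed when the
# loop carries an ordered clause.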
def _dynamic_loop_outline_template(gtid_ptr: Ptr[i32], btid_ptr: Ptr[i32], args):
    @nonpure
    def _loop_step():
        return 1

    @nonpure
    def _loop_loc_and_gtid(
        loc_ref: Ptr[Ident], reduction_loc_ref: Ptr[Ident], gtid: int
    ):
        pass

    @nonpure
    def _loop_body_stub(i, args):
        pass

    @nonpure
    def _loop_schedule():
        return (1 << 30) | 35  # nonmonotonic, dynamic chunked

    @nonpure
    def _loop_shared_updates(args):
        pass

    @nonpure
    def _loop_reductions(args):
        pass

    @nonpure
    def _loop_ordered():
        return False

    chunk, start, stop, extra = args[0]
    step = _loop_step()
    gtid = int(gtid_ptr[0])
    loc_ref = _default_loc()
    reduction_loc_ref = _reduction_loc()
    _loop_loc_and_gtid(loc_ref, reduction_loc_ref, gtid)
    loop = range(start, stop, step)
    schedule = _loop_schedule()
    ordered = _loop_ordered()

    _dynamic_init(loc_ref, gtid, schedtype=schedule, loop=loop, chunk=chunk)
    while True:
        more, last, subloop = _dynamic_next(loc_ref, gtid, loop)
        if not more:
            break
        i = subloop.start
        while (step >= 0 and i < subloop.stop) or (step < 0 and i > subloop.stop):
            _loop_body_stub(i, extra)
            i += step
        if ordered:
            _dynamic_fini(loc_ref, gtid)
        if last:
            _loop_shared_updates(extra)

    _loop_reductions(extra)
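# Task spawning: allocate a kmp task, copy the shared-variable pointers into the
# runtime-managed shareds block and the private values immediately after the
# Task header, then submit it with __kmpc_omp_task.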
# P = privates; tuple of types
# S = shareds; tuple of pointers
def _spawn_and_run_task(
    loc_ref: Ptr[Ident], gtid: int, routine: cobj, priv: P, shared: S, P: type, S: type
):
    from internal.gc import sizeof

    TaskThunk = TaskWithPrivates[P]
    flags = 1
    size_of_kmp_task_t = sizeof(TaskThunk)
    size_of_privs = sizeof(P)
    size_of_shareds = sizeof(S)
    loc_ref = _default_loc()

    task = Ptr[TaskThunk](
        _task_alloc(
            loc_ref, gtid, flags, size_of_kmp_task_t, size_of_shareds, Routine(routine)
        )
    )
    if staticlen(shared) != 0:
        shared_ptr = task[0].task.shareds
        str.memcpy(shared_ptr, __ptr__(shared).as_byte(), size_of_shareds)
    if staticlen(priv) != 0:
        priv_ptr = task.as_byte() + sizeof(Task)
        str.memcpy(priv_ptr, __ptr__(priv).as_byte(), size_of_privs)

    _task_run(loc_ref, gtid, task.as_byte())
# Note: this is different than OpenMP's "taskloop" -- this template simply
# spawns a new task for each loop iteration.
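# The thread that wins the __kmpc_single region walks the iterable and spawns one
# task per element inside a taskgroup; every thread then joins the barrier at the
# end, after task reductions have been finalized.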
def _task_loop_outline_template(gtid_ptr: Ptr[i32], btid_ptr: Ptr[i32], args):
    def _routine_stub(gtid: i32, data: cobj, P: type, S: type):
        @nonpure
        def _task_loop_body_stub(gtid: int, priv, shared):
            pass

        task = Ptr[TaskWithPrivates[P]](data)[0]
        priv = task.data
        gtid64 = int(gtid)
        if staticlen(S()) != 0:
            shared = Ptr[S](task.task.shareds)[0]
            _task_loop_body_stub(gtid64, priv, shared)
        else:
            shared = ()
            _task_loop_body_stub(gtid64, priv, shared)
        return i32(0)

    @nonpure
    def _loop_loc_and_gtid(
        loc_ref: Ptr[Ident], reduction_loc_ref: Ptr[Ident], gtid: int
    ):
        pass

    @nonpure
    def _fix_privates_and_shareds(i, priv, shared):
        return priv, shared

    @nonpure
    def _taskred_setup(args):
        pass

    @nonpure
    def _taskred_finish():
        pass

    @nonpure
    def _loop_reductions(args):
        pass

    iterable, priv, shared = args[0]
    P = type(priv)
    S = type(shared)

    gtid = int(gtid_ptr[0])
    loc_ref = _default_loc()
    reduction_loc_ref = _reduction_loc()
    _loop_loc_and_gtid(loc_ref, reduction_loc_ref, gtid)

    _taskred_setup(shared)

    if _single_begin(loc_ref, gtid) != 0:
        _taskgroup_begin(loc_ref, gtid)
        try:
            for i in iterable:
                priv_fixed, shared_fixed = _fix_privates_and_shareds(i, priv, shared)
                _spawn_and_run_task(
                    loc_ref, gtid, _routine_stub(P=P, S=S, ...).__raw__(), priv_fixed, shared_fixed
                )
        finally:
            _taskgroup_end(loc_ref, gtid)
            _single_end(loc_ref, gtid)

    _taskred_finish()
    _loop_reductions(shared)
    _barrier(loc_ref, gtid)
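# Thin wrappers over the standard omp_* runtime API (thread counts, schedules,
# nesting levels, device info, timing). Read-only queries are marked @pure.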
@pure
def get_num_threads():
    from C import omp_get_num_threads() -> i32
    return int(omp_get_num_threads())

@pure
def get_thread_num():
    from C import omp_get_thread_num() -> i32
    return int(omp_get_thread_num())

@pure
def get_max_threads():
    from C import omp_get_max_threads() -> i32
    return int(omp_get_max_threads())

@pure
def get_num_procs():
    from C import omp_get_num_procs() -> i32
    return int(omp_get_num_procs())

def set_num_threads(num_threads: int):
    from C import omp_set_num_threads(i32)
    omp_set_num_threads(i32(num_threads))

@pure
def in_parallel():
    from C import omp_in_parallel() -> i32
    return bool(omp_in_parallel())

def set_dynamic(dynamic_threads: bool = True):
    from C import omp_set_dynamic(i32)
    omp_set_dynamic(i32(1 if dynamic_threads else 0))

@pure
def get_dynamic():
    from C import omp_get_dynamic() -> i32
    return bool(omp_get_dynamic())

@pure
def get_cancellation():
    from C import omp_get_cancellation() -> i32
    return bool(omp_get_cancellation())

def set_schedule(kind: str, chunk_size: int = 0):
    from C import omp_set_schedule(i32, i32)
    if kind == "static":
        omp_set_schedule(i32(1), i32(chunk_size))
    elif kind == "dynamic":
        omp_set_schedule(i32(2), i32(chunk_size))
    elif kind == "guided":
        omp_set_schedule(i32(3), i32(chunk_size))
    elif kind == "auto":
        if chunk_size != 0:
            raise ValueError("cannot specify chunk size for auto schedule")
        omp_set_schedule(i32(4), i32(chunk_size))
    else:
        raise ValueError(
            "invalid schedule kind; valid ones are: 'static', 'dynamic', 'guided', 'auto'"
        )
@pure
def get_schedule():
    from C import omp_get_schedule(Ptr[i32], Ptr[i32])
    kind_code = i32(0)
    chunk_size = i32(0)
    omp_get_schedule(__ptr__(kind_code), __ptr__(chunk_size))
    idx = int(kind_code)
    kind = (
        ("static", "dynamic", "guided", "auto")[idx - 1] if 1 <= idx <= 4 else "unknown"
    )
    return kind, int(chunk_size)
@pure
def get_thread_limit():
    from C import omp_get_thread_limit() -> i32
    return int(omp_get_thread_limit())

def set_max_active_levels(max_levels: int):
    from C import omp_set_max_active_levels(i32)
    omp_set_max_active_levels(i32(max_levels))

@pure
def get_max_active_levels():
    from C import omp_get_max_active_levels() -> i32
    return int(omp_get_max_active_levels())

@pure
def get_level():
    from C import omp_get_level() -> i32
    return int(omp_get_level())

@pure
def get_ancestor_thread_num(level: int):
    from C import omp_get_ancestor_thread_num(i32) -> i32
    return int(omp_get_ancestor_thread_num(i32(level)))

@pure
def get_team_size(level: int):
    from C import omp_get_team_size(i32) -> i32
    return int(omp_get_team_size(i32(level)))

@pure
def get_active_level():
    from C import omp_get_active_level() -> i32
    return int(omp_get_active_level())

@pure
def in_final():
    from C import omp_in_final() -> i32
    return bool(omp_in_final())

@pure
def get_proc_bind():
    from C import omp_get_proc_bind() -> i32
    result = int(omp_get_proc_bind())
    if result < 0 or result > 4:
        return "unknown"
    return ("false", "true", "master", "close", "spread")[result]

def set_default_device(device_num: int):
    from C import omp_set_default_device(i32)
    omp_set_default_device(i32(device_num))

@pure
def get_default_device():
    from C import omp_get_default_device() -> i32
    return int(omp_get_default_device())

@pure
def get_num_devices():
    from C import omp_get_num_devices() -> i32
    return int(omp_get_num_devices())

@pure
def get_num_teams():
    from C import omp_get_num_teams() -> i32
    return int(omp_get_num_teams())

@pure
def get_team_num():
    from C import omp_get_team_num() -> i32
    return int(omp_get_team_num())

@pure
def is_initial_device():
    from C import omp_is_initial_device() -> i32
    return bool(omp_is_initial_device())

@pure
def get_wtime():
    from C import omp_get_wtime() -> float
    return omp_get_wtime()

@pure
def get_wtick():
    from C import omp_get_wtick() -> float
    return omp_get_wtick()
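# Region decorators: calls to a wrapped function execute under the corresponding
# OpenMP construct. Illustrative sketch of intended use (user code, not part of
# this module; assumes "import openmp as omp" and a @par-annotated loop):
#
#     @omp.critical
#     def tally(counts, i):
#         counts[i % len(counts)] += 1
#
#     @par
#     for i in range(1000):
#         tally(counts, i)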
def single(func):
    def _wrapper(*args, **kwargs):
        gtid = get_thread_num()
        loc = _default_loc()
        if _single_begin(loc, gtid) != 0:
            try:
                func(*args, **kwargs)
            finally:
                _single_end(loc, gtid)

    return _wrapper

def master(func):
    def _wrapper(*args, **kwargs):
        gtid = get_thread_num()
        loc = _default_loc()
        if _master_begin(loc, gtid) != 0:
            try:
                func(*args, **kwargs)
            finally:
                _master_end(loc, gtid)

    return _wrapper

def ordered(func):
    def _wrapper(*args, **kwargs):
        gtid = get_thread_num()
        loc = _default_loc()
        if _ordered_begin(loc, gtid) != 0:
            try:
                func(*args, **kwargs)
            finally:
                _ordered_end(loc, gtid)

    return _wrapper

_default_lock = Lock()

def critical(func):
    def _wrapper(*args, **kwargs):
        gtid = get_thread_num()
        loc = _default_loc()
        _critical_begin(loc, gtid, __ptr__(_default_lock))
        try:
            func(*args, **kwargs)
        finally:
            _critical_end(loc, gtid, __ptr__(_default_lock))

    return _wrapper

def _push_num_threads(num_threads: int):
    from C import __kmpc_push_num_threads(Ptr[Ident], i32, i32)
    gtid = get_thread_num()
    loc = _default_loc()
    __kmpc_push_num_threads(loc, i32(gtid), i32(num_threads))
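# Atomic update helpers. Plain integer add/and/or/xor/min/max are lowered to LLVM
# atomicrmw instructions; integer multiply and the floating-point operations go
# through the runtime's __kmpc_atomic_* entry points instead.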
@llvm
def _atomic_int_add(a: Ptr[int], b: int) -> None:
    %old = atomicrmw add ptr %a, i64 %b monotonic
    ret {} {}

def _atomic_int_mul(a: Ptr[int], b: int):
    from C import __kmpc_atomic_fixed8_mul(Ptr[Ident], i32, Ptr[int], int)
    __kmpc_atomic_fixed8_mul(_default_loc(), i32(0), a, b)

@llvm
def _atomic_int_and(a: Ptr[int], b: int) -> None:
    %old = atomicrmw and ptr %a, i64 %b monotonic
    ret {} {}

@llvm
def _atomic_int_or(a: Ptr[int], b: int) -> None:
    %old = atomicrmw or ptr %a, i64 %b monotonic
    ret {} {}

@llvm
def _atomic_int_xor(a: Ptr[int], b: int) -> None:
    %old = atomicrmw xor ptr %a, i64 %b monotonic
    ret {} {}

@llvm
def _atomic_int_min(a: Ptr[int], b: int) -> None:
    %old = atomicrmw min ptr %a, i64 %b monotonic
    ret {} {}

@llvm
def _atomic_int_max(a: Ptr[int], b: int) -> None:
    %old = atomicrmw max ptr %a, i64 %b monotonic
    ret {} {}

def _atomic_float_add(a: Ptr[float], b: float) -> None:
    from C import __kmpc_atomic_float8_add(Ptr[Ident], i32, Ptr[float], float)
    __kmpc_atomic_float8_add(_default_loc(), i32(0), a, b)

def _atomic_float_mul(a: Ptr[float], b: float):
    from C import __kmpc_atomic_float8_mul(Ptr[Ident], i32, Ptr[float], float)
    __kmpc_atomic_float8_mul(_default_loc(), i32(0), a, b)

def _atomic_float_min(a: Ptr[float], b: float):
    from C import __kmpc_atomic_float8_min(Ptr[Ident], i32, Ptr[float], float)
    __kmpc_atomic_float8_min(_default_loc(), i32(0), a, b)

def _atomic_float_max(a: Ptr[float], b: float):
    from C import __kmpc_atomic_float8_max(Ptr[Ident], i32, Ptr[float], float)
    __kmpc_atomic_float8_max(_default_loc(), i32(0), a, b)

def _atomic_float32_add(a: Ptr[float32], b: float32) -> None:
    from C import __kmpc_atomic_float4_add(Ptr[Ident], i32, Ptr[float32], float32)
    __kmpc_atomic_float4_add(_default_loc(), i32(0), a, b)

def _atomic_float32_mul(a: Ptr[float32], b: float32):
    from C import __kmpc_atomic_float4_mul(Ptr[Ident], i32, Ptr[float32], float32)
    __kmpc_atomic_float4_mul(_default_loc(), i32(0), a, b)

def _atomic_float32_min(a: Ptr[float32], b: float32) -> None:
    from C import __kmpc_atomic_float4_min(Ptr[Ident], i32, Ptr[float32], float32)
    __kmpc_atomic_float4_min(_default_loc(), i32(0), a, b)

def _atomic_float32_max(a: Ptr[float32], b: float32) -> None:
    from C import __kmpc_atomic_float4_max(Ptr[Ident], i32, Ptr[float32], float32)
    __kmpc_atomic_float4_max(_default_loc(), i32(0), a, b)

def _range_len(start: int, stop: int, step: int):
    if step > 0 and start < stop:
        return 1 + (stop - 1 - start) // step
    elif step < 0 and start > stop:
        return 1 + (start - 1 - stop) // (-step)
    else:
        return 0
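# for_par is the user-facing parallel-loop marker: its body is intentionally
# empty, and its arguments (num_threads, chunk_size, schedule, ordered, collapse,
# gpu) are read by Codon's OpenMP lowering pass when a loop is annotated with @par.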
def for_par(
    num_threads: int = -1,
    chunk_size: int = -1,
    schedule: Static[str] = "static",
    ordered: Static[int] = False,
    collapse: Static[int] = 0,
    gpu: Static[int] = False,
):
    pass