mirror of https://github.com/exaloop/codon.git
1635 lines
54 KiB
C++
1635 lines
54 KiB
C++
// Copyright (C) 2022-2023 Exaloop Inc. <https://exaloop.io>
|
|
|
|
#include "openmp.h"
|
|
|
|
#include <algorithm>
|
|
#include <iterator>
|
|
#include <limits>
|
|
#include <unordered_set>
|
|
|
|
#include "codon/cir/transform/parallel/schedule.h"
|
|
#include "codon/cir/util/cloning.h"
|
|
#include "codon/cir/util/irtools.h"
|
|
#include "codon/cir/util/outlining.h"
|
|
|
|
namespace codon {
|
|
namespace ir {
|
|
namespace transform {
|
|
namespace parallel {
|
|
namespace {
|
|
const std::string ompModule = "std.openmp";
|
|
const std::string gpuModule = "std.gpu";
|
|
const std::string builtinModule = "std.internal.builtin";
|
|
|
|
void warn(const std::string &msg, const Value *v) {
|
|
auto src = v->getSrcInfo();
|
|
compilationWarning(msg, src.file, src.line, src.col);
|
|
}
|
|
|
|
struct OMPTypes {
|
|
types::Type *i64 = nullptr;
|
|
types::Type *i32 = nullptr;
|
|
types::Type *i8ptr = nullptr;
|
|
types::Type *i32ptr = nullptr;
|
|
|
|
explicit OMPTypes(Module *M) {
|
|
i64 = M->getIntType();
|
|
i32 = M->getIntNType(32, /*sign=*/true);
|
|
i8ptr = M->getPointerType(M->getByteType());
|
|
i32ptr = M->getPointerType(i32);
|
|
}
|
|
};
|
|
|
|
Var *getVarFromOutlinedArg(Value *arg) {
|
|
if (auto *val = cast<VarValue>(arg)) {
|
|
return val->getVar();
|
|
} else if (auto *val = cast<PointerValue>(arg)) {
|
|
return val->getVar();
|
|
} else {
|
|
seqassertn(false, "unknown outline var");
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
Value *ptrFromFunc(Func *func) {
|
|
auto *M = func->getModule();
|
|
auto *funcType = func->getType();
|
|
auto *rawMethod = M->getOrRealizeMethod(funcType, "__raw__", {funcType});
|
|
seqassertn(rawMethod, "cannot find function __raw__ method");
|
|
return util::call(rawMethod, {M->Nr<VarValue>(func)});
|
|
}
|
|
|
|
// we create the locks lazily to avoid them when they're not needed
|
|
struct ReductionLocks {
|
|
Var *mainLock =
|
|
nullptr; // lock used in calls to _reduce_no_wait and _end_reduce_no_wait
|
|
Var *critLock = nullptr; // lock used in reduction critical sections
|
|
|
|
Var *createLock(Module *M) {
|
|
auto *lockType = M->getOrRealizeType("Lock", {}, ompModule);
|
|
seqassertn(lockType, "openmp.Lock type not found");
|
|
auto *var = M->Nr<Var>(lockType, /*global=*/true);
|
|
static int counter = 1;
|
|
var->setName(".omp_lock." + std::to_string(counter++));
|
|
|
|
// add it to main function so it doesn't get demoted by IR pass
|
|
auto *series = cast<SeriesFlow>(cast<BodiedFunc>(M->getMainFunc())->getBody());
|
|
auto *init = (*lockType)();
|
|
seqassertn(init, "could not initialize openmp.Lock");
|
|
series->insert(series->begin(), M->Nr<AssignInstr>(var, init));
|
|
|
|
return var;
|
|
}
|
|
|
|
Var *getMainLock(Module *M) {
|
|
if (!mainLock)
|
|
mainLock = createLock(M);
|
|
return mainLock;
|
|
}
|
|
|
|
Var *getCritLock(Module *M) {
|
|
if (!critLock)
|
|
critLock = createLock(M);
|
|
return critLock;
|
|
}
|
|
};
|
|
|
|
struct Reduction {
|
|
enum Kind {
|
|
NONE,
|
|
ADD,
|
|
MUL,
|
|
AND,
|
|
OR,
|
|
XOR,
|
|
MIN,
|
|
MAX,
|
|
};
|
|
|
|
Kind kind = Kind::NONE;
|
|
Var *shared = nullptr;
|
|
|
|
types::Type *getType() {
|
|
auto *ptrType = cast<types::PointerType>(shared->getType());
|
|
seqassertn(ptrType, "expected shared var to be of pointer type");
|
|
return ptrType->getBase();
|
|
}
|
|
|
|
Value *getInitial() {
|
|
if (!*this)
|
|
return nullptr;
|
|
auto *M = shared->getModule();
|
|
auto *type = getType();
|
|
|
|
if (isA<types::IntType>(type)) {
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
return M->getInt(0);
|
|
case Kind::MUL:
|
|
return M->getInt(1);
|
|
case Kind::AND:
|
|
return M->getInt(~0);
|
|
case Kind::OR:
|
|
return M->getInt(0);
|
|
case Kind::XOR:
|
|
return M->getInt(0);
|
|
case Kind::MIN:
|
|
return M->getInt(std::numeric_limits<int64_t>::max());
|
|
case Kind::MAX:
|
|
return M->getInt(std::numeric_limits<int64_t>::min());
|
|
default:
|
|
return nullptr;
|
|
}
|
|
} else if (isA<types::FloatType>(type)) {
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
return M->getFloat(0.);
|
|
case Kind::MUL:
|
|
return M->getFloat(1.);
|
|
case Kind::MIN:
|
|
return M->getFloat(std::numeric_limits<double>::max());
|
|
case Kind::MAX:
|
|
return M->getFloat(std::numeric_limits<double>::min());
|
|
default:
|
|
return nullptr;
|
|
}
|
|
} else if (isA<types::Float32Type>(type)) {
|
|
auto *f32 = M->getOrRealizeType("float32");
|
|
float value = 0.0;
|
|
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
value = 0.0;
|
|
break;
|
|
case Kind::MUL:
|
|
value = 1.0;
|
|
break;
|
|
case Kind::MIN:
|
|
value = std::numeric_limits<float>::max();
|
|
break;
|
|
case Kind::MAX:
|
|
value = std::numeric_limits<float>::min();
|
|
break;
|
|
default:
|
|
return nullptr;
|
|
}
|
|
|
|
return (*f32)(*M->getFloat(value));
|
|
}
|
|
|
|
auto *init = (*type)();
|
|
if (!init || !init->getType()->is(type))
|
|
return nullptr;
|
|
return init;
|
|
}
|
|
|
|
Value *generateNonAtomicReduction(Value *ptr, Value *arg) {
|
|
auto *M = ptr->getModule();
|
|
Value *lhs = util::ptrLoad(ptr);
|
|
Value *result = nullptr;
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
result = *lhs + *arg;
|
|
break;
|
|
case Kind::MUL:
|
|
result = *lhs * *arg;
|
|
break;
|
|
case Kind::AND:
|
|
result = *lhs & *arg;
|
|
break;
|
|
case Kind::OR:
|
|
result = *lhs | *arg;
|
|
break;
|
|
case Kind::XOR:
|
|
result = *lhs ^ *arg;
|
|
break;
|
|
case Kind::MIN:
|
|
case Kind::MAX: {
|
|
auto name = (kind == Kind::MIN ? "min" : "max");
|
|
auto *tup = util::makeTuple({lhs, arg});
|
|
auto *none = (*M->getNoneType())();
|
|
auto *fn = M->getOrRealizeFunc(name, {tup->getType(), none->getType()}, {},
|
|
builtinModule);
|
|
seqassertn(fn, "{} function not found", name);
|
|
result = util::call(fn, {tup, none});
|
|
break;
|
|
}
|
|
default:
|
|
return nullptr;
|
|
}
|
|
return util::ptrStore(ptr, result);
|
|
}
|
|
|
|
Value *generateAtomicReduction(Value *ptr, Value *arg, Var *loc, Var *gtid,
|
|
ReductionLocks &locks) {
|
|
auto *M = ptr->getModule();
|
|
auto *type = getType();
|
|
std::string func = "";
|
|
|
|
if (isA<types::IntType>(type)) {
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
func = "_atomic_int_add";
|
|
break;
|
|
case Kind::MUL:
|
|
func = "_atomic_int_mul";
|
|
break;
|
|
case Kind::AND:
|
|
func = "_atomic_int_and";
|
|
break;
|
|
case Kind::OR:
|
|
func = "_atomic_int_or";
|
|
break;
|
|
case Kind::XOR:
|
|
func = "_atomic_int_xor";
|
|
break;
|
|
case Kind::MIN:
|
|
func = "_atomic_int_min";
|
|
break;
|
|
case Kind::MAX:
|
|
func = "_atomic_int_max";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
} else if (isA<types::FloatType>(type)) {
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
func = "_atomic_float_add";
|
|
break;
|
|
case Kind::MUL:
|
|
func = "_atomic_float_mul";
|
|
break;
|
|
case Kind::MIN:
|
|
func = "_atomic_float_min";
|
|
break;
|
|
case Kind::MAX:
|
|
func = "_atomic_float_max";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
} else if (isA<types::Float32Type>(type)) {
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
func = "_atomic_float32_add";
|
|
break;
|
|
case Kind::MUL:
|
|
func = "_atomic_float32_mul";
|
|
break;
|
|
case Kind::MIN:
|
|
func = "_atomic_float32_min";
|
|
break;
|
|
case Kind::MAX:
|
|
func = "_atomic_float32_max";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!func.empty()) {
|
|
auto *atomicOp =
|
|
M->getOrRealizeFunc(func, {ptr->getType(), arg->getType()}, {}, ompModule);
|
|
seqassertn(atomicOp, "atomic op '{}' not found", func);
|
|
return util::call(atomicOp, {ptr, arg});
|
|
}
|
|
|
|
switch (kind) {
|
|
case Kind::ADD:
|
|
func = "__atomic_add__";
|
|
break;
|
|
case Kind::MUL:
|
|
func = "__atomic_mul__";
|
|
break;
|
|
case Kind::AND:
|
|
func = "__atomic_and__";
|
|
break;
|
|
case Kind::OR:
|
|
func = "__atomic_or__";
|
|
break;
|
|
case Kind::XOR:
|
|
func = "__atomic_xor__";
|
|
break;
|
|
case Kind::MIN:
|
|
func = "__atomic_min__";
|
|
break;
|
|
case Kind::MAX:
|
|
func = "__atomic_max__";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
if (!func.empty()) {
|
|
auto *atomicOp =
|
|
M->getOrRealizeMethod(arg->getType(), func, {ptr->getType(), arg->getType()});
|
|
if (atomicOp)
|
|
return util::call(atomicOp, {ptr, arg});
|
|
}
|
|
|
|
seqassertn(loc && gtid, "loc and/or gtid are null");
|
|
auto *lck = locks.getCritLock(M);
|
|
auto *lckPtrType = M->getPointerType(lck->getType());
|
|
auto *critBegin = M->getOrRealizeFunc("_critical_begin",
|
|
{loc->getType(), gtid->getType(), lckPtrType},
|
|
{}, ompModule);
|
|
seqassertn(critBegin, "critical begin function not found");
|
|
auto *critEnd = M->getOrRealizeFunc(
|
|
"_critical_end", {loc->getType(), gtid->getType(), lckPtrType}, {}, ompModule);
|
|
seqassertn(critEnd, "critical end function not found");
|
|
|
|
auto *critEnter =
|
|
util::call(critBegin, {M->Nr<VarValue>(loc), M->Nr<VarValue>(gtid),
|
|
M->Nr<PointerValue>(lck)});
|
|
auto *operation = generateNonAtomicReduction(ptr, arg);
|
|
auto *critExit = util::call(critEnd, {M->Nr<VarValue>(loc), M->Nr<VarValue>(gtid),
|
|
M->Nr<PointerValue>(lck)});
|
|
// make sure the unlock is in a finally-block
|
|
return util::series(critEnter, M->Nr<TryCatchFlow>(util::series(operation),
|
|
util::series(critExit)));
|
|
}
|
|
|
|
operator bool() const { return kind != Kind::NONE; }
|
|
};
|
|
|
|
struct ReductionFunction {
|
|
std::string name;
|
|
Reduction::Kind kind;
|
|
bool method;
|
|
};
|
|
|
|
struct ReductionIdentifier : public util::Operator {
|
|
std::vector<Var *> shareds;
|
|
Var *loopVarArg;
|
|
std::unordered_map<id_t, Reduction> reductions;
|
|
|
|
ReductionIdentifier()
|
|
: util::Operator(), shareds(), loopVarArg(nullptr), reductions() {}
|
|
|
|
ReductionIdentifier(std::vector<Var *> shareds, Var *loopVarArg)
|
|
: util::Operator(), shareds(std::move(shareds)), loopVarArg(loopVarArg),
|
|
reductions() {}
|
|
|
|
bool isShared(Var *shared) {
|
|
if (loopVarArg && shared->getId() == loopVarArg->getId())
|
|
return false;
|
|
for (auto *v : shareds) {
|
|
if (shared->getId() == v->getId())
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
bool isSharedDeref(Var *shared, Value *v) {
|
|
auto *M = v->getModule();
|
|
auto *ptrType = cast<types::PointerType>(shared->getType());
|
|
seqassertn(ptrType, "expected shared var to be of pointer type");
|
|
auto *type = ptrType->getBase();
|
|
|
|
if (util::isCallOf(v, Module::GETITEM_MAGIC_NAME, {ptrType, M->getIntType()}, type,
|
|
/*method=*/true)) {
|
|
auto *call = cast<CallInstr>(v);
|
|
auto *var = util::getVar(call->front());
|
|
return util::isConst<int64_t>(call->back(), 0) && var &&
|
|
var->getId() == shared->getId();
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
static void extractAssociativeOpChain(Value *v, const std::string &op,
|
|
types::Type *type,
|
|
std::vector<Value *> &result) {
|
|
if (util::isCallOf(v, op, {type, type}, type, /*method=*/true)) {
|
|
auto *call = cast<CallInstr>(v);
|
|
extractAssociativeOpChain(call->front(), op, type, result);
|
|
extractAssociativeOpChain(call->back(), op, type, result);
|
|
} else {
|
|
result.push_back(v);
|
|
}
|
|
}
|
|
|
|
Reduction getReductionFromCall(CallInstr *v) {
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (v->numArgs() != 3 || !func ||
|
|
func->getUnmangledName() != Module::SETITEM_MAGIC_NAME)
|
|
return {};
|
|
|
|
std::vector<Value *> args(v->begin(), v->end());
|
|
Value *self = args[0];
|
|
Value *idx = args[1];
|
|
Value *item = args[2];
|
|
|
|
Var *shared = util::getVar(self);
|
|
if (!shared || !isShared(shared) || !util::isConst<int64_t>(idx, 0))
|
|
return {};
|
|
|
|
auto *ptrType = cast<types::PointerType>(shared->getType());
|
|
seqassertn(ptrType, "expected shared var to be of pointer type");
|
|
auto *type = ptrType->getBase();
|
|
auto *noneType = M->getOptionalType(M->getNoneType());
|
|
|
|
// double-check the call
|
|
if (!util::isCallOf(v, Module::SETITEM_MAGIC_NAME,
|
|
{self->getType(), idx->getType(), item->getType()},
|
|
M->getNoneType(), /*method=*/true))
|
|
return {};
|
|
|
|
const std::vector<ReductionFunction> reductionFunctions = {
|
|
{Module::ADD_MAGIC_NAME, Reduction::Kind::ADD, true},
|
|
{Module::MUL_MAGIC_NAME, Reduction::Kind::MUL, true},
|
|
{Module::AND_MAGIC_NAME, Reduction::Kind::AND, true},
|
|
{Module::OR_MAGIC_NAME, Reduction::Kind::OR, true},
|
|
{Module::XOR_MAGIC_NAME, Reduction::Kind::XOR, true},
|
|
{"min", Reduction::Kind::MIN, false},
|
|
{"max", Reduction::Kind::MAX, false},
|
|
};
|
|
|
|
for (auto &rf : reductionFunctions) {
|
|
if (rf.method) {
|
|
if (!util::isCallOf(item, rf.name, {type, type}, type, /*method=*/true))
|
|
continue;
|
|
} else {
|
|
if (!util::isCallOf(item, rf.name, {M->getTupleType({type, type}), noneType},
|
|
type,
|
|
/*method=*/false))
|
|
continue;
|
|
}
|
|
|
|
auto *callRHS = cast<CallInstr>(item);
|
|
Value *deref = nullptr;
|
|
|
|
if (rf.method) {
|
|
std::vector<Value *> opChain;
|
|
extractAssociativeOpChain(callRHS, rf.name, callRHS->front()->getType(),
|
|
opChain);
|
|
if (opChain.size() < 2)
|
|
continue;
|
|
|
|
for (auto *val : opChain) {
|
|
if (isSharedDeref(shared, val)) {
|
|
deref = val;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
callRHS = cast<CallInstr>(callRHS->front()); // this will be Tuple.__new__
|
|
if (!callRHS)
|
|
continue;
|
|
|
|
for (auto *val : *callRHS) {
|
|
if (isSharedDeref(shared, val)) {
|
|
deref = val;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!deref)
|
|
return {};
|
|
|
|
Reduction reduction = {rf.kind, shared};
|
|
if (!reduction.getInitial())
|
|
return {};
|
|
|
|
return reduction;
|
|
}
|
|
|
|
return {};
|
|
}
|
|
|
|
Reduction getReduction(Var *shared) {
|
|
auto it = reductions.find(shared->getId());
|
|
return (it != reductions.end()) ? it->second : Reduction();
|
|
}
|
|
|
|
void handle(CallInstr *v) override {
|
|
if (auto reduction = getReductionFromCall(v)) {
|
|
auto it = reductions.find(reduction.shared->getId());
|
|
// if we've seen the var before, make sure it's consistent
|
|
// otherwise mark as invalid via an empty reduction
|
|
if (it == reductions.end()) {
|
|
reductions.emplace(reduction.shared->getId(), reduction);
|
|
} else if (it->second && it->second.kind != reduction.kind) {
|
|
it->second = {};
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
struct SharedInfo {
|
|
unsigned memb; // member index in template's `extra` arg
|
|
Var *local; // the local var we create to store current value
|
|
Reduction reduction; // the reduction we're performing, or empty if none
|
|
};
|
|
|
|
struct LoopTemplateReplacer : public util::Operator {
|
|
BodiedFunc *parent;
|
|
CallInstr *replacement;
|
|
Var *loopVar;
|
|
|
|
LoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar)
|
|
: util::Operator(), parent(parent), replacement(replacement), loopVar(loopVar) {}
|
|
};
|
|
|
|
struct ParallelLoopTemplateReplacer : public LoopTemplateReplacer {
|
|
ReductionIdentifier *reds;
|
|
std::vector<SharedInfo> sharedInfo;
|
|
ReductionLocks locks;
|
|
Var *locRef;
|
|
Var *reductionLocRef;
|
|
Var *gtid;
|
|
|
|
ParallelLoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar,
|
|
ReductionIdentifier *reds)
|
|
: LoopTemplateReplacer(parent, replacement, loopVar), reds(reds), sharedInfo(),
|
|
locks(), locRef(nullptr), reductionLocRef(nullptr), gtid(nullptr) {}
|
|
|
|
unsigned numReductions() {
|
|
unsigned num = 0;
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction)
|
|
num += 1;
|
|
}
|
|
return num;
|
|
}
|
|
|
|
Value *getReductionTuple() {
|
|
auto *M = parent->getModule();
|
|
std::vector<Value *> elements;
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction)
|
|
elements.push_back(M->Nr<PointerValue>(info.local));
|
|
}
|
|
return util::makeTuple(elements, M);
|
|
}
|
|
|
|
BodiedFunc *makeReductionFunc() {
|
|
auto *M = parent->getModule();
|
|
auto *tupleType = getReductionTuple()->getType();
|
|
auto *argType = M->getPointerType(tupleType);
|
|
auto *funcType = M->getFuncType(M->getNoneType(), {argType, argType});
|
|
auto *reducer = M->Nr<BodiedFunc>("__omp_reducer");
|
|
reducer->realize(funcType, {"lhs", "rhs"});
|
|
|
|
auto *lhsVar = reducer->arg_front();
|
|
auto *rhsVar = reducer->arg_back();
|
|
auto *body = M->Nr<SeriesFlow>();
|
|
unsigned next = 0;
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction) {
|
|
auto *lhs = util::ptrLoad(M->Nr<VarValue>(lhsVar));
|
|
auto *rhs = util::ptrLoad(M->Nr<VarValue>(rhsVar));
|
|
auto *lhsElem = util::tupleGet(lhs, next);
|
|
auto *rhsElem = util::tupleGet(rhs, next);
|
|
body->push_back(
|
|
info.reduction.generateNonAtomicReduction(lhsElem, util::ptrLoad(rhsElem)));
|
|
++next;
|
|
}
|
|
}
|
|
reducer->setBody(body);
|
|
return reducer;
|
|
}
|
|
|
|
void handle(CallInstr *v) override {
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (!func)
|
|
return;
|
|
auto name = func->getUnmangledName();
|
|
|
|
if (name == "_loop_loc_and_gtid") {
|
|
seqassertn(v->numArgs() == 3 &&
|
|
std::all_of(v->begin(), v->end(),
|
|
[](auto x) { return isA<VarValue>(x); }),
|
|
"unexpected loop loc and gtid stub");
|
|
std::vector<Value *> args(v->begin(), v->end());
|
|
locRef = util::getVar(args[0]);
|
|
reductionLocRef = util::getVar(args[1]);
|
|
gtid = util::getVar(args[2]);
|
|
}
|
|
|
|
if (name == "_loop_reductions") {
|
|
seqassertn(reductionLocRef && gtid, "bad visit order in template");
|
|
seqassertn(v->numArgs() == 1 && isA<VarValue>(v->front()),
|
|
"unexpected shared updates stub");
|
|
if (numReductions() == 0)
|
|
return;
|
|
|
|
auto *M = parent->getModule();
|
|
auto *extras = util::getVar(v->front());
|
|
auto *reductionTuple = getReductionTuple();
|
|
auto *reducer = makeReductionFunc();
|
|
auto *lck = locks.getMainLock(M);
|
|
auto *rawReducer = ptrFromFunc(reducer);
|
|
|
|
auto *lckPtrType = M->getPointerType(lck->getType());
|
|
auto *reduceNoWait = M->getOrRealizeFunc(
|
|
"_reduce_nowait",
|
|
{reductionLocRef->getType(), gtid->getType(), reductionTuple->getType(),
|
|
rawReducer->getType(), lckPtrType},
|
|
{}, ompModule);
|
|
seqassertn(reduceNoWait, "reduce nowait function not found");
|
|
auto *reduceNoWaitEnd = M->getOrRealizeFunc(
|
|
"_end_reduce_nowait",
|
|
{reductionLocRef->getType(), gtid->getType(), lckPtrType}, {}, ompModule);
|
|
seqassertn(reduceNoWaitEnd, "end reduce nowait function not found");
|
|
|
|
auto *series = M->Nr<SeriesFlow>();
|
|
auto *tupleVal = util::makeVar(reductionTuple, series, parent);
|
|
auto *reduceCode = util::call(
|
|
reduceNoWait, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid),
|
|
tupleVal, rawReducer, M->Nr<PointerValue>(lck)});
|
|
auto *codeVar = util::makeVar(reduceCode, series, parent)->getVar();
|
|
seqassertn(codeVar->getType()->is(M->getIntType()), "wrong reduce code type");
|
|
|
|
auto *sectionNonAtomic = M->Nr<SeriesFlow>();
|
|
auto *sectionAtomic = M->Nr<SeriesFlow>();
|
|
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction) {
|
|
Value *ptr = util::tupleGet(M->Nr<VarValue>(extras), info.memb);
|
|
Value *arg = M->Nr<VarValue>(info.local);
|
|
sectionNonAtomic->push_back(
|
|
info.reduction.generateNonAtomicReduction(ptr, arg));
|
|
}
|
|
}
|
|
sectionNonAtomic->push_back(util::call(
|
|
reduceNoWaitEnd, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid),
|
|
M->Nr<PointerValue>(lck)}));
|
|
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction) {
|
|
Value *ptr = util::tupleGet(M->Nr<VarValue>(extras), info.memb);
|
|
Value *arg = M->Nr<VarValue>(info.local);
|
|
sectionAtomic->push_back(
|
|
info.reduction.generateAtomicReduction(ptr, arg, locRef, gtid, locks));
|
|
}
|
|
}
|
|
|
|
// make: if code == 1 { sectionNonAtomic } elif code == 2 { sectionAtomic }
|
|
auto *theSwitch = M->Nr<IfFlow>(
|
|
*M->Nr<VarValue>(codeVar) == *M->getInt(1), sectionNonAtomic,
|
|
util::series(M->Nr<IfFlow>(*M->Nr<VarValue>(codeVar) == *M->getInt(2),
|
|
sectionAtomic)));
|
|
series->push_back(theSwitch);
|
|
v->replaceAll(series);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct ImperativeLoopTemplateReplacer : public ParallelLoopTemplateReplacer {
|
|
OMPSched *sched;
|
|
int64_t step;
|
|
|
|
ImperativeLoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement,
|
|
Var *loopVar, ReductionIdentifier *reds,
|
|
OMPSched *sched, int64_t step)
|
|
: ParallelLoopTemplateReplacer(parent, replacement, loopVar, reds), sched(sched),
|
|
step(step) {}
|
|
|
|
void handle(CallInstr *v) override {
|
|
ParallelLoopTemplateReplacer::handle(v);
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (!func)
|
|
return;
|
|
auto name = func->getUnmangledName();
|
|
|
|
if (name == "_loop_step") {
|
|
v->replaceAll(M->getInt(step));
|
|
}
|
|
|
|
if (name == "_loop_body_stub") {
|
|
seqassertn(replacement, "unexpected double replacement");
|
|
seqassertn(v->numArgs() == 2 && isA<VarValue>(v->front()) &&
|
|
isA<VarValue>(v->back()),
|
|
"unexpected loop body stub");
|
|
|
|
auto *outlinedFunc = util::getFunc(replacement->getCallee());
|
|
|
|
// the template passes the new loop var and extra args
|
|
// to the body stub for convenience
|
|
auto *newLoopVar = util::getVar(v->front());
|
|
auto *extras = util::getVar(v->back());
|
|
|
|
std::vector<Value *> newArgs;
|
|
auto outlinedArgs = outlinedFunc->arg_begin(); // arg vars of *outlined func*
|
|
unsigned next = 0; // next index in "extra" args tuple, passed to template
|
|
// `arg` is an argument of the original outlined func call
|
|
for (auto *arg : *replacement) {
|
|
if (getVarFromOutlinedArg(arg)->getId() != loopVar->getId()) {
|
|
Value *newArg = nullptr;
|
|
|
|
// shared vars will be stored in a new var
|
|
if (isA<PointerValue>(arg)) {
|
|
types::Type *base = cast<types::PointerType>(arg->getType())->getBase();
|
|
|
|
// get extras again since we'll be inserting the new var before extras local
|
|
Var *lastArg = parent->arg_back(); // ptr to {chunk, start, stop, extras}
|
|
Value *val = util::tupleGet(util::ptrLoad(M->Nr<VarValue>(lastArg)), 3);
|
|
Value *initVal = util::ptrLoad(util::tupleGet(val, next));
|
|
|
|
Reduction reduction = reds->getReduction(*outlinedArgs);
|
|
if (reduction) {
|
|
initVal = reduction.getInitial();
|
|
seqassertn(initVal && initVal->getType()->is(base),
|
|
"unknown reduction init value");
|
|
}
|
|
|
|
VarValue *newVar = util::makeVar(
|
|
initVal, cast<SeriesFlow>(parent->getBody()), parent, /*prepend=*/true);
|
|
sharedInfo.push_back({next, newVar->getVar(), reduction});
|
|
|
|
newArg = M->Nr<PointerValue>(newVar->getVar());
|
|
++next;
|
|
} else {
|
|
newArg = util::tupleGet(M->Nr<VarValue>(extras), next++);
|
|
}
|
|
|
|
newArgs.push_back(newArg);
|
|
} else {
|
|
if (isA<VarValue>(arg)) {
|
|
newArgs.push_back(M->Nr<VarValue>(newLoopVar));
|
|
} else if (isA<PointerValue>(arg)) {
|
|
newArgs.push_back(M->Nr<PointerValue>(newLoopVar));
|
|
} else {
|
|
seqassertn(false, "unknown outline var");
|
|
}
|
|
}
|
|
|
|
++outlinedArgs;
|
|
}
|
|
|
|
v->replaceAll(util::call(outlinedFunc, newArgs));
|
|
replacement = nullptr;
|
|
}
|
|
|
|
if (name == "_loop_shared_updates") {
|
|
// for all non-reduction shareds, set the final values
|
|
// this will be similar to OpenMP's "lastprivate"
|
|
seqassertn(v->numArgs() == 1 && isA<VarValue>(v->front()),
|
|
"unexpected shared updates stub");
|
|
auto *extras = util::getVar(v->front());
|
|
auto *series = M->Nr<SeriesFlow>();
|
|
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction)
|
|
continue;
|
|
|
|
auto *finalValue = M->Nr<VarValue>(info.local);
|
|
auto *val = M->Nr<VarValue>(extras);
|
|
auto *origPtr = util::tupleGet(val, info.memb);
|
|
series->push_back(util::ptrStore(origPtr, finalValue));
|
|
}
|
|
|
|
v->replaceAll(series);
|
|
}
|
|
|
|
if (name == "_loop_schedule") {
|
|
v->replaceAll(M->getInt(sched->code));
|
|
}
|
|
|
|
if (name == "_loop_ordered") {
|
|
v->replaceAll(M->getBool(sched->ordered));
|
|
}
|
|
}
|
|
};
|
|
|
|
struct TaskLoopReductionVarReplacer : public util::Operator {
|
|
std::vector<Var *> reductionArgs;
|
|
std::vector<std::pair<Var *, Var *>> reductionRemap;
|
|
BodiedFunc *parent;
|
|
|
|
void setupReductionRemap() {
|
|
auto *M = parent->getModule();
|
|
|
|
for (auto *var : reductionArgs) {
|
|
auto *newVar = M->Nr<Var>(var->getType(), /*global=*/false);
|
|
reductionRemap.emplace_back(var, newVar);
|
|
}
|
|
}
|
|
|
|
TaskLoopReductionVarReplacer(std::vector<Var *> reductionArgs, BodiedFunc *parent)
|
|
: util::Operator(), reductionArgs(std::move(reductionArgs)), reductionRemap(),
|
|
parent(parent) {
|
|
setupReductionRemap();
|
|
}
|
|
|
|
void preHook(Node *v) override {
|
|
for (auto &p : reductionRemap) {
|
|
v->replaceUsedVariable(p.first->getId(), p.second);
|
|
}
|
|
}
|
|
|
|
// need to do this as a separate step since otherwise the old variable
|
|
// in the assignment will be replaced, which we don't want
|
|
void finalize() {
|
|
auto *M = parent->getModule();
|
|
auto *body = cast<SeriesFlow>(parent->getBody());
|
|
auto *gtid = parent->arg_back();
|
|
|
|
for (auto &p : reductionRemap) {
|
|
auto *taskRedData = M->getOrRealizeFunc(
|
|
"_taskred_data", {M->getIntType(), p.first->getType()}, {}, ompModule);
|
|
seqassertn(taskRedData, "could not find '_taskred_data'");
|
|
|
|
auto *assign = M->Nr<AssignInstr>(
|
|
p.second,
|
|
util::call(taskRedData, {M->Nr<VarValue>(gtid), M->Nr<VarValue>(p.first)}));
|
|
body->insert(body->begin(), assign);
|
|
parent->push_back(p.second);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct TaskLoopBodyStubReplacer : public util::Operator {
|
|
CallInstr *replacement;
|
|
std::vector<bool> reduceArgs;
|
|
|
|
TaskLoopBodyStubReplacer(CallInstr *replacement, std::vector<bool> reduceArgs)
|
|
: util::Operator(), replacement(replacement), reduceArgs(std::move(reduceArgs)) {}
|
|
|
|
void handle(CallInstr *v) override {
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (func && func->getUnmangledName() == "_task_loop_body_stub") {
|
|
seqassertn(replacement, "unexpected double replacement");
|
|
seqassertn(v->numArgs() == 3 && isA<VarValue>(v->front()) &&
|
|
isA<VarValue>(v->back()),
|
|
"unexpected loop body stub");
|
|
|
|
// the template passes gtid, privs and shareds to the body stub for convenience
|
|
std::vector<Value *> args(v->begin(), v->end());
|
|
auto *gtid = args[0];
|
|
auto *privatesTuple = args[1];
|
|
auto *sharedsTuple = args[2];
|
|
unsigned privatesNext = 0;
|
|
unsigned sharedsNext = 0;
|
|
std::vector<Value *> newArgs;
|
|
bool hasReductions =
|
|
std::any_of(reduceArgs.begin(), reduceArgs.end(), [](bool b) { return b; });
|
|
|
|
for (auto *arg : *replacement) {
|
|
if (isA<VarValue>(arg)) {
|
|
newArgs.push_back(util::tupleGet(privatesTuple, privatesNext++));
|
|
} else if (isA<PointerValue>(arg)) {
|
|
newArgs.push_back(util::tupleGet(sharedsTuple, sharedsNext++));
|
|
} else {
|
|
// make sure we're on the last arg, which should be gtid
|
|
// in case of reductions
|
|
seqassertn(hasReductions && arg == replacement->back(),
|
|
"unknown outline var");
|
|
}
|
|
}
|
|
|
|
auto *outlinedFunc = cast<BodiedFunc>(util::getFunc(replacement->getCallee()));
|
|
|
|
if (hasReductions) {
|
|
newArgs.push_back(gtid);
|
|
|
|
std::vector<Var *> reductionArgs;
|
|
unsigned i = 0;
|
|
for (auto it = outlinedFunc->arg_begin(); it != outlinedFunc->arg_end(); ++it) {
|
|
if (reduceArgs[i++])
|
|
reductionArgs.push_back(*it);
|
|
}
|
|
TaskLoopReductionVarReplacer redrep(reductionArgs, outlinedFunc);
|
|
outlinedFunc->accept(redrep);
|
|
redrep.finalize();
|
|
}
|
|
|
|
v->replaceAll(util::call(outlinedFunc, newArgs));
|
|
replacement = nullptr;
|
|
}
|
|
}
|
|
};
|
|
|
|
struct TaskLoopRoutineStubReplacer : public ParallelLoopTemplateReplacer {
|
|
std::vector<Value *> privates;
|
|
std::vector<Value *> shareds;
|
|
Var *array; // task reduction input array
|
|
Var *tskgrp; // task group identifier
|
|
|
|
void setupSharedInfo(std::vector<Reduction> &sharedRedux) {
|
|
unsigned sharedsNext = 0;
|
|
for (auto *val : shareds) {
|
|
if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
|
|
if (auto &reduction = sharedRedux[sharedsNext]) {
|
|
Var *newVar = util::getVar(util::makeVar(
|
|
reduction.getInitial(), cast<SeriesFlow>(parent->getBody()), parent,
|
|
/*prepend=*/true));
|
|
sharedInfo.push_back({sharedsNext, newVar, reduction});
|
|
}
|
|
}
|
|
++sharedsNext;
|
|
}
|
|
}
|
|
|
|
TaskLoopRoutineStubReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar,
|
|
ReductionIdentifier *reds, std::vector<Value *> privates,
|
|
std::vector<Value *> shareds,
|
|
std::vector<Reduction> sharedRedux)
|
|
: ParallelLoopTemplateReplacer(parent, replacement, loopVar, reds),
|
|
privates(std::move(privates)), shareds(std::move(shareds)), array(nullptr),
|
|
tskgrp(nullptr) {
|
|
setupSharedInfo(sharedRedux);
|
|
}
|
|
|
|
BodiedFunc *makeTaskRedInitFunc(Reduction *reduction) {
|
|
auto *M = parent->getModule();
|
|
auto *argType = M->getPointerType(reduction->getType());
|
|
auto *funcType = M->getFuncType(M->getNoneType(), {argType, argType});
|
|
auto *initializer = M->Nr<BodiedFunc>("__red_init");
|
|
initializer->realize(funcType, {"lhs", "rhs"});
|
|
|
|
auto *lhsVar = initializer->arg_front();
|
|
auto *body = M->Nr<SeriesFlow>();
|
|
auto *lhsPtr = M->Nr<VarValue>(lhsVar);
|
|
body->push_back(util::ptrStore(lhsPtr, reduction->getInitial()));
|
|
initializer->setBody(body);
|
|
return initializer;
|
|
}
|
|
|
|
BodiedFunc *makeTaskRedCombFunc(Reduction *reduction) {
|
|
auto *M = parent->getModule();
|
|
auto *argType = M->getPointerType(reduction->getType());
|
|
auto *funcType = M->getFuncType(M->getNoneType(), {argType, argType});
|
|
auto *reducer = M->Nr<BodiedFunc>("__red_comb");
|
|
reducer->realize(funcType, {"lhs", "rhs"});
|
|
|
|
auto *lhsVar = reducer->arg_front();
|
|
auto *rhsVar = reducer->arg_back();
|
|
auto *body = M->Nr<SeriesFlow>();
|
|
auto *lhsPtr = M->Nr<VarValue>(lhsVar);
|
|
auto *rhsPtr = M->Nr<VarValue>(rhsVar);
|
|
body->push_back(
|
|
reduction->generateNonAtomicReduction(lhsPtr, util::ptrLoad(rhsPtr)));
|
|
reducer->setBody(body);
|
|
return reducer;
|
|
}
|
|
|
|
Value *makeTaskRedInput(Reduction *reduction, Value *shar, Value *orig) {
|
|
auto *M = shar->getModule();
|
|
auto *size = M->Nr<TypePropertyInstr>(reduction->getType(),
|
|
TypePropertyInstr::Property::SIZEOF);
|
|
auto *init = ptrFromFunc(makeTaskRedInitFunc(reduction));
|
|
auto *comb = ptrFromFunc(makeTaskRedCombFunc(reduction));
|
|
|
|
auto *taskRedInputType = M->getOrRealizeType("TaskReductionInput", {}, ompModule);
|
|
seqassertn(taskRedInputType, "could not find 'TaskReductionInput' type");
|
|
auto *result = taskRedInputType->construct({shar, orig, size, init, comb});
|
|
seqassertn(result, "bad construction of 'TaskReductionInput' type");
|
|
return result;
|
|
}
|
|
|
|
void handle(VarValue *v) override {
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v);
|
|
if (func && func->getUnmangledName() == "_routine_stub") {
|
|
std::vector<bool> reduceArgs;
|
|
unsigned sharedsNext = 0;
|
|
unsigned infoNext = 0;
|
|
|
|
for (auto *arg : *replacement) {
|
|
if (isA<VarValue>(arg)) {
|
|
reduceArgs.push_back(false);
|
|
} else if (isA<PointerValue>(arg)) {
|
|
if (infoNext < sharedInfo.size() &&
|
|
sharedInfo[infoNext].memb == sharedsNext &&
|
|
sharedInfo[infoNext].reduction) {
|
|
reduceArgs.push_back(true);
|
|
++infoNext;
|
|
} else {
|
|
reduceArgs.push_back(false);
|
|
}
|
|
++sharedsNext;
|
|
} else {
|
|
// make sure we're on the last arg, which should be gtid
|
|
// in case of reductions
|
|
seqassertn(numReductions() > 0 && arg == replacement->back(),
|
|
"unknown outline var");
|
|
reduceArgs.push_back(false);
|
|
}
|
|
}
|
|
|
|
util::CloneVisitor cv(M);
|
|
auto *newRoutine = cv.forceClone(func);
|
|
TaskLoopBodyStubReplacer rep(replacement, reduceArgs);
|
|
newRoutine->accept(rep);
|
|
v->setVar(newRoutine);
|
|
}
|
|
}
|
|
|
|
void handle(CallInstr *v) override {
|
|
ParallelLoopTemplateReplacer::handle(v);
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (!func)
|
|
return;
|
|
auto name = func->getUnmangledName();
|
|
|
|
if (name == "_taskred_setup") {
|
|
seqassertn(reductionLocRef && gtid, "bad visit order in template");
|
|
seqassertn(v->numArgs() == 1 && isA<VarValue>(v->front()),
|
|
"unexpected shared updates stub");
|
|
unsigned numRed = numReductions();
|
|
if (numRed == 0)
|
|
return;
|
|
|
|
auto *M = parent->getModule();
|
|
auto *extras = util::getVar(v->front());
|
|
|
|
// add task reduction inputs
|
|
auto *taskRedInitSeries = M->Nr<SeriesFlow>();
|
|
auto *taskRedInputType = M->getOrRealizeType("TaskReductionInput", {}, ompModule);
|
|
seqassertn(taskRedInputType, "could not find 'TaskReductionInput' type");
|
|
auto *irArrayType = M->getOrRealizeType("TaskReductionInputArray", {}, ompModule);
|
|
seqassertn(irArrayType, "could not find 'TaskReductionInputArray' type");
|
|
auto *taskRedInputsArray = util::makeVar(
|
|
M->Nr<StackAllocInstr>(irArrayType, numRed), taskRedInitSeries, parent);
|
|
array = util::getVar(taskRedInputsArray);
|
|
auto *taskRedInputsArrayType = taskRedInputsArray->getType();
|
|
|
|
auto *taskRedSetItem = M->getOrRealizeMethod(
|
|
taskRedInputsArrayType, Module::SETITEM_MAGIC_NAME,
|
|
{taskRedInputsArrayType, M->getIntType(), taskRedInputType});
|
|
seqassertn(taskRedSetItem,
|
|
"could not find 'TaskReductionInputArray.__setitem__' method");
|
|
int i = 0;
|
|
for (auto &info : sharedInfo) {
|
|
if (info.reduction) {
|
|
Value *shar = M->Nr<PointerValue>(info.local);
|
|
Value *orig = util::tupleGet(M->Nr<VarValue>(extras), info.memb);
|
|
auto *taskRedInput = makeTaskRedInput(&info.reduction, shar, orig);
|
|
taskRedInitSeries->push_back(util::call(
|
|
taskRedSetItem, {M->Nr<VarValue>(array), M->getInt(i++), taskRedInput}));
|
|
}
|
|
}
|
|
|
|
auto *arrayPtr = M->Nr<ExtractInstr>(M->Nr<VarValue>(array), "ptr");
|
|
auto *taskRedInitFunc =
|
|
M->getOrRealizeFunc("_taskred_init",
|
|
{reductionLocRef->getType(), gtid->getType(),
|
|
M->getIntType(), arrayPtr->getType()},
|
|
{}, ompModule);
|
|
seqassertn(taskRedInitFunc, "task red init function not found");
|
|
auto *taskRedInitResult =
|
|
util::makeVar(util::call(taskRedInitFunc, {M->Nr<VarValue>(reductionLocRef),
|
|
M->Nr<VarValue>(gtid),
|
|
M->getInt(numRed), arrayPtr}),
|
|
taskRedInitSeries, parent);
|
|
tskgrp = util::getVar(taskRedInitResult);
|
|
v->replaceAll(taskRedInitSeries);
|
|
}
|
|
|
|
if (name == "_fix_privates_and_shareds") {
|
|
std::vector<Value *> args(v->begin(), v->end());
|
|
seqassertn(args.size() == 3, "invalid _fix_privates_and_shareds call found");
|
|
unsigned numRed = numReductions();
|
|
auto *newLoopVar = args[0];
|
|
auto *privatesTuple = args[1];
|
|
auto *sharedsTuple = args[2];
|
|
|
|
unsigned privatesNext = 0;
|
|
unsigned sharedsNext = 0;
|
|
unsigned infoNext = 0;
|
|
|
|
bool needNewPrivates = false;
|
|
bool needNewShareds = false;
|
|
|
|
std::vector<Value *> newPrivates;
|
|
std::vector<Value *> newShareds;
|
|
|
|
for (auto *val : privates) {
|
|
if (numRed > 0 && val == privates.back()) { // i.e. task group identifier
|
|
seqassertn(tskgrp, "tskgrp var not set");
|
|
newPrivates.push_back(M->Nr<VarValue>(tskgrp));
|
|
needNewPrivates = true;
|
|
} else if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
|
|
newPrivates.push_back(util::tupleGet(privatesTuple, privatesNext));
|
|
} else {
|
|
newPrivates.push_back(newLoopVar);
|
|
needNewPrivates = true;
|
|
}
|
|
++privatesNext;
|
|
}
|
|
|
|
for (auto *val : shareds) {
|
|
if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
|
|
if (infoNext < sharedInfo.size() &&
|
|
sharedInfo[infoNext].memb == sharedsNext &&
|
|
sharedInfo[infoNext].reduction) {
|
|
newShareds.push_back(M->Nr<PointerValue>(sharedInfo[infoNext].local));
|
|
needNewShareds = true;
|
|
++infoNext;
|
|
} else {
|
|
newShareds.push_back(util::tupleGet(sharedsTuple, sharedsNext));
|
|
}
|
|
} else {
|
|
newShareds.push_back(M->Nr<PointerValue>(util::getVar(newLoopVar)));
|
|
needNewShareds = true;
|
|
}
|
|
++sharedsNext;
|
|
}
|
|
|
|
privatesTuple = needNewPrivates ? util::makeTuple(newPrivates, M) : privatesTuple;
|
|
sharedsTuple = needNewShareds ? util::makeTuple(newShareds, M) : sharedsTuple;
|
|
|
|
Value *result = util::makeTuple({privatesTuple, sharedsTuple}, M);
|
|
v->replaceAll(result);
|
|
}
|
|
|
|
if (name == "_taskred_finish") {
|
|
seqassertn(reductionLocRef && gtid, "bad visit order in template");
|
|
if (numReductions() == 0)
|
|
return;
|
|
|
|
auto *taskRedFini = M->getOrRealizeFunc(
|
|
"_taskred_fini", {reductionLocRef->getType(), gtid->getType()}, {},
|
|
ompModule);
|
|
seqassertn(taskRedFini, "taskred finish function not found not found");
|
|
v->replaceAll(util::call(
|
|
taskRedFini, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid)}));
|
|
}
|
|
}
|
|
};
|
|
|
|
struct GPULoopBodyStubReplacer : public util::Operator {
|
|
CallInstr *replacement;
|
|
Var *loopVar;
|
|
int64_t step;
|
|
|
|
GPULoopBodyStubReplacer(CallInstr *replacement, Var *loopVar, int64_t step)
|
|
: util::Operator(), replacement(replacement), loopVar(loopVar), step(step) {}
|
|
|
|
void handle(CallInstr *v) override {
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (!func)
|
|
return;
|
|
auto name = func->getUnmangledName();
|
|
|
|
if (name == "_gpu_loop_body_stub") {
|
|
seqassertn(replacement, "unexpected double replacement");
|
|
seqassertn(v->numArgs() == 2, "unexpected loop body stub");
|
|
|
|
// the template passes gtid, privs and shareds to the body stub for convenience
|
|
auto *idx = v->front();
|
|
auto *args = v->back();
|
|
unsigned next = 0;
|
|
|
|
std::vector<Value *> newArgs;
|
|
for (auto *arg : *replacement) {
|
|
if (getVarFromOutlinedArg(arg)->getId() == loopVar->getId()) {
|
|
newArgs.push_back(idx);
|
|
} else {
|
|
newArgs.push_back(util::tupleGet(args, next++));
|
|
}
|
|
}
|
|
|
|
auto *outlinedFunc = cast<BodiedFunc>(util::getFunc(replacement->getCallee()));
|
|
v->replaceAll(util::call(outlinedFunc, newArgs));
|
|
replacement = nullptr;
|
|
}
|
|
|
|
if (name == "_loop_step") {
|
|
v->replaceAll(M->getInt(step));
|
|
}
|
|
}
|
|
};
|
|
|
|
struct GPULoopTemplateReplacer : public LoopTemplateReplacer {
|
|
int64_t step;
|
|
|
|
GPULoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar,
|
|
int64_t step)
|
|
: LoopTemplateReplacer(parent, replacement, loopVar), step(step) {}
|
|
|
|
void handle(CallInstr *v) override {
|
|
auto *M = v->getModule();
|
|
auto *func = util::getFunc(v->getCallee());
|
|
if (!func)
|
|
return;
|
|
auto name = func->getUnmangledName();
|
|
|
|
if (name == "_loop_step") {
|
|
v->replaceAll(M->getInt(step));
|
|
}
|
|
}
|
|
};
|
|
|
|
struct OpenMPTransformData {
|
|
util::OutlineResult outline;
|
|
std::vector<Var *> sharedVars;
|
|
ReductionIdentifier reds;
|
|
};
|
|
|
|
template <typename T> OpenMPTransformData unpar(T *v) {
|
|
v->setParallel(false);
|
|
return {{}, {}, {}};
|
|
}
|
|
|
|
template <typename T>
|
|
OpenMPTransformData setupOpenMPTransform(T *v, BodiedFunc *parent, bool gpu) {
|
|
if (!v->isParallel())
|
|
return unpar(v);
|
|
auto *M = v->getModule();
|
|
auto *body = cast<SeriesFlow>(v->getBody());
|
|
if (!parent || !body)
|
|
return unpar(v);
|
|
auto outline = util::outlineRegion(parent, body, /*allowOutflows=*/false,
|
|
/*outlineGlobals=*/true, /*allByValue=*/gpu);
|
|
if (!outline)
|
|
return unpar(v);
|
|
|
|
// set up args to pass fork_call
|
|
Var *loopVar = v->getVar();
|
|
std::vector<Value *> outlineCallArgs(outline.call->begin(), outline.call->end());
|
|
|
|
// shared argument vars
|
|
std::vector<Var *> sharedVars;
|
|
Var *loopVarArg = nullptr;
|
|
unsigned i = 0;
|
|
for (auto it = outline.func->arg_begin(); it != outline.func->arg_end(); ++it) {
|
|
// pick out loop variable to pass to reduction identifier, which will
|
|
// ensure we don't reduce over it
|
|
if (getVarFromOutlinedArg(outlineCallArgs[i])->getId() == loopVar->getId())
|
|
loopVarArg = *it;
|
|
if (outline.argKinds[i] == util::OutlineResult::ArgKind::MODIFIED)
|
|
sharedVars.push_back(*it);
|
|
++i;
|
|
}
|
|
ReductionIdentifier reds(sharedVars, loopVarArg);
|
|
outline.func->accept(reds);
|
|
|
|
return {outline, sharedVars, reds};
|
|
}
|
|
|
|
struct ForkCallData {
|
|
CallInstr *fork = nullptr;
|
|
CallInstr *pushNumThreads = nullptr;
|
|
};
|
|
|
|
ForkCallData createForkCall(Module *M, OMPTypes &types, Value *rawTemplateFunc,
|
|
const std::vector<Value *> &forkExtraArgs,
|
|
transform::parallel::OMPSched *sched) {
|
|
ForkCallData result;
|
|
auto *forkExtra = util::makeTuple(forkExtraArgs, M);
|
|
std::vector<types::Type *> forkArgTypes = {types.i8ptr, forkExtra->getType()};
|
|
auto *forkFunc = M->getOrRealizeFunc("_fork_call", forkArgTypes, {}, ompModule);
|
|
seqassertn(forkFunc, "fork call function not found");
|
|
result.fork = util::call(forkFunc, {rawTemplateFunc, forkExtra});
|
|
|
|
if (sched->threads && sched->threads->getType()->is(types.i64)) {
|
|
auto *pushNumThreadsFunc =
|
|
M->getOrRealizeFunc("_push_num_threads", {types.i64}, {}, ompModule);
|
|
seqassertn(pushNumThreadsFunc, "push num threads func not found");
|
|
result.pushNumThreads = util::call(pushNumThreadsFunc, {sched->threads});
|
|
}
|
|
return result;
|
|
}
|
|
|
|
struct CollapseResult {
|
|
ImperativeForFlow *collapsed = nullptr;
|
|
SeriesFlow *setup = nullptr;
|
|
std::string error;
|
|
|
|
operator bool() const { return collapsed != nullptr; }
|
|
};
|
|
|
|
struct LoopRange {
|
|
ImperativeForFlow *loop;
|
|
Var *start;
|
|
Var *stop;
|
|
int64_t step;
|
|
Var *len;
|
|
};
|
|
|
|
CollapseResult collapseLoop(BodiedFunc *parent, ImperativeForFlow *v, int64_t levels) {
|
|
auto fail = [](const std::string &error) {
|
|
CollapseResult bad;
|
|
bad.error = error;
|
|
return bad;
|
|
};
|
|
|
|
auto *M = v->getModule();
|
|
CollapseResult res;
|
|
if (levels < 1)
|
|
return fail("'collapse' must be at least 1");
|
|
|
|
std::vector<ImperativeForFlow *> loopNests = {v};
|
|
ImperativeForFlow *curr = v;
|
|
|
|
for (auto i = 0; i < levels - 1; i++) {
|
|
auto *body = cast<SeriesFlow>(curr->getBody());
|
|
seqassertn(body, "unexpected loop body");
|
|
if (std::distance(body->begin(), body->end()) != 1 ||
|
|
!isA<ImperativeForFlow>(body->front()))
|
|
return fail("loop nest not collapsible");
|
|
|
|
curr = cast<ImperativeForFlow>(body->front());
|
|
loopNests.push_back(curr);
|
|
}
|
|
|
|
std::vector<LoopRange> ranges;
|
|
auto *setup = M->Nr<SeriesFlow>();
|
|
|
|
auto *intType = M->getIntType();
|
|
auto *lenCalc =
|
|
M->getOrRealizeFunc("_range_len", {intType, intType, intType}, {}, ompModule);
|
|
seqassertn(lenCalc, "range length calculation function not found");
|
|
|
|
for (auto *loop : loopNests) {
|
|
LoopRange range;
|
|
range.loop = loop;
|
|
range.start = util::makeVar(loop->getStart(), setup, parent)->getVar();
|
|
range.stop = util::makeVar(loop->getEnd(), setup, parent)->getVar();
|
|
range.step = loop->getStep();
|
|
range.len = util::makeVar(util::call(lenCalc, {M->Nr<VarValue>(range.start),
|
|
M->Nr<VarValue>(range.stop),
|
|
M->getInt(range.step)}),
|
|
setup, parent)
|
|
->getVar();
|
|
ranges.push_back(range);
|
|
}
|
|
|
|
auto *numIters = M->getInt(1);
|
|
for (auto &range : ranges) {
|
|
numIters = (*numIters) * (*M->Nr<VarValue>(range.len));
|
|
}
|
|
|
|
auto *collapsedVar = M->Nr<Var>(M->getIntType(), /*global=*/false);
|
|
parent->push_back(collapsedVar);
|
|
auto *body = M->Nr<SeriesFlow>();
|
|
auto sched = std::make_unique<OMPSched>(*v->getSchedule());
|
|
sched->collapse = 0;
|
|
auto *collapsed = M->Nr<ImperativeForFlow>(M->getInt(0), 1, numIters, body,
|
|
collapsedVar, std::move(sched));
|
|
|
|
// reconstruct indices by successive divmods
|
|
Var *lastDiv = nullptr;
|
|
for (auto it = ranges.rbegin(); it != ranges.rend(); ++it) {
|
|
auto *k = lastDiv ? lastDiv : collapsedVar;
|
|
auto *div =
|
|
util::makeVar(*M->Nr<VarValue>(k) / *M->Nr<VarValue>(it->len), body, parent)
|
|
->getVar();
|
|
auto *mod =
|
|
util::makeVar(*M->Nr<VarValue>(k) % *M->Nr<VarValue>(it->len), body, parent)
|
|
->getVar();
|
|
auto *i =
|
|
*M->Nr<VarValue>(it->start) + *(*M->Nr<VarValue>(mod) * *M->getInt(it->step));
|
|
body->push_back(M->Nr<AssignInstr>(it->loop->getVar(), i));
|
|
lastDiv = div;
|
|
}
|
|
|
|
auto *oldBody = cast<SeriesFlow>(loopNests.back()->getBody());
|
|
for (auto *x : *oldBody) {
|
|
body->push_back(x);
|
|
}
|
|
|
|
res.collapsed = collapsed;
|
|
res.setup = setup;
|
|
|
|
return res;
|
|
}
|
|
} // namespace
|
|
|
|
const std::string OpenMPPass::KEY = "core-parallel-openmp";
|
|
|
|
void OpenMPPass::handle(ForFlow *v) {
|
|
auto data = setupOpenMPTransform(v, cast<BodiedFunc>(getParentFunc()), /*gpu=*/false);
|
|
if (!v->isParallel())
|
|
return;
|
|
|
|
auto &outline = data.outline;
|
|
auto &sharedVars = data.sharedVars;
|
|
auto &reds = data.reds;
|
|
|
|
auto *M = v->getModule();
|
|
auto *loopVar = v->getVar();
|
|
auto *sched = v->getSchedule();
|
|
OMPTypes types(M);
|
|
|
|
// separate arguments into 'private' and 'shared'
|
|
std::vector<Reduction> sharedRedux; // reductions corresponding to shared vars
|
|
std::vector<Value *> privates, shareds;
|
|
unsigned i = 0;
|
|
for (auto *arg : *outline.call) {
|
|
if (isA<VarValue>(arg)) {
|
|
privates.push_back(arg);
|
|
} else {
|
|
shareds.push_back(arg);
|
|
sharedRedux.push_back(reds.getReduction(sharedVars[i++]));
|
|
}
|
|
}
|
|
|
|
util::CloneVisitor cv(M);
|
|
|
|
// We need to pass the task group identifier returned from
|
|
// __kmpc_taskred_modifier_init to the task entry, so append
|
|
// it to private data (initially as null void pointer). Also
|
|
// we add an argument to the end of the outlined function for
|
|
// the gtid.
|
|
if (reds.reductions.size() > 0) {
|
|
auto *nullPtr = types.i8ptr->construct({});
|
|
privates.push_back(nullPtr);
|
|
|
|
auto *outlinedFuncType = cast<types::FuncType>(outline.func->getType());
|
|
std::vector<types::Type *> argTypes(outlinedFuncType->begin(),
|
|
outlinedFuncType->end());
|
|
argTypes.push_back(M->getIntType());
|
|
auto *retType = outlinedFuncType->getReturnType();
|
|
|
|
std::vector<Var *> oldArgVars(outline.func->arg_begin(), outline.func->arg_end());
|
|
std::vector<std::string> argNames;
|
|
|
|
for (auto *var : oldArgVars) {
|
|
argNames.push_back(var->getName());
|
|
}
|
|
argNames.push_back("gtid");
|
|
|
|
auto *newOutlinedFunc = M->Nr<BodiedFunc>("__outlined_new");
|
|
newOutlinedFunc->realize(M->getFuncType(retType, argTypes), argNames);
|
|
|
|
std::vector<Var *> newArgVars(newOutlinedFunc->arg_begin(),
|
|
newOutlinedFunc->arg_end());
|
|
|
|
std::unordered_map<id_t, Var *> remaps;
|
|
for (unsigned i = 0; i < oldArgVars.size(); i++) {
|
|
remaps.emplace(oldArgVars[i]->getId(), newArgVars[i]);
|
|
}
|
|
auto *newBody =
|
|
cast<SeriesFlow>(cv.clone(outline.func->getBody(), newOutlinedFunc, remaps));
|
|
newOutlinedFunc->setBody(newBody);
|
|
|
|
// update outline struct
|
|
outline.func = newOutlinedFunc;
|
|
outline.call->setCallee(M->Nr<VarValue>(newOutlinedFunc));
|
|
outline.call->insert(outline.call->end(), M->getInt(0));
|
|
outline.argKinds.push_back(util::OutlineResult::ArgKind::CONSTANT);
|
|
}
|
|
|
|
auto *privatesTuple = util::makeTuple(privates, M);
|
|
auto *sharedsTuple = util::makeTuple(shareds, M);
|
|
|
|
// template call
|
|
std::vector<types::Type *> templateFuncArgs = {
|
|
types.i32ptr, types.i32ptr,
|
|
M->getPointerType(
|
|
M->getTupleType({v->getIter()->getType(), privatesTuple->getType(),
|
|
sharedsTuple->getType()}))};
|
|
auto *templateFunc = M->getOrRealizeFunc("_task_loop_outline_template",
|
|
templateFuncArgs, {}, ompModule);
|
|
seqassertn(templateFunc, "task loop outline template not found");
|
|
|
|
templateFunc = cv.forceClone(templateFunc);
|
|
TaskLoopRoutineStubReplacer rep(cast<BodiedFunc>(templateFunc), outline.call, loopVar,
|
|
&reds, privates, shareds, sharedRedux);
|
|
templateFunc->accept(rep);
|
|
auto *rawTemplateFunc = ptrFromFunc(templateFunc);
|
|
|
|
std::vector<Value *> forkExtraArgs = {v->getIter(), privatesTuple, sharedsTuple};
|
|
|
|
// fork call
|
|
auto forkData = createForkCall(M, types, rawTemplateFunc, forkExtraArgs, sched);
|
|
if (forkData.pushNumThreads)
|
|
insertBefore(forkData.pushNumThreads);
|
|
v->replaceAll(forkData.fork);
|
|
}
|
|
|
|
void OpenMPPass::handle(ImperativeForFlow *v) {
|
|
auto *parent = cast<BodiedFunc>(getParentFunc());
|
|
|
|
if (v->isParallel() && v->getSchedule()->collapse != 0) {
|
|
auto levels = v->getSchedule()->collapse;
|
|
auto collapse = collapseLoop(parent, v, levels);
|
|
|
|
if (collapse) {
|
|
v->replaceAll(collapse.collapsed);
|
|
v = collapse.collapsed;
|
|
insertBefore(collapse.setup);
|
|
} else if (!collapse.error.empty()) {
|
|
warn("could not collapse loop: " + collapse.error, v);
|
|
}
|
|
}
|
|
|
|
auto data =
|
|
setupOpenMPTransform(v, parent, (v->isParallel() && v->getSchedule()->gpu));
|
|
if (!v->isParallel())
|
|
return;
|
|
|
|
auto &outline = data.outline;
|
|
auto &sharedVars = data.sharedVars;
|
|
auto &reds = data.reds;
|
|
|
|
auto *M = v->getModule();
|
|
auto *loopVar = v->getVar();
|
|
auto *sched = v->getSchedule();
|
|
OMPTypes types(M);
|
|
|
|
// we disable shared vars for GPU loops
|
|
seqassertn(!(sched->gpu && !sharedVars.empty()), "GPU-parallel loop had shared vars");
|
|
|
|
// gather extra arguments
|
|
std::vector<Value *> extraArgs;
|
|
std::vector<types::Type *> extraArgTypes;
|
|
for (auto *arg : *outline.call) {
|
|
if (getVarFromOutlinedArg(arg)->getId() != loopVar->getId()) {
|
|
extraArgs.push_back(arg);
|
|
extraArgTypes.push_back(arg->getType());
|
|
}
|
|
}
|
|
|
|
// template call
|
|
std::string templateFuncName;
|
|
if (sched->gpu) {
|
|
templateFuncName = "_gpu_loop_outline_template";
|
|
} else if (sched->dynamic) {
|
|
templateFuncName = "_dynamic_loop_outline_template";
|
|
} else if (sched->chunk) {
|
|
templateFuncName = "_static_chunked_loop_outline_template";
|
|
} else {
|
|
templateFuncName = "_static_loop_outline_template";
|
|
}
|
|
|
|
if (sched->gpu) {
|
|
std::unordered_set<id_t> kernels;
|
|
const std::string gpuAttr = "std.gpu.kernel";
|
|
for (auto *var : *M) {
|
|
if (auto *func = cast<BodiedFunc>(var)) {
|
|
if (util::hasAttribute(func, gpuAttr))
|
|
kernels.insert(func->getId());
|
|
}
|
|
}
|
|
|
|
std::vector<types::Type *> templateFuncArgs = {types.i64, types.i64,
|
|
M->getTupleType(extraArgTypes)};
|
|
static int64_t instance = 0;
|
|
auto *templateFunc = M->getOrRealizeFunc(templateFuncName, templateFuncArgs,
|
|
{instance++}, gpuModule);
|
|
|
|
if (!templateFunc) {
|
|
warn("loop not compilable for GPU; ignoring", v);
|
|
v->setParallel(false);
|
|
return;
|
|
}
|
|
|
|
BodiedFunc *kernel = nullptr;
|
|
for (auto *var : *M) {
|
|
if (auto *func = cast<BodiedFunc>(var)) {
|
|
if (util::hasAttribute(func, gpuAttr) && kernels.count(func->getId()) == 0) {
|
|
seqassertn(!kernel, "multiple new kernels found after instantiation");
|
|
kernel = func;
|
|
}
|
|
}
|
|
}
|
|
seqassertn(kernel, "no new kernel found");
|
|
GPULoopBodyStubReplacer brep(outline.call, loopVar, v->getStep());
|
|
kernel->accept(brep);
|
|
|
|
util::CloneVisitor cv(M);
|
|
templateFunc = cast<Func>(cv.forceClone(templateFunc));
|
|
GPULoopTemplateReplacer rep(cast<BodiedFunc>(templateFunc), outline.call, loopVar,
|
|
v->getStep());
|
|
templateFunc->accept(rep);
|
|
v->replaceAll(util::call(
|
|
templateFunc, {v->getStart(), v->getEnd(), util::makeTuple(extraArgs, M)}));
|
|
} else {
|
|
std::vector<types::Type *> templateFuncArgs = {
|
|
types.i32ptr, types.i32ptr,
|
|
M->getPointerType(M->getTupleType(
|
|
{types.i64, types.i64, types.i64, M->getTupleType(extraArgTypes)}))};
|
|
auto *templateFunc =
|
|
M->getOrRealizeFunc(templateFuncName, templateFuncArgs, {}, ompModule);
|
|
seqassertn(templateFunc, "imperative loop outline template not found");
|
|
|
|
util::CloneVisitor cv(M);
|
|
templateFunc = cast<Func>(cv.forceClone(templateFunc));
|
|
ImperativeLoopTemplateReplacer rep(cast<BodiedFunc>(templateFunc), outline.call,
|
|
loopVar, &reds, sched, v->getStep());
|
|
templateFunc->accept(rep);
|
|
auto *rawTemplateFunc = ptrFromFunc(templateFunc);
|
|
|
|
auto *chunk = (sched->chunk && sched->chunk->getType()->is(types.i64))
|
|
? sched->chunk
|
|
: M->getInt(1);
|
|
std::vector<Value *> forkExtraArgs = {chunk, v->getStart(), v->getEnd()};
|
|
for (auto *arg : extraArgs) {
|
|
forkExtraArgs.push_back(arg);
|
|
}
|
|
|
|
// fork call
|
|
auto forkData = createForkCall(M, types, rawTemplateFunc, forkExtraArgs, sched);
|
|
if (forkData.pushNumThreads)
|
|
insertBefore(forkData.pushNumThreads);
|
|
v->replaceAll(forkData.fork);
|
|
}
|
|
}
|
|
|
|
} // namespace parallel
|
|
} // namespace transform
|
|
} // namespace ir
|
|
} // namespace codon
|