codon/codon/cir/transform/parallel/openmp.cpp

// Copyright (C) 2022-2023 Exaloop Inc. <https://exaloop.io>

#include "openmp.h"

#include <algorithm>
#include <iterator>
#include <limits>
#include <unordered_set>

#include "codon/cir/transform/parallel/schedule.h"
#include "codon/cir/util/cloning.h"
#include "codon/cir/util/irtools.h"
#include "codon/cir/util/outlining.h"

namespace codon {
namespace ir {
namespace transform {
namespace parallel {
namespace {
// Names of the library modules that IR lookups in this file are directed at;
// they are passed as the module argument of Module::getOrRealizeType /
// Module::getOrRealizeFunc.

// module searched for the OpenMP helpers (Lock, _atomic_*, _critical_*, ...)
const std::string ompModule = "std.openmp";
// presumably the GPU counterpart of ompModule -- TODO confirm against its users
const std::string gpuModule = "std.gpu";
// module searched for the builtin `min` / `max` functions
const std::string builtinModule = "std.internal.builtin";

// Reports `msg` as a compilation warning, positioned at the file/line/column
// stored in the source info of IR value `v`.
void warn(const std::string &msg, const Value *v) {
  auto where = v->getSrcInfo();
  compilationWarning(msg, where.file, where.line, where.col);
}

// Bundle of IR types that are obtained once from a module and then reused.
struct OMPTypes {
  types::Type *i64 = nullptr;    // the module's integer type (getIntType)
  types::Type *i32 = nullptr;    // signed 32-bit integer type
  types::Type *i8ptr = nullptr;  // pointer to the module's byte type
  types::Type *i32ptr = nullptr; // pointer to `i32`

  // Members are initialized in declaration order, so `i32` is already set
  // when `i32ptr` is derived from it.
  explicit OMPTypes(Module *M)
      : i64(M->getIntType()), i32(M->getIntNType(32, /*sign=*/true)),
        i8ptr(M->getPointerType(M->getByteType())),
        i32ptr(M->getPointerType(i32)) {}
};

// Returns the variable behind an argument of an outlined-function call.
// The argument is expected to be either a VarValue or a PointerValue; any
// other kind of value trips the assertion below (the trailing return only
// matters if that assertion does not abort).
Var *getVarFromOutlinedArg(Value *arg) {
  if (auto *byValue = cast<VarValue>(arg))
    return byValue->getVar();

  if (auto *byPointer = cast<PointerValue>(arg))
    return byPointer->getVar();

  seqassertn(false, "unknown outline var");
  return nullptr;
}

// Builds a call to the `__raw__` method of `func`'s type, applied to `func`
// itself, and returns that call value.
Value *ptrFromFunc(Func *func) {
  auto *mod = func->getModule();
  auto *fnType = func->getType();

  auto *toRaw = mod->getOrRealizeMethod(fnType, "__raw__", {fnType});
  seqassertn(toRaw, "cannot find function __raw__ method");

  auto *self = mod->Nr<VarValue>(func);
  return util::call(toRaw, {self});
}

// Holder for the global lock variables used by reductions. Each lock is only
// materialized the first time it is requested, so no lock is emitted when
// none is needed.
struct ReductionLocks {
  // lock used in calls to _reduce_no_wait and _end_reduce_no_wait
  Var *mainLock = nullptr;
  // lock used in reduction critical sections
  Var *critLock = nullptr;

  // Creates a fresh global variable of the OpenMP `Lock` type, gives it a
  // unique `.omp_lock.<n>` name and prepends its initialization to the body of
  // the module's main function.
  Var *createLock(Module *M) {
    auto *lockType = M->getOrRealizeType("Lock", {}, ompModule);
    seqassertn(lockType, "openmp.Lock type not found");

    auto *lock = M->Nr<Var>(lockType, /*global=*/true);
    static int counter = 1;
    lock->setName(".omp_lock." + std::to_string(counter++));

    // add it to main function so it doesn't get demoted by IR pass
    auto *mainFunc = cast<BodiedFunc>(M->getMainFunc());
    auto *mainBody = cast<SeriesFlow>(mainFunc->getBody());
    auto *initValue = (*lockType)();
    seqassertn(initValue, "could not initialize openmp.Lock");
    mainBody->insert(mainBody->begin(), M->Nr<AssignInstr>(lock, initValue));

    return lock;
  }

  // Returns the main reduction lock, creating it on first use.
  Var *getMainLock(Module *M) {
    if (mainLock)
      return mainLock;
    mainLock = createLock(M);
    return mainLock;
  }

  // Returns the critical-section lock, creating it on first use.
  Var *getCritLock(Module *M) {
    if (critLock)
      return critLock;
    critLock = createLock(M);
    return critLock;
  }
};

// Describes a reduction performed on a shared variable: which operation is
// applied (`kind`) and which pointer-typed variable it is applied through
// (`shared`). A default-constructed Reduction has kind NONE and converts to
// false.
struct Reduction {
  enum Kind {
    NONE,
    ADD,
    MUL,
    AND,
    OR,
    XOR,
    MIN,
    MAX,
  };

  Kind kind = Kind::NONE;
  Var *shared = nullptr;

  // Returns the pointee type of `shared`, i.e. the type of the reduced value.
  // Asserts that `shared` has pointer type.
  types::Type *getType() {
    auto *ptrType = cast<types::PointerType>(shared->getType());
    seqassertn(ptrType, "expected shared var to be of pointer type");
    return ptrType->getBase();
  }

  // Returns a value holding the identity element of this reduction for the
  // reduced type, or nullptr if there is no reduction or no identity is known.
  //
  // - int / float / float32: a constant chosen per kind; kinds not listed for
  //   the type (e.g. bitwise operations on floats) yield nullptr.
  // - any other type: the result of default-constructing the type, accepted
  //   only if the constructed value has exactly that type.
  Value *getInitial() {
    if (!*this)
      return nullptr;
    auto *M = shared->getModule();
    auto *type = getType();

    if (isA<types::IntType>(type)) {
      switch (kind) {
      case Kind::ADD:
        return M->getInt(0);
      case Kind::MUL:
        return M->getInt(1);
      case Kind::AND:
        return M->getInt(~0); // all bits set
      case Kind::OR:
        return M->getInt(0);
      case Kind::XOR:
        return M->getInt(0);
      case Kind::MIN:
        return M->getInt(std::numeric_limits<int64_t>::max());
      case Kind::MAX:
        return M->getInt(std::numeric_limits<int64_t>::min());
      default:
        return nullptr;
      }
    } else if (isA<types::FloatType>(type)) {
      switch (kind) {
      case Kind::ADD:
        return M->getFloat(0.);
      case Kind::MUL:
        return M->getFloat(1.);
      case Kind::MIN:
        return M->getFloat(std::numeric_limits<double>::max());
      case Kind::MAX:
        // For floating-point types numeric_limits<T>::min() is the smallest
        // *positive* normalized value; the most negative finite value, which
        // is what a max-reduction must start from, is lowest().
        return M->getFloat(std::numeric_limits<double>::lowest());
      default:
        return nullptr;
      }
    } else if (isA<types::Float32Type>(type)) {
      auto *f32 = M->getOrRealizeType("float32");
      float value = 0.0;

      switch (kind) {
      case Kind::ADD:
        value = 0.0;
        break;
      case Kind::MUL:
        value = 1.0;
        break;
      case Kind::MIN:
        value = std::numeric_limits<float>::max();
        break;
      case Kind::MAX:
        // see the double case above: lowest(), not min()
        value = std::numeric_limits<float>::lowest();
        break;
      default:
        return nullptr;
      }

      // build the float32 value by converting a float constant
      return (*f32)(*M->getFloat(value));
    }

    auto *init = (*type)();
    if (!init || !init->getType()->is(type))
      return nullptr;
    return init;
  }

  // Generates `*ptr = *ptr <op> arg` with no synchronization and returns the
  // store. MIN/MAX are expressed as a call to the builtin `min`/`max` on the
  // tuple (lhs, arg). Returns nullptr for kind NONE.
  Value *generateNonAtomicReduction(Value *ptr, Value *arg) {
    auto *M = ptr->getModule();
    Value *lhs = util::ptrLoad(ptr);
    Value *result = nullptr;
    switch (kind) {
    case Kind::ADD:
      result = *lhs + *arg;
      break;
    case Kind::MUL:
      result = *lhs * *arg;
      break;
    case Kind::AND:
      result = *lhs & *arg;
      break;
    case Kind::OR:
      result = *lhs | *arg;
      break;
    case Kind::XOR:
      result = *lhs ^ *arg;
      break;
    case Kind::MIN:
    case Kind::MAX: {
      auto name = (kind == Kind::MIN ? "min" : "max");
      auto *tup = util::makeTuple({lhs, arg});
      auto *none = (*M->getNoneType())();
      auto *fn = M->getOrRealizeFunc(name, {tup->getType(), none->getType()}, {},
                                     builtinModule);
      seqassertn(fn, "{} function not found", name);
      result = util::call(fn, {tup, none});
      break;
    }
    default:
      return nullptr;
    }
    return util::ptrStore(ptr, result);
  }

  // Generates a thread-safe version of `*ptr = *ptr <op> arg`. Three
  // strategies are tried in order:
  //   1. a dedicated `_atomic_<type>_<op>` function from the OpenMP module for
  //      int / float / float32 operands;
  //   2. an `__atomic_<op>__` method on the argument's type, if one exists;
  //   3. the non-atomic update wrapped in a `_critical_begin` /
  //      `_critical_end` pair guarded by the critical-section lock.
  // `loc` and `gtid` are only required (and asserted non-null) for strategy 3.
  Value *generateAtomicReduction(Value *ptr, Value *arg, Var *loc, Var *gtid,
                                 ReductionLocks &locks) {
    auto *M = ptr->getModule();
    auto *type = getType();
    std::string func = "";

    if (isA<types::IntType>(type)) {
      switch (kind) {
      case Kind::ADD:
        func = "_atomic_int_add";
        break;
      case Kind::MUL:
        func = "_atomic_int_mul";
        break;
      case Kind::AND:
        func = "_atomic_int_and";
        break;
      case Kind::OR:
        func = "_atomic_int_or";
        break;
      case Kind::XOR:
        func = "_atomic_int_xor";
        break;
      case Kind::MIN:
        func = "_atomic_int_min";
        break;
      case Kind::MAX:
        func = "_atomic_int_max";
        break;
      default:
        break;
      }
    } else if (isA<types::FloatType>(type)) {
      switch (kind) {
      case Kind::ADD:
        func = "_atomic_float_add";
        break;
      case Kind::MUL:
        func = "_atomic_float_mul";
        break;
      case Kind::MIN:
        func = "_atomic_float_min";
        break;
      case Kind::MAX:
        func = "_atomic_float_max";
        break;
      default:
        break;
      }
    } else if (isA<types::Float32Type>(type)) {
      switch (kind) {
      case Kind::ADD:
        func = "_atomic_float32_add";
        break;
      case Kind::MUL:
        func = "_atomic_float32_mul";
        break;
      case Kind::MIN:
        func = "_atomic_float32_min";
        break;
      case Kind::MAX:
        func = "_atomic_float32_max";
        break;
      default:
        break;
      }
    }

    // strategy 1: dedicated atomic helper for primitive types
    if (!func.empty()) {
      auto *atomicOp =
          M->getOrRealizeFunc(func, {ptr->getType(), arg->getType()}, {}, ompModule);
      seqassertn(atomicOp, "atomic op '{}' not found", func);
      return util::call(atomicOp, {ptr, arg});
    }

    switch (kind) {
    case Kind::ADD:
      func = "__atomic_add__";
      break;
    case Kind::MUL:
      func = "__atomic_mul__";
      break;
    case Kind::AND:
      func = "__atomic_and__";
      break;
    case Kind::OR:
      func = "__atomic_or__";
      break;
    case Kind::XOR:
      func = "__atomic_xor__";
      break;
    case Kind::MIN:
      func = "__atomic_min__";
      break;
    case Kind::MAX:
      func = "__atomic_max__";
      break;
    default:
      break;
    }

    // strategy 2: atomic magic method on the operand type (optional, so a
    // missing method is not an error)
    if (!func.empty()) {
      auto *atomicOp =
          M->getOrRealizeMethod(arg->getType(), func, {ptr->getType(), arg->getType()});
      if (atomicOp)
        return util::call(atomicOp, {ptr, arg});
    }

    // strategy 3: critical section around the plain update
    seqassertn(loc && gtid, "loc and/or gtid are null");
    auto *lck = locks.getCritLock(M);
    auto *lckPtrType = M->getPointerType(lck->getType());
    auto *critBegin = M->getOrRealizeFunc("_critical_begin",
                                          {loc->getType(), gtid->getType(), lckPtrType},
                                          {}, ompModule);
    seqassertn(critBegin, "critical begin function not found");
    auto *critEnd = M->getOrRealizeFunc(
        "_critical_end", {loc->getType(), gtid->getType(), lckPtrType}, {}, ompModule);
    seqassertn(critEnd, "critical end function not found");

    auto *critEnter =
        util::call(critBegin, {M->Nr<VarValue>(loc), M->Nr<VarValue>(gtid),
                               M->Nr<PointerValue>(lck)});
    auto *operation = generateNonAtomicReduction(ptr, arg);
    auto *critExit = util::call(critEnd, {M->Nr<VarValue>(loc), M->Nr<VarValue>(gtid),
                                          M->Nr<PointerValue>(lck)});
    // make sure the unlock is in a finally-block
    return util::series(critEnter, M->Nr<TryCatchFlow>(util::series(operation),
                                                       util::series(critExit)));
  }

  // True iff this object describes an actual reduction (kind != NONE).
  operator bool() const { return kind != Kind::NONE; }
};

// One entry of the table of call shapes that are recognized as reduction
// operations. Initialized positionally, so the member order matters.
struct ReductionFunction {
  // name of the function or magic method to look for
  std::string name;
  // reduction kind recorded when the call matches
  Reduction::Kind kind;
  // true if `name` is matched as a method taking two values of the reduced
  // type, false if it is matched as a free function (as done for min/max)
  bool method;
};

// Operator that looks at `__setitem__` calls made through shared (pointer)
// variables and records, keyed by variable id, the reduction each one
// performs. A variable seen with two different reduction kinds is marked
// invalid by storing an empty Reduction for it.
struct ReductionIdentifier : public util::Operator {
  std::vector<Var *> shareds;                     // variables treated as shared
  Var *loopVarArg;                                // never treated as shared
  std::unordered_map<id_t, Reduction> reductions; // results, by variable id

  ReductionIdentifier()
      : util::Operator(), shareds(), loopVarArg(nullptr), reductions() {}

  ReductionIdentifier(std::vector<Var *> shareds, Var *loopVarArg)
      : util::Operator(), shareds(std::move(shareds)), loopVarArg(loopVarArg),
        reductions() {}

  // True if `shared` is one of `shareds` and is not the loop variable argument.
  bool isShared(Var *shared) {
    const auto wanted = shared->getId();
    if (loopVarArg && loopVarArg->getId() == wanted)
      return false;
    return std::any_of(shareds.begin(), shareds.end(),
                       [wanted](Var *candidate) { return candidate->getId() == wanted; });
  }

  // True if `v` is the expression `shared[0]`, i.e. a `__getitem__` method
  // call on the pointer variable `shared` with constant index 0.
  bool isSharedDeref(Var *shared, Value *v) {
    auto *M = v->getModule();
    auto *ptrType = cast<types::PointerType>(shared->getType());
    seqassertn(ptrType, "expected shared var to be of pointer type");
    auto *elemType = ptrType->getBase();

    const bool isGetitem =
        util::isCallOf(v, Module::GETITEM_MAGIC_NAME, {ptrType, M->getIntType()},
                       elemType, /*method=*/true);
    if (!isGetitem)
      return false;

    auto *getitem = cast<CallInstr>(v);
    auto *target = util::getVar(getitem->front());
    if (!util::isConst<int64_t>(getitem->back(), 0))
      return false;
    return target && target->getId() == shared->getId();
  }

  // Flattens a tree of nested `op(type, type) -> type` method calls rooted at
  // `v` into its leaf operands, appended to `result` from left to right.
  static void extractAssociativeOpChain(Value *v, const std::string &op,
                                        types::Type *type,
                                        std::vector<Value *> &result) {
    if (!util::isCallOf(v, op, {type, type}, type, /*method=*/true)) {
      // not an application of `op`: this is a leaf
      result.push_back(v);
      return;
    }

    auto *nested = cast<CallInstr>(v);
    extractAssociativeOpChain(nested->front(), op, type, result);
    extractAssociativeOpChain(nested->back(), op, type, result);
  }

  // Inspects a call and, if it has the form `shared[0] = <op involving
  // shared[0]>` for a recognized operation, returns the matching Reduction.
  // Returns an empty Reduction in every other case.
  Reduction getReductionFromCall(CallInstr *v) {
    auto *M = v->getModule();
    auto *callee = util::getFunc(v->getCallee());
    const bool looksLikeSetitem =
        v->numArgs() == 3 && callee &&
        callee->getUnmangledName() == Module::SETITEM_MAGIC_NAME;
    if (!looksLikeSetitem)
      return {};

    std::vector<Value *> operands(v->begin(), v->end());
    Value *self = operands[0];
    Value *idx = operands[1];
    Value *item = operands[2];

    Var *shared = util::getVar(self);
    if (!(shared && isShared(shared) && util::isConst<int64_t>(idx, 0)))
      return {};

    auto *ptrType = cast<types::PointerType>(shared->getType());
    seqassertn(ptrType, "expected shared var to be of pointer type");
    auto *type = ptrType->getBase();
    auto *noneType = M->getOptionalType(M->getNoneType());

    // double-check the call
    const bool wellTypedSetitem =
        util::isCallOf(v, Module::SETITEM_MAGIC_NAME,
                       {self->getType(), idx->getType(), item->getType()},
                       M->getNoneType(), /*method=*/true);
    if (!wellTypedSetitem)
      return {};

    const std::vector<ReductionFunction> reductionFunctions = {
        {Module::ADD_MAGIC_NAME, Reduction::Kind::ADD, true},
        {Module::MUL_MAGIC_NAME, Reduction::Kind::MUL, true},
        {Module::AND_MAGIC_NAME, Reduction::Kind::AND, true},
        {Module::OR_MAGIC_NAME, Reduction::Kind::OR, true},
        {Module::XOR_MAGIC_NAME, Reduction::Kind::XOR, true},
        {"min", Reduction::Kind::MIN, false},
        {"max", Reduction::Kind::MAX, false},
    };

    // first element of `range` that is `shared[0]`, or nullptr if none is
    auto firstSharedDeref = [&](auto &&range) -> Value * {
      for (auto *candidate : range) {
        if (isSharedDeref(shared, candidate))
          return candidate;
      }
      return nullptr;
    };

    for (const auto &rf : reductionFunctions) {
      bool matches = false;
      if (rf.method) {
        matches = util::isCallOf(item, rf.name, {type, type}, type, /*method=*/true);
      } else {
        matches =
            util::isCallOf(item, rf.name, {M->getTupleType({type, type}), noneType},
                           type, /*method=*/false);
      }
      if (!matches)
        continue;

      auto *rhsCall = cast<CallInstr>(item);
      Value *deref = nullptr;

      if (rf.method) {
        // operands of the (possibly nested) chain `a op b op c ...`
        std::vector<Value *> chain;
        extractAssociativeOpChain(rhsCall, rf.name, rhsCall->front()->getType(),
                                  chain);
        if (chain.size() < 2)
          continue;
        deref = firstSharedDeref(chain);
      } else {
        rhsCall = cast<CallInstr>(rhsCall->front()); // this will be Tuple.__new__
        if (!rhsCall)
          continue;
        deref = firstSharedDeref(*rhsCall);
      }

      // the right-hand side must read the shared value it writes back to
      if (!deref)
        return {};

      Reduction reduction = {rf.kind, shared};
      // a reduction without a known initial value cannot be used
      if (!reduction.getInitial())
        return {};

      return reduction;
    }

    return {};
  }

  // Returns the reduction recorded for `shared`, or an empty Reduction if none
  // was recorded.
  Reduction getReduction(Var *shared) {
    const auto found = reductions.find(shared->getId());
    if (found == reductions.end())
      return Reduction();
    return found->second;
  }

  void handle(CallInstr *v) override {
    auto reduction = getReductionFromCall(v);
    if (!reduction)
      return;

    const auto key = reduction.shared->getId();
    auto it = reductions.find(key);
    if (it == reductions.end()) {
      // first time this variable is seen
      reductions.emplace(key, reduction);
      return;
    }

    // if we've seen the var before, make sure it's consistent
    // otherwise mark as invalid via an empty reduction
    if (it->second && it->second.kind != reduction.kind)
      it->second = {};
  }
};

// Bookkeeping record for one shared variable handled by a loop template
// replacer. Initialized positionally, so the member order matters.
struct SharedInfo {
  unsigned memb;       // member index in template's `extra` arg
  Var *local;          // the local var we create to store current value
  Reduction reduction; // the reduction we're performing, or empty if none
};

struct LoopTemplateReplacer : public util::Operator {
  BodiedFunc *parent;
  CallInstr *replacement;
  Var *loopVar;

  LoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar)
      : util::Operator(), parent(parent), replacement(replacement), loopVar(loopVar) {}
};

// Loop template replacer that knows about reductions: it captures the
// location / thread-id variables exposed by the template and expands the
// `_loop_reductions` stub into the code that combines per-thread results.
struct ParallelLoopTemplateReplacer : public LoopTemplateReplacer {
  ReductionIdentifier *reds;          // source of per-variable reductions
  std::vector<SharedInfo> sharedInfo; // shared vars collected so far
  ReductionLocks locks;               // lazily created lock variables
  Var *locRef;                        // 1st arg of `_loop_loc_and_gtid`
  Var *reductionLocRef;               // 2nd arg of `_loop_loc_and_gtid`
  Var *gtid;                          // 3rd arg of `_loop_loc_and_gtid`

  ParallelLoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar,
                               ReductionIdentifier *reds)
      : LoopTemplateReplacer(parent, replacement, loopVar), reds(reds), sharedInfo(),
        locks(), locRef(nullptr), reductionLocRef(nullptr), gtid(nullptr) {}

  // Number of entries in `sharedInfo` that carry a (non-empty) reduction.
  unsigned numReductions() {
    unsigned num = 0;
    for (auto &info : sharedInfo) {
      if (info.reduction)
        num += 1;
    }
    return num;
  }

  // Builds a tuple holding a pointer to the local variable of every
  // reduction, in `sharedInfo` order.
  Value *getReductionTuple() {
    auto *M = parent->getModule();
    std::vector<Value *> elements;
    for (auto &info : sharedInfo) {
      if (info.reduction)
        elements.push_back(M->Nr<PointerValue>(info.local));
    }
    return util::makeTuple(elements, M);
  }

  // Creates the function `__omp_reducer(lhs, rhs)` that takes two pointers to
  // reduction tuples and folds every element of `rhs` into the matching
  // element of `lhs` using the non-atomic form of each reduction.
  BodiedFunc *makeReductionFunc() {
    auto *M = parent->getModule();
    auto *tupleType = getReductionTuple()->getType();
    auto *argType = M->getPointerType(tupleType);
    auto *funcType = M->getFuncType(M->getNoneType(), {argType, argType});
    auto *reducer = M->Nr<BodiedFunc>("__omp_reducer");
    reducer->realize(funcType, {"lhs", "rhs"});

    auto *lhsVar = reducer->arg_front();
    auto *rhsVar = reducer->arg_back();
    auto *body = M->Nr<SeriesFlow>();
    unsigned next = 0; // index into the reduction tuple
    for (auto &info : sharedInfo) {
      if (info.reduction) {
        auto *lhs = util::ptrLoad(M->Nr<VarValue>(lhsVar));
        auto *rhs = util::ptrLoad(M->Nr<VarValue>(rhsVar));
        auto *lhsElem = util::tupleGet(lhs, next);
        auto *rhsElem = util::tupleGet(rhs, next);
        body->push_back(
            info.reduction.generateNonAtomicReduction(lhsElem, util::ptrLoad(rhsElem)));
        ++next;
      }
    }
    reducer->setBody(body);
    return reducer;
  }

  // Handles two template stubs:
  //  - `_loop_loc_and_gtid(loc, reductionLoc, gtid)`: remembers the three
  //    variables for later use;
  //  - `_loop_reductions(extras)`: replaced by the reduction epilogue (left
  //    untouched when there are no reductions).
  void handle(CallInstr *v) override {
    auto *func = util::getFunc(v->getCallee());
    if (!func)
      return;
    auto name = func->getUnmangledName();

    if (name == "_loop_loc_and_gtid") {
      seqassertn(v->numArgs() == 3 &&
                     std::all_of(v->begin(), v->end(),
                                 [](auto x) { return isA<VarValue>(x); }),
                 "unexpected loop loc and gtid stub");
      std::vector<Value *> args(v->begin(), v->end());
      locRef = util::getVar(args[0]);
      reductionLocRef = util::getVar(args[1]);
      gtid = util::getVar(args[2]);
    }

    if (name == "_loop_reductions") {
      // the loc/gtid stub must have been visited before this one
      seqassertn(reductionLocRef && gtid, "bad visit order in template");
      seqassertn(v->numArgs() == 1 && isA<VarValue>(v->front()),
                 "unexpected shared updates stub");
      if (numReductions() == 0)
        return;

      auto *M = parent->getModule();
      auto *extras = util::getVar(v->front());
      auto *reductionTuple = getReductionTuple();
      auto *reducer = makeReductionFunc();
      auto *lck = locks.getMainLock(M);
      auto *rawReducer = ptrFromFunc(reducer);

      auto *lckPtrType = M->getPointerType(lck->getType());
      auto *reduceNoWait = M->getOrRealizeFunc(
          "_reduce_nowait",
          {reductionLocRef->getType(), gtid->getType(), reductionTuple->getType(),
           rawReducer->getType(), lckPtrType},
          {}, ompModule);
      seqassertn(reduceNoWait, "reduce nowait function not found");
      auto *reduceNoWaitEnd = M->getOrRealizeFunc(
          "_end_reduce_nowait",
          {reductionLocRef->getType(), gtid->getType(), lckPtrType}, {}, ompModule);
      seqassertn(reduceNoWaitEnd, "end reduce nowait function not found");

      auto *series = M->Nr<SeriesFlow>();
      auto *tupleVal = util::makeVar(reductionTuple, series, parent);
      auto *reduceCode = util::call(
          reduceNoWait, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid),
                         tupleVal, rawReducer, M->Nr<PointerValue>(lck)});
      // the returned code selects which of the two sections below runs
      auto *codeVar = util::makeVar(reduceCode, series, parent)->getVar();
      seqassertn(codeVar->getType()->is(M->getIntType()), "wrong reduce code type");

      auto *sectionNonAtomic = M->Nr<SeriesFlow>();
      auto *sectionAtomic = M->Nr<SeriesFlow>();

      // code == 1: plain updates followed by the end-of-reduction call
      for (auto &info : sharedInfo) {
        if (info.reduction) {
          Value *ptr = util::tupleGet(M->Nr<VarValue>(extras), info.memb);
          Value *arg = M->Nr<VarValue>(info.local);
          sectionNonAtomic->push_back(
              info.reduction.generateNonAtomicReduction(ptr, arg));
        }
      }
      sectionNonAtomic->push_back(util::call(
          reduceNoWaitEnd, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid),
                            M->Nr<PointerValue>(lck)}));

      // code == 2: every update is done atomically
      for (auto &info : sharedInfo) {
        if (info.reduction) {
          Value *ptr = util::tupleGet(M->Nr<VarValue>(extras), info.memb);
          Value *arg = M->Nr<VarValue>(info.local);
          sectionAtomic->push_back(
              info.reduction.generateAtomicReduction(ptr, arg, locRef, gtid, locks));
        }
      }

      // make: if code == 1 { sectionNonAtomic } elif code == 2 { sectionAtomic }
      auto *theSwitch = M->Nr<IfFlow>(
          *M->Nr<VarValue>(codeVar) == *M->getInt(1), sectionNonAtomic,
          util::series(M->Nr<IfFlow>(*M->Nr<VarValue>(codeVar) == *M->getInt(2),
                                     sectionAtomic)));
      series->push_back(theSwitch);
      v->replaceAll(series);
    }
  }
};

// Replacer for the imperative-loop template. On top of the stubs handled by
// ParallelLoopTemplateReplacer it substitutes:
//   _loop_step           -> the constant `step`
//   _loop_body_stub      -> a call to the outlined loop body
//   _loop_shared_updates -> write-back of non-reduction shared variables
//   _loop_schedule       -> the constant `sched->code`
//   _loop_ordered        -> the constant `sched->ordered`
struct ImperativeLoopTemplateReplacer : public ParallelLoopTemplateReplacer {
  OMPSched *sched; // schedule whose `code` and `ordered` fields are substituted
  int64_t step;    // loop step substituted for `_loop_step`

  ImperativeLoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement,
                                 Var *loopVar, ReductionIdentifier *reds,
                                 OMPSched *sched, int64_t step)
      : ParallelLoopTemplateReplacer(parent, replacement, loopVar, reds), sched(sched),
        step(step) {}

  void handle(CallInstr *v) override {
    // let the base class process the loc/gtid and reduction stubs first
    ParallelLoopTemplateReplacer::handle(v);
    auto *M = v->getModule();
    auto *func = util::getFunc(v->getCallee());
    if (!func)
      return;
    auto name = func->getUnmangledName();

    if (name == "_loop_step") {
      v->replaceAll(M->getInt(step));
    }

    if (name == "_loop_body_stub") {
      // `replacement` is reset to null below, so a second stub is an error
      seqassertn(replacement, "unexpected double replacement");
      seqassertn(v->numArgs() == 2 && isA<VarValue>(v->front()) &&
                     isA<VarValue>(v->back()),
                 "unexpected loop body stub");

      auto *outlinedFunc = util::getFunc(replacement->getCallee());

      // the template passes the new loop var and extra args
      // to the body stub for convenience
      auto *newLoopVar = util::getVar(v->front());
      auto *extras = util::getVar(v->back());

      std::vector<Value *> newArgs;
      auto outlinedArgs = outlinedFunc->arg_begin(); // arg vars of *outlined func*
      unsigned next = 0; // next index in "extra" args tuple, passed to template
      // `arg` is an argument of the original outlined func call
      for (auto *arg : *replacement) {
        if (getVarFromOutlinedArg(arg)->getId() != loopVar->getId()) {
          Value *newArg = nullptr;

          // shared vars will be stored in a new var
          if (isA<PointerValue>(arg)) {
            types::Type *base = cast<types::PointerType>(arg->getType())->getBase();

            // get extras again since we'll be inserting the new var before extras local
            Var *lastArg = parent->arg_back(); // ptr to {chunk, start, stop, extras}
            Value *val = util::tupleGet(util::ptrLoad(M->Nr<VarValue>(lastArg)), 3);
            // default initial value: the current value of the shared variable
            Value *initVal = util::ptrLoad(util::tupleGet(val, next));

            // a reduction starts from its initial value instead
            Reduction reduction = reds->getReduction(*outlinedArgs);
            if (reduction) {
              initVal = reduction.getInitial();
              seqassertn(initVal && initVal->getType()->is(base),
                         "unknown reduction init value");
            }

            // the new local is prepended to the body of `parent`
            VarValue *newVar = util::makeVar(
                initVal, cast<SeriesFlow>(parent->getBody()), parent, /*prepend=*/true);
            sharedInfo.push_back({next, newVar->getVar(), reduction});

            // the outlined body receives a pointer to the local, not to the original
            newArg = M->Nr<PointerValue>(newVar->getVar());
            ++next;
          } else {
            // non-shared argument: forwarded from the extras tuple as is
            newArg = util::tupleGet(M->Nr<VarValue>(extras), next++);
          }

          newArgs.push_back(newArg);
        } else {
          // the loop variable is replaced by the template's own loop variable,
          // keeping the original by-value / by-pointer passing mode
          if (isA<VarValue>(arg)) {
            newArgs.push_back(M->Nr<VarValue>(newLoopVar));
          } else if (isA<PointerValue>(arg)) {
            newArgs.push_back(M->Nr<PointerValue>(newLoopVar));
          } else {
            seqassertn(false, "unknown outline var");
          }
        }

        // keep the outlined func's arg iterator in step with the call args
        ++outlinedArgs;
      }

      v->replaceAll(util::call(outlinedFunc, newArgs));
      replacement = nullptr;
    }

    if (name == "_loop_shared_updates") {
      // for all non-reduction shareds, set the final values
      // this will be similar to OpenMP's "lastprivate"
      seqassertn(v->numArgs() == 1 && isA<VarValue>(v->front()),
                 "unexpected shared updates stub");
      auto *extras = util::getVar(v->front());
      auto *series = M->Nr<SeriesFlow>();

      for (auto &info : sharedInfo) {
        if (info.reduction)
          continue;

        // store the local's value through the original pointer kept in extras
        auto *finalValue = M->Nr<VarValue>(info.local);
        auto *val = M->Nr<VarValue>(extras);
        auto *origPtr = util::tupleGet(val, info.memb);
        series->push_back(util::ptrStore(origPtr, finalValue));
      }

      v->replaceAll(series);
    }

    if (name == "_loop_schedule") {
      v->replaceAll(M->getInt(sched->code));
    }

    if (name == "_loop_ordered") {
      v->replaceAll(M->getBool(sched->ordered));
    }
  }
};

struct TaskLoopReductionVarReplacer : public util::Operator {
  std::vector<Var *> reductionArgs;
  std::vector<std::pair<Var *, Var *>> reductionRemap;
  BodiedFunc *parent;

  void setupReductionRemap() {
    auto *M = parent->getModule();

    for (auto *var : reductionArgs) {
      auto *newVar = M->Nr<Var>(var->getType(), /*global=*/false);
      reductionRemap.emplace_back(var, newVar);
    }
  }

  TaskLoopReductionVarReplacer(std::vector<Var *> reductionArgs, BodiedFunc *parent)
      : util::Operator(), reductionArgs(std::move(reductionArgs)), reductionRemap(),
        parent(parent) {
    setupReductionRemap();
  }

  void preHook(Node *v) override {
    for (auto &p : reductionRemap) {
      v->replaceUsedVariable(p.first->getId(), p.second);
    }
  }

  // need to do this as a separate step since otherwise the old variable
  // in the assignment will be replaced, which we don't want
  void finalize() {
    auto *M = parent->getModule();
    auto *body = cast<SeriesFlow>(parent->getBody());
    auto *gtid = parent->arg_back();

    for (auto &p : reductionRemap) {
      auto *taskRedData = M->getOrRealizeFunc(
          "_taskred_data", {M->getIntType(), p.first->getType()}, {}, ompModule);
      seqassertn(taskRedData, "could not find '_taskred_data'");

      auto *assign = M->Nr<AssignInstr>(
          p.second,
          util::call(taskRedData, {M->Nr<VarValue>(gtid), M->Nr<VarValue>(p.first)}));
      body->insert(body->begin(), assign);
      parent->push_back(p.second);
    }
  }
};

// Operator that replaces the `_task_loop_body_stub` call of the task-loop
// template with a call to the outlined loop body.
struct TaskLoopBodyStubReplacer : public util::Operator {
  // call to the outlined loop body; cleared once it has been substituted
  CallInstr *replacement;
  // one flag per outlined-func argument: true if that argument is a reduction
  std::vector<bool> reduceArgs;

  TaskLoopBodyStubReplacer(CallInstr *replacement, std::vector<bool> reduceArgs)
      : util::Operator(), replacement(replacement), reduceArgs(std::move(reduceArgs)) {}

  void handle(CallInstr *v) override {
    auto *callee = util::getFunc(v->getCallee());
    if (!callee || callee->getUnmangledName() != "_task_loop_body_stub")
      return;

    seqassertn(replacement, "unexpected double replacement");
    seqassertn(v->numArgs() == 3 && isA<VarValue>(v->front()) &&
                   isA<VarValue>(v->back()),
               "unexpected loop body stub");

    // the template passes gtid, privs and shareds to the body stub for convenience
    std::vector<Value *> stubArgs(v->begin(), v->end());
    auto *gtid = stubArgs[0];
    auto *privatesTuple = stubArgs[1];
    auto *sharedsTuple = stubArgs[2];

    const bool hasReductions =
        std::find(reduceArgs.begin(), reduceArgs.end(), true) != reduceArgs.end();

    // by-value args come from the privates tuple, by-pointer args from the
    // shareds tuple, each consumed in order
    std::vector<Value *> callArgs;
    unsigned nextPrivate = 0;
    unsigned nextShared = 0;
    for (auto *arg : *replacement) {
      if (isA<VarValue>(arg)) {
        callArgs.push_back(util::tupleGet(privatesTuple, nextPrivate++));
        continue;
      }
      if (isA<PointerValue>(arg)) {
        callArgs.push_back(util::tupleGet(sharedsTuple, nextShared++));
        continue;
      }
      // make sure we're on the last arg, which should be gtid
      // in case of reductions
      seqassertn(hasReductions && arg == replacement->back(),
                 "unknown outline var");
    }

    auto *outlinedFunc = cast<BodiedFunc>(util::getFunc(replacement->getCallee()));

    if (hasReductions) {
      callArgs.push_back(gtid);

      // collect the outlined func's args flagged as reductions ...
      std::vector<Var *> reductionArgs;
      unsigned position = 0;
      for (auto it = outlinedFunc->arg_begin(); it != outlinedFunc->arg_end();
           ++it, ++position) {
        if (reduceArgs[position])
          reductionArgs.push_back(*it);
      }

      // ... and redirect their uses inside the outlined func
      TaskLoopReductionVarReplacer redrep(reductionArgs, outlinedFunc);
      outlinedFunc->accept(redrep);
      redrep.finalize();
    }

    v->replaceAll(util::call(outlinedFunc, callArgs));
    replacement = nullptr;
  }
};

// Template replacer for the task-based loop template. On top of whatever
// ParallelLoopTemplateReplacer::handle rewrites, it fills in the
// `_routine_stub`, `_taskred_setup`, `_fix_privates_and_shareds` and
// `_taskred_finish` placeholders of the (cloned) template function `parent`.
//
// `parent`, `replacement`, `loopVar`, `sharedInfo`, `reductionLocRef`, `gtid`
// and `numReductions()` are inherited; their definitions are outside this
// struct.
struct TaskLoopRoutineStubReplacer : public ParallelLoopTemplateReplacer {
  std::vector<Value *> privates; // values packed into the privates tuple
  std::vector<Value *> shareds;  // values packed into the shareds tuple
  Var *array;  // task reduction input array
  Var *tskgrp; // task group identifier

  // Populates `sharedInfo`: for every shared value that is not the loop
  // variable and whose entry in `sharedRedux` is a valid reduction, a new
  // local initialized with the reduction's initial value is prepended to the
  // body of `parent`, and {index in shareds, new local, reduction} is recorded.
  // `sharedRedux` is indexed in parallel with `shareds`.
  void setupSharedInfo(std::vector<Reduction> &sharedRedux) {
    unsigned sharedsNext = 0;
    for (auto *val : shareds) {
      if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
        if (auto &reduction = sharedRedux[sharedsNext]) {
          Var *newVar = util::getVar(util::makeVar(
              reduction.getInitial(), cast<SeriesFlow>(parent->getBody()), parent,
              /*prepend=*/true));
          sharedInfo.push_back({sharedsNext, newVar, reduction});
        }
      }
      ++sharedsNext;
    }
  }

  // parent:      template function being rewritten
  // replacement: call to the outlined loop body
  // loopVar:     loop variable of the original loop
  // reds:        reduction identifier forwarded to the base class
  // privates / shareds: values packed into the privates / shareds tuples
  // sharedRedux: reductions, indexed in parallel with `shareds`
  TaskLoopRoutineStubReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar,
                              ReductionIdentifier *reds, std::vector<Value *> privates,
                              std::vector<Value *> shareds,
                              std::vector<Reduction> sharedRedux)
      : ParallelLoopTemplateReplacer(parent, replacement, loopVar, reds),
        privates(std::move(privates)), shareds(std::move(shareds)), array(nullptr),
        tskgrp(nullptr) {
    setupSharedInfo(sharedRedux);
  }

  // Builds a function `__red_init(lhs, rhs)` taking two pointers to the
  // reduction type; its body stores the reduction's initial value through
  // `lhs`. `rhs` is not used by the body.
  BodiedFunc *makeTaskRedInitFunc(Reduction *reduction) {
    auto *M = parent->getModule();
    auto *argType = M->getPointerType(reduction->getType());
    auto *funcType = M->getFuncType(M->getNoneType(), {argType, argType});
    auto *initializer = M->Nr<BodiedFunc>("__red_init");
    initializer->realize(funcType, {"lhs", "rhs"});

    auto *lhsVar = initializer->arg_front();
    auto *body = M->Nr<SeriesFlow>();
    auto *lhsPtr = M->Nr<VarValue>(lhsVar);
    body->push_back(util::ptrStore(lhsPtr, reduction->getInitial()));
    initializer->setBody(body);
    return initializer;
  }

  // Builds a function `__red_comb(lhs, rhs)` taking two pointers to the
  // reduction type; its body is the reduction's non-atomic update applied to
  // `lhs` with the value loaded from `rhs`.
  BodiedFunc *makeTaskRedCombFunc(Reduction *reduction) {
    auto *M = parent->getModule();
    auto *argType = M->getPointerType(reduction->getType());
    auto *funcType = M->getFuncType(M->getNoneType(), {argType, argType});
    auto *reducer = M->Nr<BodiedFunc>("__red_comb");
    reducer->realize(funcType, {"lhs", "rhs"});

    auto *lhsVar = reducer->arg_front();
    auto *rhsVar = reducer->arg_back();
    auto *body = M->Nr<SeriesFlow>();
    auto *lhsPtr = M->Nr<VarValue>(lhsVar);
    auto *rhsPtr = M->Nr<VarValue>(rhsVar);
    body->push_back(
        reduction->generateNonAtomicReduction(lhsPtr, util::ptrLoad(rhsPtr)));
    reducer->setBody(body);
    return reducer;
  }

  // Constructs a `TaskReductionInput` (from the OpenMP module) out of
  // {shar, orig, sizeof(reduction type), raw init function, raw combiner}.
  Value *makeTaskRedInput(Reduction *reduction, Value *shar, Value *orig) {
    auto *M = shar->getModule();
    auto *size = M->Nr<TypePropertyInstr>(reduction->getType(),
                                          TypePropertyInstr::Property::SIZEOF);
    auto *init = ptrFromFunc(makeTaskRedInitFunc(reduction));
    auto *comb = ptrFromFunc(makeTaskRedCombFunc(reduction));

    auto *taskRedInputType = M->getOrRealizeType("TaskReductionInput", {}, ompModule);
    seqassertn(taskRedInputType, "could not find 'TaskReductionInput' type");
    auto *result = taskRedInputType->construct({shar, orig, size, init, comb});
    seqassertn(result, "bad construction of 'TaskReductionInput' type");
    return result;
  }

  // When `v` refers to the `_routine_stub` function, clones that function,
  // substitutes the loop body stub inside the clone (via
  // TaskLoopBodyStubReplacer) and redirects `v` to the clone.
  void handle(VarValue *v) override {
    auto *M = v->getModule();
    auto *func = util::getFunc(v);
    if (func && func->getUnmangledName() == "_routine_stub") {
      // one flag per argument of the outlined call: true iff that argument is
      // a shared (pointer) argument with a recorded reduction
      std::vector<bool> reduceArgs;
      unsigned sharedsNext = 0;
      unsigned infoNext = 0;

      for (auto *arg : *replacement) {
        if (isA<VarValue>(arg)) {
          reduceArgs.push_back(false);
        } else if (isA<PointerValue>(arg)) {
          // sharedInfo entries are stored in increasing `memb` order, so a
          // single cursor (infoNext) is enough to match them up
          if (infoNext < sharedInfo.size() &&
              sharedInfo[infoNext].memb == sharedsNext &&
              sharedInfo[infoNext].reduction) {
            reduceArgs.push_back(true);
            ++infoNext;
          } else {
            reduceArgs.push_back(false);
          }
          ++sharedsNext;
        } else {
          // make sure we're on the last arg, which should be gtid
          // in case of reductions
          seqassertn(numReductions() > 0 && arg == replacement->back(),
                     "unknown outline var");
          reduceArgs.push_back(false);
        }
      }

      util::CloneVisitor cv(M);
      auto *newRoutine = cv.forceClone(func);
      TaskLoopBodyStubReplacer rep(replacement, reduceArgs);
      newRoutine->accept(rep);
      v->setVar(newRoutine);
    }
  }

  // Runs the base-class handler, then rewrites the task-loop specific
  // placeholder calls `_taskred_setup`, `_fix_privates_and_shareds` and
  // `_taskred_finish`.
  void handle(CallInstr *v) override {
    ParallelLoopTemplateReplacer::handle(v);
    auto *M = v->getModule();
    auto *func = util::getFunc(v->getCallee());
    if (!func)
      return;
    auto name = func->getUnmangledName();

    if (name == "_taskred_setup") {
      // reductionLocRef and gtid must already have been set by the time this
      // stub is visited
      seqassertn(reductionLocRef && gtid, "bad visit order in template");
      seqassertn(v->numArgs() == 1 && isA<VarValue>(v->front()),
                 "unexpected shared updates stub");
      unsigned numRed = numReductions();
      if (numRed == 0)
        return;

      auto *extras = util::getVar(v->front());

      // add task reduction inputs
      auto *taskRedInitSeries = M->Nr<SeriesFlow>();
      auto *taskRedInputType = M->getOrRealizeType("TaskReductionInput", {}, ompModule);
      seqassertn(taskRedInputType, "could not find 'TaskReductionInput' type");
      auto *irArrayType = M->getOrRealizeType("TaskReductionInputArray", {}, ompModule);
      seqassertn(irArrayType, "could not find 'TaskReductionInputArray' type");
      auto *taskRedInputsArray = util::makeVar(
          M->Nr<StackAllocInstr>(irArrayType, numRed), taskRedInitSeries, parent);
      array = util::getVar(taskRedInputsArray);
      auto *taskRedInputsArrayType = taskRedInputsArray->getType();

      auto *taskRedSetItem = M->getOrRealizeMethod(
          taskRedInputsArrayType, Module::SETITEM_MAGIC_NAME,
          {taskRedInputsArrayType, M->getIntType(), taskRedInputType});
      seqassertn(taskRedSetItem,
                 "could not find 'TaskReductionInputArray.__setitem__' method");
      // fill array slot i with the input describing the i-th reduction
      int i = 0;
      for (auto &info : sharedInfo) {
        if (info.reduction) {
          Value *shar = M->Nr<PointerValue>(info.local);
          Value *orig = util::tupleGet(M->Nr<VarValue>(extras), info.memb);
          auto *taskRedInput = makeTaskRedInput(&info.reduction, shar, orig);
          taskRedInitSeries->push_back(util::call(
              taskRedSetItem, {M->Nr<VarValue>(array), M->getInt(i++), taskRedInput}));
        }
      }

      // call _taskred_init(loc, gtid, numRed, array.ptr) and keep its result
      // as the task group identifier
      auto *arrayPtr = M->Nr<ExtractInstr>(M->Nr<VarValue>(array), "ptr");
      auto *taskRedInitFunc =
          M->getOrRealizeFunc("_taskred_init",
                              {reductionLocRef->getType(), gtid->getType(),
                               M->getIntType(), arrayPtr->getType()},
                              {}, ompModule);
      seqassertn(taskRedInitFunc, "task red init function not found");
      auto *taskRedInitResult =
          util::makeVar(util::call(taskRedInitFunc, {M->Nr<VarValue>(reductionLocRef),
                                                     M->Nr<VarValue>(gtid),
                                                     M->getInt(numRed), arrayPtr}),
                        taskRedInitSeries, parent);
      tskgrp = util::getVar(taskRedInitResult);
      v->replaceAll(taskRedInitSeries);
    }

    if (name == "_fix_privates_and_shareds") {
      std::vector<Value *> args(v->begin(), v->end());
      seqassertn(args.size() == 3, "invalid _fix_privates_and_shareds call found");
      unsigned numRed = numReductions();
      auto *newLoopVar = args[0];
      auto *privatesTuple = args[1];
      auto *sharedsTuple = args[2];

      unsigned privatesNext = 0;
      unsigned sharedsNext = 0;
      unsigned infoNext = 0;

      // a tuple is only rebuilt if at least one of its members is substituted
      bool needNewPrivates = false;
      bool needNewShareds = false;

      std::vector<Value *> newPrivates;
      std::vector<Value *> newShareds;

      for (auto *val : privates) {
        if (numRed > 0 && val == privates.back()) { // i.e. task group identifier
          seqassertn(tskgrp, "tskgrp var not set");
          newPrivates.push_back(M->Nr<VarValue>(tskgrp));
          needNewPrivates = true;
        } else if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
          newPrivates.push_back(util::tupleGet(privatesTuple, privatesNext));
        } else {
          // the loop variable is replaced by the template's own loop value
          newPrivates.push_back(newLoopVar);
          needNewPrivates = true;
        }
        ++privatesNext;
      }

      for (auto *val : shareds) {
        if (getVarFromOutlinedArg(val)->getId() != loopVar->getId()) {
          if (infoNext < sharedInfo.size() &&
              sharedInfo[infoNext].memb == sharedsNext &&
              sharedInfo[infoNext].reduction) {
            // reduction variables point at the local created in setupSharedInfo
            newShareds.push_back(M->Nr<PointerValue>(sharedInfo[infoNext].local));
            needNewShareds = true;
            ++infoNext;
          } else {
            newShareds.push_back(util::tupleGet(sharedsTuple, sharedsNext));
          }
        } else {
          newShareds.push_back(M->Nr<PointerValue>(util::getVar(newLoopVar)));
          needNewShareds = true;
        }
        ++sharedsNext;
      }

      privatesTuple = needNewPrivates ? util::makeTuple(newPrivates, M) : privatesTuple;
      sharedsTuple = needNewShareds ? util::makeTuple(newShareds, M) : sharedsTuple;

      Value *result = util::makeTuple({privatesTuple, sharedsTuple}, M);
      v->replaceAll(result);
    }

    if (name == "_taskred_finish") {
      seqassertn(reductionLocRef && gtid, "bad visit order in template");
      if (numReductions() == 0)
        return;

      auto *taskRedFini = M->getOrRealizeFunc(
          "_taskred_fini", {reductionLocRef->getType(), gtid->getType()}, {},
          ompModule);
      seqassertn(taskRedFini, "taskred finish function not found");
      v->replaceAll(util::call(
          taskRedFini, {M->Nr<VarValue>(reductionLocRef), M->Nr<VarValue>(gtid)}));
    }
  }
};

// Operator applied to a GPU kernel function: substitutes the
// `_gpu_loop_body_stub` placeholder with a call to the outlined loop body and
// every `_loop_step` call with the loop's constant step.
struct GPULoopBodyStubReplacer : public util::Operator {
  CallInstr *replacement; // call to the outlined body; cleared once consumed
  Var *loopVar;           // loop variable of the original loop
  int64_t step;           // constant loop step

  GPULoopBodyStubReplacer(CallInstr *replacement, Var *loopVar, int64_t step)
      : util::Operator(), replacement(replacement), loopVar(loopVar), step(step) {}

  void handle(CallInstr *v) override {
    auto *callee = util::getFunc(v->getCallee());
    if (!callee)
      return;
    const auto stubName = callee->getUnmangledName();

    if (stubName == "_loop_step") {
      v->replaceAll(v->getModule()->getInt(step));
      return;
    }

    if (stubName != "_gpu_loop_body_stub")
      return;

    seqassertn(replacement, "unexpected double replacement");
    seqassertn(v->numArgs() == 2, "unexpected loop body stub");

    // the stub receives two operands: the loop index and a tuple holding the
    // remaining (non-loop-variable) arguments of the outlined call
    auto *loopIndex = v->front();
    auto *packedArgs = v->back();

    std::vector<Value *> callArgs;
    unsigned packedIdx = 0;
    for (auto *arg : *replacement) {
      const bool isLoopVar = getVarFromOutlinedArg(arg)->getId() == loopVar->getId();
      if (isLoopVar) {
        callArgs.push_back(loopIndex);
      } else {
        callArgs.push_back(util::tupleGet(packedArgs, packedIdx));
        ++packedIdx;
      }
    }

    auto *outlinedFunc = cast<BodiedFunc>(util::getFunc(replacement->getCallee()));
    v->replaceAll(util::call(outlinedFunc, callArgs));
    replacement = nullptr;
  }
};

// Template replacer for the GPU loop template: the only placeholder it
// rewrites is `_loop_step`, which becomes the loop's constant step.
struct GPULoopTemplateReplacer : public LoopTemplateReplacer {
  int64_t step; // constant loop step substituted for `_loop_step` calls

  GPULoopTemplateReplacer(BodiedFunc *parent, CallInstr *replacement, Var *loopVar,
                          int64_t step)
      : LoopTemplateReplacer(parent, replacement, loopVar), step(step) {}

  void handle(CallInstr *v) override {
    auto *callee = util::getFunc(v->getCallee());
    if (callee && callee->getUnmangledName() == "_loop_step")
      v->replaceAll(v->getModule()->getInt(step));
  }
};

// Bundle returned by setupOpenMPTransform() / unpar(). Field order matters:
// instances are brace-initialized positionally.
struct OpenMPTransformData {
  util::OutlineResult outline;  // result of outlining the loop body
  std::vector<Var *> sharedVars; // outlined-function args whose kind is MODIFIED
  ReductionIdentifier reds;      // reduction identifier run over the outlined function
};

// Marks loop `v` as non-parallel and returns an empty (default-initialized)
// transform bundle. Used as the bail-out path of setupOpenMPTransform().
template <typename T> OpenMPTransformData unpar(T *v) {
  v->setParallel(false);
  OpenMPTransformData empty{};
  return empty;
}

// Common front end of the OpenMP lowering for a loop `v` (any flow type that
// offers isParallel/setParallel/getBody/getVar).
//
// Outlines the loop body into its own function, collects the outlined
// function's arguments whose kind is MODIFIED ("shared" vars) and runs a
// ReductionIdentifier over the outlined function.
//
// parent: function containing the loop; may be null
// gpu:    forwarded to the outliner as `allByValue`
//
// Returns the outline result, the shared vars and the reduction identifier.
// If the loop is not parallel, has no parent function, has a body that is
// not a SeriesFlow, or cannot be outlined, the loop is marked non-parallel
// and an empty bundle is returned; callers detect this via v->isParallel().
template <typename T>
OpenMPTransformData setupOpenMPTransform(T *v, BodiedFunc *parent, bool gpu) {
  if (!v->isParallel())
    return unpar(v);
  auto *body = cast<SeriesFlow>(v->getBody());
  if (!parent || !body)
    return unpar(v);
  auto outline = util::outlineRegion(parent, body, /*allowOutflows=*/false,
                                     /*outlineGlobals=*/true, /*allByValue=*/gpu);
  if (!outline)
    return unpar(v);

  // set up args to pass fork_call
  Var *loopVar = v->getVar();
  std::vector<Value *> outlineCallArgs(outline.call->begin(), outline.call->end());

  // shared argument vars
  std::vector<Var *> sharedVars;
  Var *loopVarArg = nullptr;
  // `i` walks the call arguments and argKinds in lock-step with the outlined
  // function's parameters
  unsigned i = 0;
  for (auto it = outline.func->arg_begin(); it != outline.func->arg_end(); ++it) {
    // pick out loop variable to pass to reduction identifier, which will
    // ensure we don't reduce over it
    if (getVarFromOutlinedArg(outlineCallArgs[i])->getId() == loopVar->getId())
      loopVarArg = *it;
    if (outline.argKinds[i] == util::OutlineResult::ArgKind::MODIFIED)
      sharedVars.push_back(*it);
    ++i;
  }
  ReductionIdentifier reds(sharedVars, loopVarArg);
  outline.func->accept(reds);

  return {outline, sharedVars, reds};
}

// Calls produced by createForkCall().
struct ForkCallData {
  CallInstr *fork = nullptr;           // call to `_fork_call`; always set by createForkCall
  CallInstr *pushNumThreads = nullptr; // call to `_push_num_threads`, or null if not needed
};

// Builds the calls that launch a parallel region.
//
// rawTemplateFunc: raw pointer value of the instantiated template function
// forkExtraArgs:   values packed into a tuple and passed to `_fork_call`
// sched:           loop schedule; if it has an i64-typed `threads` value, a
//                  `_push_num_threads` call is created as well
//
// Neither call is inserted into the IR here; that is left to the caller.
ForkCallData createForkCall(Module *M, OMPTypes &types, Value *rawTemplateFunc,
                            const std::vector<Value *> &forkExtraArgs,
                            transform::parallel::OMPSched *sched) {
  auto *extrasTuple = util::makeTuple(forkExtraArgs, M);
  auto *forkFn = M->getOrRealizeFunc(
      "_fork_call", {types.i8ptr, extrasTuple->getType()}, {}, ompModule);
  seqassertn(forkFn, "fork call function not found");

  ForkCallData calls;
  calls.fork = util::call(forkFn, {rawTemplateFunc, extrasTuple});

  const bool hasIntThreads =
      sched->threads && sched->threads->getType()->is(types.i64);
  if (!hasIntThreads)
    return calls;

  auto *pushFn = M->getOrRealizeFunc("_push_num_threads", {types.i64}, {}, ompModule);
  seqassertn(pushFn, "push num threads func not found");
  calls.pushNumThreads = util::call(pushFn, {sched->threads});
  return calls;
}

// Outcome of collapseLoop(). On success `collapsed` and `setup` are set; on
// failure `collapsed` stays null and `error` holds the reason.
struct CollapseResult {
  ImperativeForFlow *collapsed = nullptr; // the single merged loop, or null on failure
  SeriesFlow *setup = nullptr; // statements that must run before the merged loop
  std::string error;           // failure description; empty on success

  // True iff the collapse succeeded. `explicit` so the result can be tested
  // in conditions (`if (res)`) but never silently converts to an integer.
  explicit operator bool() const { return collapsed != nullptr; }
};

// Per-loop bookkeeping used while collapsing a loop nest. Members are
// default-initialized, matching the other helper structs in this file, so a
// partially filled instance never holds indeterminate values.
struct LoopRange {
  ImperativeForFlow *loop = nullptr; // the original loop
  Var *start = nullptr;              // variable holding the loop's start value
  Var *stop = nullptr;               // variable holding the loop's end value
  int64_t step = 0;                  // constant loop step
  Var *len = nullptr;                // variable holding the result of `_range_len`
};

// Collapses the `levels` outermost loops of the perfect loop nest rooted at
// `v` into a single loop over [0, product of the individual lengths).
//
// parent: function containing the loop; receives the new variables
// v:      outermost loop of the nest
// levels: number of loops to merge; must be at least 1
//
// On success the result holds the merged loop and a `setup` series (bounds
// and lengths of every loop) that the caller must insert before it. The
// merged loop body first recomputes each original loop variable from the
// flat index by successive div/mod, innermost loop first, and then runs the
// statements of the innermost original body. On failure `error` is set and
// nothing is returned in `collapsed`.
CollapseResult collapseLoop(BodiedFunc *parent, ImperativeForFlow *v, int64_t levels) {
  auto makeError = [](const std::string &message) {
    CollapseResult failed;
    failed.error = message;
    return failed;
  };

  auto *M = v->getModule();
  if (levels < 1)
    return makeError("'collapse' must be at least 1");

  // walk down the nest: every level but the last must contain exactly one
  // statement, which itself must be an imperative for-loop
  std::vector<ImperativeForFlow *> nest;
  nest.push_back(v);
  for (int64_t depth = 1; depth < levels; ++depth) {
    auto *innerBody = cast<SeriesFlow>(nest.back()->getBody());
    seqassertn(innerBody, "unexpected loop body");
    const bool singleStatement =
        std::distance(innerBody->begin(), innerBody->end()) == 1;
    if (!singleStatement || !isA<ImperativeForFlow>(innerBody->front()))
      return makeError("loop nest not collapsible");
    nest.push_back(cast<ImperativeForFlow>(innerBody->front()));
  }

  auto *setup = M->Nr<SeriesFlow>();

  auto *intType = M->getIntType();
  auto *lenCalc =
      M->getOrRealizeFunc("_range_len", {intType, intType, intType}, {}, ompModule);
  seqassertn(lenCalc, "range length calculation function not found");

  // materialize start, stop and length of every loop into `setup`
  std::vector<LoopRange> ranges;
  for (auto *loop : nest) {
    LoopRange entry;
    entry.loop = loop;
    entry.start = util::makeVar(loop->getStart(), setup, parent)->getVar();
    entry.stop = util::makeVar(loop->getEnd(), setup, parent)->getVar();
    entry.step = loop->getStep();
    auto *lenCall = util::call(lenCalc, {M->Nr<VarValue>(entry.start),
                                         M->Nr<VarValue>(entry.stop),
                                         M->getInt(entry.step)});
    entry.len = util::makeVar(lenCall, setup, parent)->getVar();
    ranges.push_back(entry);
  }

  // total iteration count = product of all lengths
  auto *totalIters = M->getInt(1);
  for (auto &entry : ranges) {
    totalIters = (*totalIters) * (*M->Nr<VarValue>(entry.len));
  }

  auto *flatVar = M->Nr<Var>(M->getIntType(), /*global=*/false);
  parent->push_back(flatVar);
  auto *mergedBody = M->Nr<SeriesFlow>();
  // the merged loop keeps the original schedule, minus the collapse request
  auto mergedSched = std::make_unique<OMPSched>(*v->getSchedule());
  mergedSched->collapse = 0;
  auto *merged = M->Nr<ImperativeForFlow>(M->getInt(0), 1, totalIters, mergedBody,
                                          flatVar, std::move(mergedSched));

  // reconstruct indices by successive divmods
  Var *remaining = flatVar; // flat index first, then the previous quotient
  for (auto it = ranges.rbegin(); it != ranges.rend(); ++it) {
    auto *quotient =
        util::makeVar(*M->Nr<VarValue>(remaining) / *M->Nr<VarValue>(it->len),
                      mergedBody, parent)
            ->getVar();
    auto *remainder =
        util::makeVar(*M->Nr<VarValue>(remaining) % *M->Nr<VarValue>(it->len),
                      mergedBody, parent)
            ->getVar();
    auto *index = *M->Nr<VarValue>(it->start) +
                  *(*M->Nr<VarValue>(remainder) * *M->getInt(it->step));
    mergedBody->push_back(M->Nr<AssignInstr>(it->loop->getVar(), index));
    remaining = quotient;
  }

  // append the statements of the innermost original loop
  auto *innermostBody = cast<SeriesFlow>(nest.back()->getBody());
  for (auto *stmt : *innermostBody) {
    mergedBody->push_back(stmt);
  }

  CollapseResult result;
  result.collapsed = merged;
  result.setup = setup;
  return result;
}
} // namespace

// Key string of the OpenMP pass (presumably the identifier under which the
// pass is registered/looked up; the consumer is outside this file).
const std::string OpenMPPass::KEY = "core-parallel-openmp";

// Lowers a parallel ForFlow to an OpenMP task loop: the body is outlined,
// a clone of the `_task_loop_outline_template` function is specialized for
// it, and the loop is replaced by a `_fork_call` on that clone.
void OpenMPPass::handle(ForFlow *v) {
  auto data = setupOpenMPTransform(v, cast<BodiedFunc>(getParentFunc()), /*gpu=*/false);
  // setupOpenMPTransform clears the parallel flag when it cannot outline
  if (!v->isParallel())
    return;

  auto &outline = data.outline;
  auto &sharedVars = data.sharedVars;
  auto &reds = data.reds;

  auto *M = v->getModule();
  auto *loopVar = v->getVar();
  auto *sched = v->getSchedule();
  OMPTypes types(M);

  // separate arguments into 'private' and 'shared'
  std::vector<Reduction> sharedRedux; // reductions corresponding to shared vars
  std::vector<Value *> privates, shareds;
  // NOTE(review): assumes the non-VarValue call args appear in the same order
  // and number as `sharedVars` (the MODIFIED args) - confirm against outliner
  unsigned i = 0;
  for (auto *arg : *outline.call) {
    if (isA<VarValue>(arg)) {
      privates.push_back(arg);
    } else {
      shareds.push_back(arg);
      sharedRedux.push_back(reds.getReduction(sharedVars[i++]));
    }
  }

  util::CloneVisitor cv(M);

  // We need to pass the task group identifier returned from
  // __kmpc_taskred_modifier_init to the task entry, so append
  // it to private data (initially as null void pointer). Also
  // we add an argument to the end of the outlined function for
  // the gtid.
  if (reds.reductions.size() > 0) {
    auto *nullPtr = types.i8ptr->construct({});
    privates.push_back(nullPtr);

    // new signature = old argument types + one trailing int
    auto *outlinedFuncType = cast<types::FuncType>(outline.func->getType());
    std::vector<types::Type *> argTypes(outlinedFuncType->begin(),
                                        outlinedFuncType->end());
    argTypes.push_back(M->getIntType());
    auto *retType = outlinedFuncType->getReturnType();

    std::vector<Var *> oldArgVars(outline.func->arg_begin(), outline.func->arg_end());
    std::vector<std::string> argNames;

    for (auto *var : oldArgVars) {
      argNames.push_back(var->getName());
    }
    argNames.push_back("gtid");

    auto *newOutlinedFunc = M->Nr<BodiedFunc>("__outlined_new");
    newOutlinedFunc->realize(M->getFuncType(retType, argTypes), argNames);

    std::vector<Var *> newArgVars(newOutlinedFunc->arg_begin(),
                                  newOutlinedFunc->arg_end());

    // clone the old body into the new function, redirecting every old
    // argument variable to the new function's argument at the same position
    std::unordered_map<id_t, Var *> remaps;
    for (unsigned i = 0; i < oldArgVars.size(); i++) {
      remaps.emplace(oldArgVars[i]->getId(), newArgVars[i]);
    }
    auto *newBody =
        cast<SeriesFlow>(cv.clone(outline.func->getBody(), newOutlinedFunc, remaps));
    newOutlinedFunc->setBody(newBody);

    // update outline struct
    outline.func = newOutlinedFunc;
    outline.call->setCallee(M->Nr<VarValue>(newOutlinedFunc));
    // int constant 0 appended to the call for the new trailing gtid argument
    outline.call->insert(outline.call->end(), M->getInt(0));
    outline.argKinds.push_back(util::OutlineResult::ArgKind::CONSTANT);
  }

  auto *privatesTuple = util::makeTuple(privates, M);
  auto *sharedsTuple = util::makeTuple(shareds, M);

  // template call
  // argument types: two i32 pointers and a pointer to the
  // (iterator, privates, shareds) tuple
  std::vector<types::Type *> templateFuncArgs = {
      types.i32ptr, types.i32ptr,
      M->getPointerType(
          M->getTupleType({v->getIter()->getType(), privatesTuple->getType(),
                           sharedsTuple->getType()}))};
  auto *templateFunc = M->getOrRealizeFunc("_task_loop_outline_template",
                                           templateFuncArgs, {}, ompModule);
  seqassertn(templateFunc, "task loop outline template not found");

  // the replacer rewrites the function it visits, so it runs on a clone of
  // the realized template
  templateFunc = cv.forceClone(templateFunc);
  TaskLoopRoutineStubReplacer rep(cast<BodiedFunc>(templateFunc), outline.call, loopVar,
                                  &reds, privates, shareds, sharedRedux);
  templateFunc->accept(rep);
  auto *rawTemplateFunc = ptrFromFunc(templateFunc);

  std::vector<Value *> forkExtraArgs = {v->getIter(), privatesTuple, sharedsTuple};

  // fork call
  auto forkData = createForkCall(M, types, rawTemplateFunc, forkExtraArgs, sched);
  // the optional thread-count call is inserted ahead of the fork
  if (forkData.pushNumThreads)
    insertBefore(forkData.pushNumThreads);
  v->replaceAll(forkData.fork);
}

// Lowers a parallel ImperativeForFlow. The loop nest is first collapsed if
// the schedule requests it; the body is then outlined and the loop is
// replaced either by a call to a GPU loop template or by a `_fork_call` on a
// clone of one of the OpenMP loop templates (dynamic / static chunked /
// static), chosen from the schedule.
void OpenMPPass::handle(ImperativeForFlow *v) {
  auto *parent = cast<BodiedFunc>(getParentFunc());

  if (v->isParallel() && v->getSchedule()->collapse != 0) {
    auto levels = v->getSchedule()->collapse;
    auto collapse = collapseLoop(parent, v, levels);

    if (collapse) {
      // continue the lowering on the merged loop; its setup statements are
      // inserted ahead of it
      v->replaceAll(collapse.collapsed);
      v = collapse.collapsed;
      insertBefore(collapse.setup);
    } else if (!collapse.error.empty()) {
      // a failed collapse is only a warning; the loop is lowered uncollapsed
      warn("could not collapse loop: " + collapse.error, v);
    }
  }

  auto data =
      setupOpenMPTransform(v, parent, (v->isParallel() && v->getSchedule()->gpu));
  // setupOpenMPTransform clears the parallel flag when it cannot outline
  if (!v->isParallel())
    return;

  auto &outline = data.outline;
  auto &sharedVars = data.sharedVars;
  auto &reds = data.reds;

  auto *M = v->getModule();
  auto *loopVar = v->getVar();
  auto *sched = v->getSchedule();
  OMPTypes types(M);

  // we disable shared vars for GPU loops
  seqassertn(!(sched->gpu && !sharedVars.empty()), "GPU-parallel loop had shared vars");

  // gather extra arguments
  // (every outlined-call argument except the loop variable itself)
  std::vector<Value *> extraArgs;
  std::vector<types::Type *> extraArgTypes;
  for (auto *arg : *outline.call) {
    if (getVarFromOutlinedArg(arg)->getId() != loopVar->getId()) {
      extraArgs.push_back(arg);
      extraArgTypes.push_back(arg->getType());
    }
  }

  // template call
  // precedence: gpu, then dynamic, then chunked static, then plain static
  std::string templateFuncName;
  if (sched->gpu) {
    templateFuncName = "_gpu_loop_outline_template";
  } else if (sched->dynamic) {
    templateFuncName = "_dynamic_loop_outline_template";
  } else if (sched->chunk) {
    templateFuncName = "_static_chunked_loop_outline_template";
  } else {
    templateFuncName = "_static_loop_outline_template";
  }

  if (sched->gpu) {
    // record the ids of all kernel-attributed functions that exist before
    // the template is realized, so the newly created one can be found below
    std::unordered_set<id_t> kernels;
    const std::string gpuAttr = "std.gpu.kernel";
    for (auto *var : *M) {
      if (auto *func = cast<BodiedFunc>(var)) {
        if (util::hasAttribute(func, gpuAttr))
          kernels.insert(func->getId());
      }
    }

    std::vector<types::Type *> templateFuncArgs = {types.i64, types.i64,
                                                   M->getTupleType(extraArgTypes)};
    // a different generic value is passed on every realization request;
    // presumably this forces a fresh template/kernel per loop - confirm
    static int64_t instance = 0;
    auto *templateFunc = M->getOrRealizeFunc(templateFuncName, templateFuncArgs,
                                             {instance++}, gpuModule);

    if (!templateFunc) {
      warn("loop not compilable for GPU; ignoring", v);
      v->setParallel(false);
      return;
    }

    // the kernel is the one kernel-attributed function that was not present
    // before the realization above
    BodiedFunc *kernel = nullptr;
    for (auto *var : *M) {
      if (auto *func = cast<BodiedFunc>(var)) {
        if (util::hasAttribute(func, gpuAttr) && kernels.count(func->getId()) == 0) {
          seqassertn(!kernel, "multiple new kernels found after instantiation");
          kernel = func;
        }
      }
    }
    seqassertn(kernel, "no new kernel found");
    // the loop body stub is replaced inside the kernel itself
    GPULoopBodyStubReplacer brep(outline.call, loopVar, v->getStep());
    kernel->accept(brep);

    util::CloneVisitor cv(M);
    templateFunc = cast<Func>(cv.forceClone(templateFunc));
    GPULoopTemplateReplacer rep(cast<BodiedFunc>(templateFunc), outline.call, loopVar,
                                v->getStep());
    templateFunc->accept(rep);
    // the GPU template is called directly (no fork call) with
    // (start, end, tuple of extra args)
    v->replaceAll(util::call(
        templateFunc, {v->getStart(), v->getEnd(), util::makeTuple(extraArgs, M)}));
  } else {
    // argument types: two i32 pointers and a pointer to a
    // (i64, i64, i64, extras tuple) tuple
    std::vector<types::Type *> templateFuncArgs = {
        types.i32ptr, types.i32ptr,
        M->getPointerType(M->getTupleType(
            {types.i64, types.i64, types.i64, M->getTupleType(extraArgTypes)}))};
    auto *templateFunc =
        M->getOrRealizeFunc(templateFuncName, templateFuncArgs, {}, ompModule);
    seqassertn(templateFunc, "imperative loop outline template not found");

    // the replacer rewrites the function it visits, so it runs on a clone of
    // the realized template
    util::CloneVisitor cv(M);
    templateFunc = cast<Func>(cv.forceClone(templateFunc));
    ImperativeLoopTemplateReplacer rep(cast<BodiedFunc>(templateFunc), outline.call,
                                       loopVar, &reds, sched, v->getStep());
    templateFunc->accept(rep);
    auto *rawTemplateFunc = ptrFromFunc(templateFunc);

    // a missing or non-i64 chunk value falls back to the constant 1
    auto *chunk = (sched->chunk && sched->chunk->getType()->is(types.i64))
                      ? sched->chunk
                      : M->getInt(1);
    std::vector<Value *> forkExtraArgs = {chunk, v->getStart(), v->getEnd()};
    for (auto *arg : extraArgs) {
      forkExtraArgs.push_back(arg);
    }

    // fork call
    auto forkData = createForkCall(M, types, rawTemplateFunc, forkExtraArgs, sched);
    // the optional thread-count call is inserted ahead of the fork
    if (forkData.pushNumThreads)
      insertBefore(forkData.pushNumThreads);
    v->replaceAll(forkData.fork);
  }
}

} // namespace parallel
} // namespace transform
} // namespace ir
} // namespace codon