// PEG grammar -> C++ generator.
//
// Reads a PEG grammar file, parses it with a peglib-based meta-grammar, and
// writes a C++ file containing `init_<name>_rules` and `init_<name>_actions`
// functions that rebuild the grammar and its inline C++ actions.
//
// NOTE(review): this file was recovered from a copy in which all text between
// angle brackets had been stripped and all line breaks removed. Template
// arguments were reconstructed from how each value is produced/consumed in
// this file and from the cpp-peglib API; the standard-header list below covers
// the names this file uses. Please diff against the upstream copy if available.

#include "compiler/util/peglib.h"
#include <algorithm>
#include <any>
#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <memory>
#include <set>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#define FMT_HEADER_ONLY
#include "compiler/util/fmt/format.h"

using namespace std;

/// Escapes `str` for embedding inside a string literal of the generated file.
///
/// Newline, carriage return and tab become two backslashes followed by
/// `n`/`r`/`t`; a backslash becomes two backslashes; a double quote becomes
/// backslash-quote; any byte below 32 or at/above 127 becomes two backslashes
/// followed by `x` and the byte in lowercase hex. Everything else is copied.
string escape(const string &str) {
  string r;
  for (unsigned char c : str) {
    switch (c) {
    case '\n':
      r += "\\\\n";
      break;
    case '\r':
      r += "\\\\r";
      break;
    case '\t':
      r += "\\\\t";
      break;
    case '\\':
      r += "\\\\";
      break;
    case '"':
      r += "\\\"";
      break;
    default:
      if (c < 32 || c >= 127)
        r += fmt::format("\\\\x{:x}", c);
      else
        r += c;
    }
  }
  return r;
}

/// Concatenates `items[start .. end)` separated by `delim`.
///
/// @param items  indexable container whose elements can be appended to a string
/// @param delim  separator placed between consecutive elements
/// @param start  index of the first element to include
/// @param end    one past the last element; -1 means `items.size()`
template <typename T>
string join(const T &items, const string &delim = " ", int start = 0, int end = -1) {
  string s;
  if (end == -1)
    end = items.size();
  for (int i = start; i < end; i++)
    s += (i > start ? delim : "") + items[i];
  return s;
}

/// Suffix that, when attached to a rule name in the grammar, turns memoization
/// off for that rule (see the "Definition" action below).
const string NO_PACKRAT = ":NO_PACKRAT";

namespace peg {

using Rules = std::unordered_map<std::string, std::shared_ptr<Ope>>;

/// Visitor that walks an operator tree and reports whether every rule reached
/// through it has `enable_memoize` set. While walking, it clears
/// `enable_memoize` on any referenced rule whose subtree is not memoizable.
struct SetUpPackrat : public Ope::Visitor {
  bool packrat;
  unordered_set<string> *seen; // rule names already expanded (shared by all visits)

  /// Runs a fresh visitor over `op` and returns its verdict.
  static bool check(unordered_set<string> *seen, const shared_ptr<Ope> &op) {
    SetUpPackrat v;
    v.seen = seen;
    v.packrat = true;
    op->accept(v);
    return v.packrat;
  };

  void visit(Sequence &ope) override {
    for (const auto &op : ope.opes_)
      packrat &= check(seen, op);
  }
  void visit(PrioritizedChoice &ope) override {
    for (const auto &op : ope.opes_)
      packrat &= check(seen, op);
  }
  void visit(Repetition &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(AndPredicate &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(NotPredicate &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(CaptureScope &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(Capture &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(TokenBoundary &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(Ignore &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(WeakHolder &ope) override { packrat &= check(seen, ope.weak_.lock()); }
  void visit(Holder &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(Reference &ope) override {
    // Already expanded once: just reuse the flag stored on the rule, which
    // also stops recursion on cyclic grammars.
    if (seen->find(ope.name_) != seen->end()) {
      if (ope.rule_)
        packrat &= ope.rule_->enable_memoize;
      return;
    }
    seen->insert(ope.name_);
    for (const auto &op : ope.args_)
      packrat &= check(seen, op);
    if (ope.rule_) {
      if (auto op = ope.get_core_operator())
        packrat &= check(seen, op);
      packrat &= ope.rule_->enable_memoize;
      if (!packrat)
        ope.rule_->enable_memoize = false;
    }
  }
  void visit(Whitespace &ope) override { packrat &= check(seen, ope.ope_); }
  void visit(PrecedenceClimbing &ope) override { packrat &= check(seen, ope.atom_); }
  void visit(Recovery &ope) override { packrat &= check(seen, ope.ope_); }
};

/// Singleton that owns the meta-grammar `g` (the grammar of PEG grammar files)
/// and turns grammar text into a `Grammar` via `perform_core`.
class ParserGenerator {
public:
  static ParserGenerator &get_instance() {
    static ParserGenerator instance;
    return instance;
  }

private:
  ParserGenerator() {
    make_grammar();
    setup_actions();
  }

  /// A `{ ... }` instruction attached to a definition; `type` is one of
  /// "precedence", "message" or "no_ast_opt".
  struct Instruction {
    std::string type;
    std::any data;
  };

  /// State accumulated by the semantic actions while one grammar is parsed.
  struct Data {
    std::shared_ptr<Grammar> grammar;
    std::string start;               // name of the first definition seen
    const char *start_pos = nullptr; // source position of that definition
    std::vector<std::pair<std::string, const char *>> duplicates;
    std::map<std::string, Instruction> instructions;
    std::set<std::string_view> captures; // capture names in the current definition
    bool enablePackratParsing = true;
    std::string preamble; // raw `%preamble { ... }` text, braces included

    Data() : grammar(std::make_shared<Grammar>()) {}
  };

  /// Populates `g` with the rules of the PEG meta-grammar.
  void make_grammar() {
    // Setup PEG syntax parser
    g["Grammar"] <= seq(g["Spacing"], oom(g["Definition"]), g["EndOfFile"]);
    g["Definition"] <=
        cho(seq(g["Ignore"], g["IdentCont"], g["Parameters"], g["LEFTARROW"],
                g["TopExpression"], opt(g["Instruction"])),
            seq(g["Ignore"], g["Identifier"], g["LEFTARROW"], g["TopExpression"],
                opt(g["Instruction"])),
            seq(g["Ignore"], lit("%preamble"), g["Spacing"], g["CppInstr"]));
    g["TopExpression"] <=
        seq(opt(g["SLASH"]), g["TopChoice"], zom(seq(g["SLASH"], g["TopChoice"])));
    g["TopChoice"] <= seq(g["Sequence"], opt(g["CppInstr"]));
    g["Expression"] <= seq(g["Sequence"], zom(seq(g["SLASH"], g["Sequence"])));
    g["Sequence"] <= zom(cho(g["CUT"], g["Prefix"]));
    g["Prefix"] <= seq(opt(cho(g["AND"], g["NOT"])), g["SuffixWithLabel"]);
    g["SuffixWithLabel"] <=
        seq(g["Suffix"], opt(seq(g["LABEL"], g["Identifier"])));
    g["Suffix"] <= seq(g["Primary"], opt(g["Loop"]));
    g["Loop"] <= cho(g["QUESTION"], g["STAR"], g["PLUS"], g["Repetition"]);
    g["Primary"] <=
        cho(seq(g["Ignore"], g["IdentCont"], g["Arguments"], npd(g["LEFTARROW"])),
            seq(g["Ignore"], g["Identifier"],
                npd(seq(opt(g["Parameters"]), g["LEFTARROW"]))),
            seq(g["OPEN"], g["Expression"], g["CLOSE"]),
            seq(g["BeginTok"], g["Expression"], g["EndTok"]),
            seq(g["BeginCapScope"], g["Expression"], g["EndCapScope"]),
            seq(g["BeginCap"], g["Expression"], g["EndCap"]), g["BackRef"],
            g["LiteralI"], g["Dictionary"], g["Literal"], g["NegatedClass"],
            g["Class"], g["DOT"]);

    g["Identifier"] <=
        seq(g["IdentCont"], opt(tok(lit(string(NO_PACKRAT)))), g["Spacing"]);
    g["IdentCont"] <= seq(g["IdentStart"], zom(g["IdentRest"]));

    const static std::vector<std::pair<char32_t, char32_t>> range = {
        {0x0080, 0xFFFF}};
    g["IdentStart"] <= seq(npd(lit(u8(u8"↑"))), npd(lit(u8(u8"⇑"))),
                           cho(cls("a-zA-Z_%"), cls(range)));

    g["IdentRest"] <= cho(g["IdentStart"], cls("0-9"));

    g["Dictionary"] <= seq(g["LiteralD"], oom(seq(g["PIPE"], g["LiteralD"])));

    auto lit_ope = cho(
        seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), cls("'"),
            g["Spacing"]),
        seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), cls("\""),
            g["Spacing"]));
    g["Literal"] <= lit_ope;
    g["LiteralD"] <= lit_ope;

    g["LiteralI"] <=
        cho(seq(cls("'"), tok(zom(seq(npd(cls("'")), g["Char"]))), lit("'i"),
                g["Spacing"]),
            seq(cls("\""), tok(zom(seq(npd(cls("\"")), g["Char"]))), lit("\"i"),
                g["Spacing"]));

    // NOTE: The original Brian Ford's paper uses 'zom' instead of 'oom'.
    g["Class"] <= seq(chr('['), npd(chr('^')),
                      tok(oom(seq(npd(chr(']')), g["Range"]))), chr(']'),
                      g["Spacing"]);
    g["NegatedClass"] <= seq(lit("[^"), tok(oom(seq(npd(chr(']')), g["Range"]))),
                             chr(']'), g["Spacing"]);

    g["Range"] <= cho(seq(g["Char"], chr('-'), g["Char"]), g["Char"]);
    g["Char"] <=
        cho(seq(chr('\\'), cls("nrt'\"[]\\^")),
            seq(chr('\\'), cls("0-3"), cls("0-7"), cls("0-7")),
            seq(chr('\\'), cls("0-7"), opt(cls("0-7"))),
            seq(lit("\\x"), cls("0-9a-fA-F"), opt(cls("0-9a-fA-F"))),
            seq(lit("\\u"),
                cho(seq(cho(seq(chr('0'), cls("0-9a-fA-F")), lit("10")),
                        rep(cls("0-9a-fA-F"), 4, 4)),
                    rep(cls("0-9a-fA-F"), 4, 5))),
            seq(npd(chr('\\')), dot()));

    g["Repetition"] <=
        seq(g["BeginBlacket"], g["RepetitionRange"], g["EndBlacket"]);
    g["RepetitionRange"] <= cho(seq(g["Number"], g["COMMA"], g["Number"]),
                                seq(g["Number"], g["COMMA"]), g["Number"],
                                seq(g["COMMA"], g["Number"]));
    g["Number"] <= seq(oom(cls("0-9")), g["Spacing"]);

    g["LEFTARROW"] <= seq(cho(lit("<-"), lit(u8(u8"←"))), g["Spacing"]);
    ~g["SLASH"] <= seq(chr('/'), g["Spacing"]);
    ~g["PIPE"] <= seq(chr('|'), g["Spacing"]);
    g["AND"] <= seq(chr('&'), g["Spacing"]);
    g["NOT"] <= seq(chr('!'), g["Spacing"]);
    g["QUESTION"] <= seq(chr('?'), g["Spacing"]);
    g["STAR"] <= seq(chr('*'), g["Spacing"]);
    g["PLUS"] <= seq(chr('+'), g["Spacing"]);
    ~g["OPEN"] <= seq(chr('('), g["Spacing"]);
    ~g["CLOSE"] <= seq(chr(')'), g["Spacing"]);
    g["DOT"] <= seq(chr('.'), g["Spacing"]);

    g["CUT"] <= seq(chr('^'), g["Spacing"]);    // Change from ↑ to ^
    ~g["LABEL"] <= seq(chr('@'), g["Spacing"]); // Change from ⇑ to @

    ~g["Spacing"] <= zom(cho(g["Space"], g["Comment"]));
    g["Comment"] <=
        seq(chr('#'), zom(seq(npd(g["EndOfLine"]), dot())), g["EndOfLine"]);
    g["Space"] <= cho(chr(' '), chr('\t'), g["EndOfLine"]);
    g["EndOfLine"] <= cho(lit("\r\n"), chr('\n'), chr('\r'));
    g["EndOfFile"] <= npd(dot());

    ~g["BeginTok"] <= seq(chr('<'), g["Spacing"]);
    ~g["EndTok"] <= seq(chr('>'), g["Spacing"]);

    ~g["BeginCapScope"] <= seq(chr('$'), chr('('), g["Spacing"]);
    ~g["EndCapScope"] <= seq(chr(')'), g["Spacing"]);

    g["BeginCap"] <= seq(chr('$'), tok(g["IdentCont"]), chr('<'), g["Spacing"]);
    ~g["EndCap"] <= seq(chr('>'), g["Spacing"]);

    g["BackRef"] <= seq(chr('$'), tok(g["IdentCont"]), g["Spacing"]);

    g["IGNORE"] <= chr('~');

    g["Ignore"] <= opt(g["IGNORE"]);
    g["Parameters"] <= seq(g["OPEN"], g["Identifier"],
                           zom(seq(g["COMMA"], g["Identifier"])), g["CLOSE"]);
    g["Arguments"] <= seq(g["OPEN"], g["Expression"],
                          zom(seq(g["COMMA"], g["Expression"])), g["CLOSE"]);
    ~g["COMMA"] <= seq(chr(','), g["Spacing"]);

    // Instruction grammars
    g["Instruction"] <=
        seq(g["BeginBlacket"],
            cho(cho(g["PrecedenceClimbing"]), cho(g["ErrorMessage"]),
                cho(g["NoAstOpt"])),
            g["EndBlacket"]);

    ~g["SpacesZom"] <= zom(g["Space"]);
    ~g["SpacesOom"] <= oom(g["Space"]);
    ~g["BeginBlacket"] <= seq(chr('{'), g["Spacing"]);
    ~g["EndBlacket"] <= seq(chr('}'), g["Spacing"]);

    // PrecedenceClimbing instruction
    g["PrecedenceClimbing"] <=
        seq(lit("precedence"), g["SpacesOom"], g["PrecedenceInfo"],
            zom(seq(g["SpacesOom"], g["PrecedenceInfo"])), g["SpacesZom"]);
    g["PrecedenceInfo"] <=
        seq(g["PrecedenceAssoc"],
            oom(seq(ign(g["SpacesOom"]), g["PrecedenceOpe"])));
    g["PrecedenceOpe"] <=
        cho(seq(cls("'"),
                tok(zom(seq(npd(cho(g["Space"], cls("'"))), g["Char"]))),
                cls("'")),
            seq(cls("\""),
                tok(zom(seq(npd(cho(g["Space"], cls("\""))), g["Char"]))),
                cls("\"")),
            tok(oom(seq(npd(cho(g["PrecedenceAssoc"], g["Space"], chr('}'))),
                        dot()))));
    g["PrecedenceAssoc"] <= cls("LR");

    // Error message instruction (change "message" to "!")
    g["ErrorMessage"] <=
        seq(lit("!"), g["SpacesOom"], g["LiteralD"], g["SpacesZom"]);

    // No Ast node optimization instruction
    g["NoAstOpt"] <= seq(lit("no_ast_opt"), g["SpacesZom"]);

    // Inline C++ action code: a brace-balanced `{ ... }` block.
    g["CppInstr"] <= seq(g["CppCode"], g["Spacing"]);
    g["CppCode"] <= seq(chr('{'), zom(g["CppChar"]), chr('}'));
    g["CppChar"] <= cho(g["CppCode"], seq(npd(chr('{')), npd(chr('}')), dot()));

    // Set definition names
    for (auto &x : g) {
      x.second.name = x.first;
    }
  }

  /// Attaches the semantic actions that build operator trees and fill `Data`
  /// while the meta-grammar parses a grammar file.
  void setup_actions() {
    g["Definition"] = [&](const SemanticValues &vs, std::any &dt) {
      auto &data = *std::any_cast<Data *>(dt);

      // Third alternative: `%preamble { ... }`.
      if (vs.choice() == 2) {
        data.preamble = std::any_cast<std::string>(vs[1]);
        return;
      }

      auto is_macro = vs.choice() == 0;
      auto ignore = std::any_cast<bool>(vs[0]);
      auto name = std::any_cast<std::string>(vs[1]);

      // A trailing ":NO_PACKRAT" on the name disables memoization for the rule.
      auto enable_memoize = true;
      if (name.size() > NO_PACKRAT.size() &&
          name.substr(name.size() - NO_PACKRAT.size()) == NO_PACKRAT) {
        enable_memoize = false;
        name = name.substr(0, name.size() - NO_PACKRAT.size());
      }

      std::vector<std::string> params;
      std::shared_ptr<Ope> ope;
      if (is_macro) {
        params = std::any_cast<std::vector<std::string>>(vs[2]);
        ope = std::any_cast<std::shared_ptr<Ope>>(vs[4]);
        if (vs.size() == 6) {
          data.instructions[name] = std::any_cast<Instruction>(vs[5]);
        }
      } else {
        ope = std::any_cast<std::shared_ptr<Ope>>(vs[3]);
        if (vs.size() == 5) {
          data.instructions[name] = std::any_cast<Instruction>(vs[4]);
        }
      }

      auto &grammar = *data.grammar;
      if (!grammar.count(name)) {
        auto &rule = grammar[name];
        rule <= ope;
        rule.name = name;
        rule.s_ = vs.sv().data();
        rule.ignoreSemanticValue = ignore;
        rule.is_macro = is_macro;
        rule.params = params;
        rule.enable_memoize = enable_memoize;

        // The first definition in the file becomes the start rule.
        if (data.start.empty()) {
          data.start = name;
          data.start_pos = vs.sv().data();
        }
      } else {
        data.duplicates.emplace_back(name, vs.sv().data());
      }
    };

    // Capture names are tracked per definition.
    g["Definition"].enter = [](const char * /*s*/, size_t /*n*/, std::any &dt) {
      auto &data = *std::any_cast<Data *>(dt);
      data.captures.clear();
    };

    // Shared by "Expression" and "TopExpression": a single alternative is
    // passed through, several are wrapped in a PrioritizedChoice.
    auto exprFn = [&](const SemanticValues &vs) {
      if (vs.size() == 1) {
        return std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      } else {
        std::vector<std::shared_ptr<Ope>> opes;
        for (auto i = 0u; i < vs.size(); i++) {
          opes.emplace_back(std::any_cast<std::shared_ptr<Ope>>(vs[i]));
        }
        const std::shared_ptr<Ope> ope =
            std::make_shared<PrioritizedChoice>(opes);
        return ope;
      }
    };
    g["Expression"] = exprFn;
    g["TopExpression"] = exprFn;

    // Stores the optional trailing C++ block on the alternative's operator.
    g["TopChoice"] = [&](const SemanticValues &vs) {
      if (vs.size() > 1) {
        auto op = std::any_cast<std::shared_ptr<Ope>>(vs[0]);
        op->code = std::any_cast<std::string>(vs[1]);
      }
      return vs[0];
    };

    g["Sequence"] = [&](const SemanticValues &vs) {
      if (vs.empty()) {
        return npd(lit(""));
      } else if (vs.size() == 1) {
        return std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      } else {
        std::vector<std::shared_ptr<Ope>> opes;
        for (const auto &x : vs) {
          opes.emplace_back(std::any_cast<std::shared_ptr<Ope>>(x));
        }
        const std::shared_ptr<Ope> ope = std::make_shared<Sequence>(opes);
        return ope;
      }
    };

    g["Prefix"] = [&](const SemanticValues &vs) {
      std::shared_ptr<Ope> ope;
      if (vs.size() == 1) {
        ope = std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      } else {
        assert(vs.size() == 2);
        auto tok = std::any_cast<char>(vs[0]);
        ope = std::any_cast<std::shared_ptr<Ope>>(vs[1]);
        if (tok == '&') {
          ope = apd(ope);
        } else { // '!'
          ope = npd(ope);
        }
      }
      return ope;
    };

    g["SuffixWithLabel"] = [&](const SemanticValues &vs, std::any &dt) {
      auto ope = std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      if (vs.size() == 1) {
        return ope;
      } else {
        assert(vs.size() == 2);
        auto &data = *std::any_cast<Data *>(dt);
        const auto &ident = std::any_cast<std::string>(vs[1]);
        auto label = ref(*data.grammar, ident, vs.sv().data(), false, {});
        auto recovery = rec(ref(*data.grammar, RECOVER_DEFINITION_NAME,
                                vs.sv().data(), true, {label}));
        return cho4label_(ope, recovery);
      }
    };

    // Value produced by the "Loop" rule and consumed by "Suffix".
    struct Loop {
      enum class Type { opt = 0, zom, oom, rep };
      Type type;
      std::pair<size_t, size_t> range;
    };

    g["Suffix"] = [&](const SemanticValues &vs) {
      auto ope = std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      if (vs.size() == 1) {
        return ope;
      } else {
        assert(vs.size() == 2);
        auto loop = std::any_cast<Loop>(vs[1]);
        switch (loop.type) {
        case Loop::Type::opt:
          return opt(ope);
        case Loop::Type::zom:
          return zom(ope);
        case Loop::Type::oom:
          return oom(ope);
        default: // Regex-like repetition
          return rep(ope, loop.range.first, loop.range.second);
        }
      }
    };

    g["Loop"] = [&](const SemanticValues &vs) {
      switch (vs.choice()) {
      case 0: // Option
        return Loop{Loop::Type::opt, std::pair<size_t, size_t>()};
      case 1: // Zero or More
        return Loop{Loop::Type::zom, std::pair<size_t, size_t>()};
      case 2: // One or More
        return Loop{Loop::Type::oom, std::pair<size_t, size_t>()};
      default: // Regex-like repetition
        return Loop{Loop::Type::rep,
                    std::any_cast<std::pair<size_t, size_t>>(vs[0])};
      }
    };

    g["RepetitionRange"] = [&](const SemanticValues &vs) {
      switch (vs.choice()) {
      case 0: { // Number COMMA Number
        auto min = std::any_cast<size_t>(vs[0]);
        auto max = std::any_cast<size_t>(vs[1]);
        return std::pair(min, max);
      }
      case 1: // Number COMMA
        return std::pair(std::any_cast<size_t>(vs[0]),
                         std::numeric_limits<size_t>::max());
      case 2: { // Number
        auto n = std::any_cast<size_t>(vs[0]);
        return std::pair(n, n);
      }
      default: // COMMA Number
        return std::pair(std::numeric_limits<size_t>::min(),
                         std::any_cast<size_t>(vs[0]));
      }
    };
    g["Number"] = [&](const SemanticValues &vs) {
      return vs.token_to_number<size_t>();
    };

    g["Primary"] = [&](const SemanticValues &vs, std::any &dt) {
      auto &data = *std::any_cast<Data *>(dt);

      switch (vs.choice()) {
      case 0:   // Macro Reference
      case 1: { // Reference
        auto is_macro = vs.choice() == 0;
        auto ignore = std::any_cast<bool>(vs[0]);
        const auto &ident = std::any_cast<std::string>(vs[1]);

        std::vector<std::shared_ptr<Ope>> args;
        if (is_macro) {
          args = std::any_cast<std::vector<std::shared_ptr<Ope>>>(vs[2]);
        }

        auto ope = ref(*data.grammar, ident, vs.sv().data(), is_macro, args);
        if (ident == RECOVER_DEFINITION_NAME) {
          ope = rec(ope);
        }

        if (ignore) {
          return ign(ope);
        } else {
          return ope;
        }
      }
      case 2: { // (Expression)
        return std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      }
      case 3: { // TokenBoundary
        return tok(std::any_cast<std::shared_ptr<Ope>>(vs[0]));
      }
      case 4: { // CaptureScope
        return csc(std::any_cast<std::shared_ptr<Ope>>(vs[0]));
      }
      case 5: { // Capture
        const auto &name = std::any_cast<std::string_view>(vs[0]);
        auto ope = std::any_cast<std::shared_ptr<Ope>>(vs[1]);

        data.captures.insert(name);

        return cap(ope, [name](const char *a_s, size_t a_n, Context &c) {
          auto &cs = c.capture_scope_stack[c.capture_scope_stack_size - 1];
          cs[name] = std::string(a_s, a_n);
        });
      }
      default: {
        return std::any_cast<std::shared_ptr<Ope>>(vs[0]);
      }
      }
    };

    // Identifier text, with the ":NO_PACKRAT" token re-appended when present so
    // that the "Definition" action can detect it.
    g["Identifier"] = [](const SemanticValues &vs) {
      string s = std::any_cast<std::string>(vs[0]);
      if (!vs.tokens.empty())
        s += vs.token_to_string();
      return s;
    };

    g["IdentCont"] = [](const SemanticValues &vs) {
      return std::string(vs.sv().data(), vs.sv().length());
    };

    g["Dictionary"] = [](const SemanticValues &vs) {
      auto items = vs.transform<std::string>();
      return dic(items);
    };

    g["Literal"] = [](const SemanticValues &vs) {
      const auto &tok = vs.tokens.front();
      return lit(resolve_escape_sequence(tok.data(), tok.size()));
    };
    g["LiteralI"] = [](const SemanticValues &vs) {
      const auto &tok = vs.tokens.front();
      return liti(resolve_escape_sequence(tok.data(), tok.size()));
    };
    g["LiteralD"] = [](const SemanticValues &vs) {
      auto &tok = vs.tokens.front();
      return resolve_escape_sequence(tok.data(), tok.size());
    };

    g["Class"] = [](const SemanticValues &vs) {
      auto ranges = vs.transform<std::pair<char32_t, char32_t>>();
      return cls(ranges);
    };
    g["NegatedClass"] = [](const SemanticValues &vs) {
      auto ranges = vs.transform<std::pair<char32_t, char32_t>>();
      return ncls(ranges);
    };
    g["Range"] = [](const SemanticValues &vs) {
      switch (vs.choice()) {
      case 0: {
        auto s1 = std::any_cast<std::string>(vs[0]);
        auto s2 = std::any_cast<std::string>(vs[1]);
        auto cp1 = decode_codepoint(s1.data(), s1.length());
        auto cp2 = decode_codepoint(s2.data(), s2.length());
        return std::pair(cp1, cp2);
      }
      case 1: {
        auto s = std::any_cast<std::string>(vs[0]);
        auto cp = decode_codepoint(s.data(), s.length());
        return std::pair(cp, cp);
      }
      }
      return std::pair<char32_t, char32_t>(0, 0);
    };
    g["Char"] = [](const SemanticValues &vs) {
      return resolve_escape_sequence(vs.sv().data(), vs.sv().length());
    };

    // Single-character operator tokens yield their first matched character.
    g["AND"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
    g["NOT"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
    g["QUESTION"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
    g["STAR"] = [](const SemanticValues &vs) { return *vs.sv().data(); };
    g["PLUS"] = [](const SemanticValues &vs) { return *vs.sv().data(); };

    g["DOT"] = [](const SemanticValues & /*vs*/) { return dot(); };

    g["CUT"] = [](const SemanticValues & /*vs*/) { return cut(); };

    g["BeginCap"] = [](const SemanticValues &vs) { return vs.token(); };

    // A back reference to a name not captured in the current definition turns
    // packrat parsing off for the whole grammar.
    g["BackRef"] = [&](const SemanticValues &vs, std::any &dt) {
      auto &data = *std::any_cast<Data *>(dt);
      if (data.captures.find(vs.token()) == data.captures.end()) {
        data.enablePackratParsing = false;
      }
      return bkr(vs.token_to_string());
    };

    g["Ignore"] = [](const SemanticValues &vs) { return vs.size() > 0; };

    g["Parameters"] = [](const SemanticValues &vs) {
      return vs.transform<std::string>();
    };

    g["Arguments"] = [](const SemanticValues &vs) {
      return vs.transform<std::shared_ptr<Ope>>();
    };

    // Each PrecedenceInfo group is one precedence level, numbered from 1 in
    // source order; its first token carries the associativity letter.
    g["PrecedenceClimbing"] = [](const SemanticValues &vs) {
      PrecedenceClimbing::BinOpeInfo binOpeInfo;
      size_t level = 1;
      for (auto v : vs) {
        auto tokens = std::any_cast<std::vector<std::string_view>>(v);
        auto assoc = tokens[0][0];
        for (size_t i = 1; i < tokens.size(); i++) {
          binOpeInfo[tokens[i]] = std::pair(level, assoc);
        }
        level++;
      }
      Instruction instruction;
      instruction.type = "precedence";
      instruction.data = binOpeInfo;
      return instruction;
    };
    g["PrecedenceInfo"] = [](const SemanticValues &vs) {
      return vs.transform<std::string_view>();
    };
    g["PrecedenceOpe"] = [](const SemanticValues &vs) { return vs.token(); };
    g["PrecedenceAssoc"] = [](const SemanticValues &vs) { return vs.token(); };

    g["ErrorMessage"] = [](const SemanticValues &vs) {
      Instruction instruction;
      instruction.type = "message";
      instruction.data = std::any_cast<std::string>(vs[0]);
      return instruction;
    };

    // The whole matched text, braces included.
    g["CppCode"] = [](const SemanticValues &vs) { return std::string(vs.sv()); };

    g["NoAstOpt"] = [](const SemanticValues & /*vs*/) {
      Instruction instruction;
      instruction.type = "no_ast_opt";
      return instruction;
    };
  }

  /// Rewrites `rule` into a precedence-climbing operator.
  ///
  /// The rule must have the shape `Atom (BinOp Atom)*` where all three parts
  /// are references, both atoms name the same rule and the operator names a
  /// different one. Otherwise (including when any cast below throws) an error
  /// is logged and false is returned.
  bool apply_precedence_instruction(Definition &rule,
                                    const PrecedenceClimbing::BinOpeInfo &info,
                                    const char *s, Log log) {
    try {
      auto &seq = dynamic_cast<Sequence &>(*rule.get_core_operator());
      auto atom = seq.opes_[0];
      auto &rep = dynamic_cast<Repetition &>(*seq.opes_[1]);
      auto &seq1 = dynamic_cast<Sequence &>(*rep.ope_);
      auto binop = seq1.opes_[0];
      auto atom1 = seq1.opes_[1];

      auto atom_name = dynamic_cast<Reference &>(*atom).name_;
      auto binop_name = dynamic_cast<Reference &>(*binop).name_;
      auto atom1_name = dynamic_cast<Reference &>(*atom1).name_;

      if (!rep.is_zom() || atom_name != atom1_name || atom_name == binop_name) {
        if (log) {
          auto line = line_info(s, rule.s_);
          log(line.first, line.second,
              "'precedence' instruction cannot be applied to '" + rule.name +
                  "'.");
        }
        return false;
      }

      rule.holder_->ope_ = pre(atom, binop, info, rule);
      rule.disable_action = true;
    } catch (...) {
      // A failed dynamic_cast means the rule does not have the required shape.
      if (log) {
        auto line = line_info(s, rule.s_);
        log(line.first, line.second,
            "'precedence' instruction cannot be applied to '" + rule.name +
                "'.");
      }
      return false;
    }
    return true;
  }

public:
  /// Parses grammar text and returns the validated, linked grammar.
  ///
  /// @param s,n     grammar text and its length
  /// @param rules   user-supplied operators keyed by rule name; a leading '~'
  ///                in the key marks the rule's semantic value as ignored
  /// @param start   [out] name of the start rule
  /// @param enablePackratParsing [out] false if a back reference forbids it
  /// @param preamble [out] raw `%preamble` block, braces included
  /// @param log     optional error sink (line, column, message)
  /// @return the grammar, or nullptr after logging if any check fails
  std::shared_ptr<Grammar> perform_core(const char *s, size_t n,
                                        const Rules &rules, std::string &start,
                                        bool &enablePackratParsing,
                                        std::string &preamble, Log log) {
    Data data;
    auto &grammar = *data.grammar;

    // Built-in macros
    {
      // `%recover`
      {
        auto &rule = grammar[RECOVER_DEFINITION_NAME];
        rule <= ref(grammar, "x", "", false, {});
        rule.name = RECOVER_DEFINITION_NAME;
        rule.s_ = "[native]";
        rule.ignoreSemanticValue = true;
        rule.is_macro = true;
        rule.params = {"x"};
      }
    }

    std::any dt = &data;
    auto r = g["Grammar"].parse(s, n, dt, nullptr, log);

    if (!r.ret) {
      if (log) {
        if (r.error_info.message_pos) {
          auto line = line_info(s, r.error_info.message_pos);
          log(line.first, line.second, r.error_info.message);
        } else {
          auto line = line_info(s, r.error_info.error_pos);
          log(line.first, line.second, "syntax error");
        }
      }
      return nullptr;
    }

    // User provided rules
    for (const auto &[user_name, user_rule] : rules) {
      auto name = user_name;
      auto ignore = false;
      if (!name.empty() && name[0] == '~') {
        ignore = true;
        name.erase(0, 1);
      }
      if (!name.empty()) {
        auto &rule = grammar[name];
        rule <= user_rule;
        rule.name = name;
        rule.ignoreSemanticValue = ignore;
      }
    }

    // Check duplicated definitions
    auto ret = data.duplicates.empty();

    for (const auto &[name, ptr] : data.duplicates) {
      if (log) {
        auto line = line_info(s, ptr);
        log(line.first, line.second, "'" + name + "' is already defined.");
      }
    }

    // Set root definition
    auto &start_rule = grammar[data.start];

    // Check if the start rule has ignore operator
    {
      if (start_rule.ignoreSemanticValue) {
        if (log) {
          auto line = line_info(s, start_rule.s_);
          log(line.first, line.second,
              "Ignore operator cannot be applied to '" + start_rule.name +
                  "'.");
        }
        ret = false;
      }
    }

    if (!ret) {
      return nullptr;
    }

    // Check missing definitions
    auto referenced = std::unordered_set<std::string>{
        WHITESPACE_DEFINITION_NAME, WORD_DEFINITION_NAME,
        RECOVER_DEFINITION_NAME, start_rule.name, "fstring"};

    for (auto &[_, rule] : grammar) {
      ReferenceChecker vis(grammar, rule.params);
      rule.accept(vis);
      referenced.insert(vis.referenced.begin(), vis.referenced.end());
      for (const auto &[name, ptr] : vis.error_s) {
        if (log) {
          auto line = line_info(s, ptr);
          log(line.first, line.second, vis.error_message[name]);
        }
        ret = false;
      }
    }

    // Unreferenced rules are only reported; they do not fail the build.
    for (auto &[name, rule] : grammar) {
      if (!referenced.count(name)) {
        if (log) {
          auto line = line_info(s, rule.s_);
          auto msg = "'" + name + "' is not referenced.";
          log(line.first, line.second, msg);
        }
      }
    }

    if (!ret) {
      return nullptr;
    }

    // Link references
    for (auto &x : grammar) {
      auto &rule = x.second;
      LinkReferences vis(grammar, rule.params);
      rule.accept(vis);
    }

    // Check left recursion
    ret = true;

    for (auto &[name, rule] : grammar) {
      DetectLeftRecursion vis(name);
      rule.accept(vis);
      if (vis.error_s) {
        if (log) {
          auto line = line_info(s, vis.error_s);
          log(line.first, line.second, "'" + name + "' is left recursive.");
        }
        ret = false;
      }
    }

    if (!ret) {
      return nullptr;
    }

    // Check infinite loop
    {
      DetectInfiniteLoop vis(data.start_pos, data.start);
      start_rule.accept(vis);
      if (vis.has_error) {
        if (log) {
          auto line = line_info(s, vis.error_s);
          log(line.first, line.second,
              "infinite loop is detected in '" + vis.error_name + "'.");
        }
        return nullptr;
      }
    }

    // Automatic whitespace skipping
    if (grammar.count(WHITESPACE_DEFINITION_NAME)) {
      for (auto &x : grammar) {
        auto &rule = x.second;
        auto ope = rule.get_core_operator();
        if (IsLiteralToken::check(*ope)) {
          rule <= tok(ope);
        }
      }

      start_rule.whitespaceOpe =
          wsp(grammar[WHITESPACE_DEFINITION_NAME].get_core_operator());
    }

    // Word expression
    if (grammar.count(WORD_DEFINITION_NAME)) {
      start_rule.wordOpe = grammar[WORD_DEFINITION_NAME].get_core_operator();
    }

    // Apply instructions
    for (const auto &[name, instruction] : data.instructions) {
      auto &rule = grammar[name];

      if (instruction.type == "precedence") {
        const auto &info =
            std::any_cast<PrecedenceClimbing::BinOpeInfo>(instruction.data);

        if (!apply_precedence_instruction(rule, info, s, log)) {
          return nullptr;
        }
      } else if (instruction.type == "message") {
        rule.error_message = std::any_cast<std::string>(instruction.data);
      } else if (instruction.type == "no_ast_opt") {
        rule.no_ast_opt = true;
      }
    }

    // Disable packrat on demand
    unordered_set<string> seen;
    for (auto &[name, rule] : grammar) {
      auto packrat = SetUpPackrat::check(&seen, rule.get_core_operator());
      if (!packrat)
        rule.enable_memoize = false;
    }

    // Set root definition
    start = data.start;
    enablePackratParsing = data.enablePackratParsing;
    preamble = data.preamble;

    return data.grammar;
  }

  Grammar g;
};

/// Visitor that renders an operator tree as the C++ expression that rebuilds
/// it, e.g. `seq(lit("a"), ref(P, "B"))`.
class PrintVisitor : public Ope::Visitor {
  // v[0] is the constructor function name, v[1..] its already-rendered
  // arguments. Left empty by operator kinds this visitor does not override.
  vector<string> v;

public:
  /// Renders `op`; returns "-" when the operator kind produced nothing.
  static string parse(const shared_ptr<Ope> &op) {
    PrintVisitor v;
    op->accept(v);
    if (v.v.size()) {
      if (v.v[0].empty())
        return fmt::format("P[\"{}\"]", v.v[1]);
      else
        return fmt::format("{}({})", v.v[0], join(v.v, ", ", 1));
    }
    return "-";
  };

private:
  void visit(Sequence &s) override {
    v = {"seq"};
    for (auto &o : s.opes_)
      v.push_back(parse(o));
  }
  void visit(PrioritizedChoice &s) override {
    v = {"cho"};
    for (auto &o : s.opes_)
      v.push_back(parse(o));
  }
  void visit(Repetition &s) override {
    // Use the short helper names where the bounds match one.
    if (s.is_zom())
      v = {"zom", parse(s.ope_)};
    else if (s.min_ == 1 && s.max_ == std::numeric_limits<size_t>::max())
      v = {"oom", parse(s.ope_)};
    else if (s.min_ == 0 && s.max_ == 1)
      v = {"opt", parse(s.ope_)};
    else
      v = {"rep", parse(s.ope_), to_string(s.min_), to_string(s.max_)};
  }
  void visit(AndPredicate &s) override { v = {"apd", parse(s.ope_)}; }
  void visit(NotPredicate &s) override { v = {"npd", parse(s.ope_)}; }
  void visit(LiteralString &s) override {
    v = {s.ignore_case_ ? "liti" : "lit", fmt::format("\"{}\"", escape(s.lit_))};
  }
  void visit(CharacterClass &s) override {
    vector<string> sv;
    for (auto &c : s.ranges_)
      sv.push_back(fmt::format("{{0x{:x}, 0x{:x}}}", (int)c.first, (int)c.second));
    v = {s.negated_ ? "ncls" : "cls", "vc{" + join(sv, ",") + "}"};
  }
  void visit(Character &s) override { v = {"chr", fmt::format("'{}'", s.ch_)}; }
  void visit(AnyCharacter &s) override { v = {"dot"}; }
  void visit(Cut &s) override { v = {"cut"}; }
  void visit(Reference &s) override {
    if (s.is_macro_) {
      vector<string> vs;
      for (auto &o : s.args_)
        vs.push_back(parse(o));
      v = {"ref",    "P",
           fmt::format("\"{}\"", s.name_),
           "\"\"",   "true",
           "{" + join(vs, ", ") + "}"};
    } else {
      v = {"ref", "P", fmt::format("\"{}\"", s.name_)};
    }
  }
  void visit(TokenBoundary &s) override { v = {"tok", parse(s.ope_)}; }
  void visit(Ignore &s) override { v = {"ign", parse(s.ope_)}; }
  void visit(Recovery &s) override { v = {"rec", parse(s.ope_)}; }
  // infix TODO
};

} // namespace peg

/// Entry point.
///
/// Arguments: argv[1] grammar file to read, argv[2] C++ file to write,
/// argv[3] name used in the generated `init_<name>_rules` /
/// `init_<name>_actions` functions. When the name is "codon", a placeholder
/// user rule "NLP" is registered and left out of the generated rules.
///
/// @return 0 on success, 1 on bad usage, I/O failure or an invalid grammar
int main(int argc, char **argv) {
  // All three arguments are dereferenced below, so require them up front.
  if (argc < 4) {
    cerr << "Usage: " << (argc > 0 ? argv[0] : "peg2cpp")
         << " <grammar-file> <output-file> <name>\n";
    return 1;
  }

  // NOTE(review): unused here; kept from the original in case construction
  // matters — confirm and drop if it does not.
  peg::parser parser;
  fmt::print("Generating grammar from {}\n", argv[1]);

  ifstream ifs(argv[1]);
  if (!ifs) {
    cerr << "error: cannot open grammar file '" << argv[1] << "'\n";
    return 1;
  }
  string g((istreambuf_iterator<char>(ifs)), istreambuf_iterator<char>());
  ifs.close();

  string start;
  peg::Rules dummy = {};
  if (string(argv[3]) == "codon")
    dummy["NLP"] = peg::usr([](const char *, size_t, peg::SemanticValues &,
                               any &) -> size_t { return -1; });
  bool enablePackratParsing;
  string preamble;
  peg::Log log = [](size_t line, size_t col, const string &msg) {
    cerr << line << ":" << col << ": " << msg << "\n";
  };
  auto grammar = peg::ParserGenerator::get_instance().perform_core(
      g.c_str(), g.size(), dummy, start, enablePackratParsing, preamble, log);
  // perform_core has already logged the reason; an assert alone would vanish
  // in NDEBUG builds and let the null pointer be dereferenced below.
  if (!grammar) {
    cerr << "error: failed to build grammar from '" << argv[1] << "'\n";
    return 1;
  }

  string rules, actions;
  // NOTE(review): the template arguments inside the two generated-code strings
  // marked below were lost in the recovered copy; `ParseContext &` and
  // `pair<char32_t, char32_t>` are reconstructions — confirm against upstream.
  string action_preamble = " auto &CTX = any_cast<ParseContext &>(DT);\n";
  string loc_preamble = " auto LI = VS.line_info();\n"
                        " auto LOC = seq::SrcInfo(\n"
                        " VS.path, LI.first + CTX.line_offset,\n"
                        " LI.second + CTX.col_offset,\n"
                        " VS.sv().size());\n";

  for (auto &[name, def] : *grammar) {
    auto op = def.get_core_operator();
    // User-provided rules are supplied at runtime, not generated.
    if (dummy.find(name) != dummy.end())
      continue;

    rules += fmt::format(" {}P[\"{}\"] <= {};\n",
                         def.ignoreSemanticValue ? "~" : "", name,
                         peg::PrintVisitor::parse(op));
    rules += fmt::format(" P[\"{}\"].name = \"{}\";\n", name, escape(name));
    if (def.is_macro)
      rules += fmt::format(" P[\"{}\"].is_macro = true;\n", name);
    if (!def.enable_memoize)
      rules += fmt::format(" P[\"{}\"].enable_memoize = false;\n", name);
    if (!def.params.empty()) {
      vector<string> params;
      for (auto &p : def.params)
        params.push_back(fmt::format("\"{}\"", escape(p)));
      rules += fmt::format(" P[\"{}\"].params = {{{}}};\n", name,
                           join(params, ", "));
    }

    string code = op->code;
    if (code.empty()) {
      // No action on the rule itself: if it is a choice, dispatch to the
      // per-alternative actions, passing V0 through for alternatives with none.
      bool all_empty = true;
      if (auto ope = dynamic_cast<peg::PrioritizedChoice *>(op.get())) {
        for (size_t i = 0; i < ope->opes_.size(); i++)
          if (!ope->opes_[i]->code.empty()) {
            code += fmt::format(" if (VS.choice() == {}) {}\n", i,
                                ope->opes_[i]->code);
            all_empty = false;
          } else {
            code += fmt::format(" if (VS.choice() == {}) return V0;\n", i);
          }
      }
      if (all_empty)
        code = "";
      if (!code.empty())
        code = "{\n" + code + "}";
    }
    if (!code.empty()) {
      // Drop the outer braces, then prepend the helpers the body mentions.
      code = code.substr(1, code.size() - 2);
      if (code.find("LOC") != std::string::npos)
        code = loc_preamble + code;
      if (code.find("CTX") != std::string::npos)
        code = action_preamble + code;
      actions += fmt::format(
          "P[\"{}\"] = [](peg::SemanticValues &VS, any &DT) {{\n{}\n}};\n", name,
          code);
    }
  };

  // Owns the output handle so it is closed on every exit path.
  unique_ptr<FILE, decltype(&fclose)> out(fopen(argv[2], "w"), &fclose);
  if (!out) {
    cerr << "error: cannot open output file '" << argv[2] << "'\n";
    return 1;
  }
  FILE *fout = out.get();

  fmt::print(fout, "// clang-format off\n");
  fmt::print(fout, "#pragma clang diagnostic push\n");
  fmt::print(fout, "#pragma clang diagnostic ignored \"-Wreturn-type\"\n");
  if (!preamble.empty())
    fmt::print(fout, "{}\n", preamble.substr(1, preamble.size() - 2));
  string rules_preamble = " using namespace peg;\n"
                          " using peg::seq;\n"
                          " using vc = vector<pair<char32_t, char32_t>>;\n";
  fmt::print(fout, "void init_{}_rules(peg::Grammar &P) {{\n{}\n{}\n}}\n",
             argv[3], rules_preamble, rules);

  // Indent every line of the collected actions inside the function body.
  string act;
  for (auto &c : actions)
    if (c == '\n')
      act += "\n ";
    else
      act += c;
  fmt::print(fout, "void init_{}_actions(peg::Grammar &P) {{\n {}\n}}\n",
             argv[3], act);
  fmt::print(fout, "// clang-format on\n");
  fmt::print(fout, "#pragma clang diagnostic pop\n");

  return 0;
}