// Copyright (C)2009 Laurence Tratt http://tratt.net/laurie/ // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. import Builtins, CEI, CPK::Earley::DSL, CPK::Traverser, Exceptions, Strings, Sys // // This file implements an ultra-simple PEG parser generator. It can handle direct left-recursion // correctly (including when direct right-recursion is also present in a rule). // // But it is fairly easy to understand. It uses fairly standard PEG syntax (as per Ford's original // papers) apart from character classes are written ["..."] instead of [...] i.e. the characters // in the character class must be enclosed in quotes (this is due to a limitation in the Converge // tokenizer). And it produces parse trees so one can see what happened. // // // This parser is structured into several sub-rules to make it more convenient for processing during // the translation stage. // parse := $<>: peg ::= defn ( "NEWLINE" defn )* defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* seq_expr ::= ( expr )* expr ::= match | ref | predicate | operator | "(" seq_expr ")" match ::= "STRING" | "[" "STRING" "]" | "." ref ::= "ID" predicate ::= "&" expr | "!" expr operator ::= expr "*" | expr "?" | expr "+" func peg(dsl_block, src_infos): pn := parse(dsl_block, src_infos) nullables := Nullables.new().nullables(pn) Nullable_Calls.new().nullable_calls(pn, nullables) lrrs := Left_Recursive_Rules.new().lrrs(pn, nullables) Right_Recursive_Rules.new().rrrs(pn, nullables, lrrs) defns, defn_names := Peg_Translator.new().translate(pn) growing_init := [] for defn_name := defn_names.iter(): growing_init.append(CEI::idict_elem(CEI::istring(defn_name), CEI::idict([]))) it := [| class: $c{defns} func init(self): self._growing := ${CEI::idict(growing_init)} self._limit := Set{} func parse_with_tree(self, s): return self.$c{defns[0].name.name}(s, 0) |] Sys::println(CEI::elided_pp_itree(it, self_module.mod_id)) return it class Non_Term: func init(self, name, children): self.name := name self.children := children func to_str(self): return self.pp(0) func pp(self, indent): pp := [self.name] for c := self.children.iter(): pp.append("\n" + " " * (indent + 1) + "-> ") pp.append(c.pp(indent + 1)) return Strings::join(pp, "") class Term: func init(self, type, val): self.type := type self.val := val func to_str(self): return self.pp(0) func pp(self, indent): return "<" + self.type + " " + self.val + ">" class Nullables: func nullables(self, node): self._nullables := Set{} while 1: prev_len := self._nullables.len() self._preorder(node) if self._nullables.len() == prev_len: break return self._nullables func _preorder(self, node): return self.get_slot("_t_" + node.name)(node) func _t_peg(self, pn): // peg ::= defn ( "NEWLINE" defn )* i := 0 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_defn(self, pn): // defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* i := 3 while i < pn.len(): if self._preorder(pn[i]) == 1: self._nullables.add(pn[0].value) break i += 2 func _t_seq_expr(self, pn): // seq_expr ::= ( expr )+ for expr := pn.iter(): if self._preorder(expr) == 0: return 0 return 1 func _t_expr(self, pn): // expr ::= match // | ref // | predicate // | operator // | "(" seq_expr ")" // | choice if pn.len() == 1: return self._preorder(pn[0]) else: // pn ::= "(" seq_expr ")" return self._preorder(pn[1]) func _t_match(self, pn): // match ::= "STRING" // | "[" "STRING" "]" // | "." return 0 func _t_ref(self, pn): // ref ::= "ID" if self._nullables.find(pn[0].value): return 1 else: return 0 func _t_predicate(self, pn): // predicate ::= "&" expr // | "!" expr return 1 func _t_operator(self, pn): // operator ::= expr "*" // | expr "?" // | expr "+" ndif pn[1].type == "*": return 1 elif pn[1].type == "?": return 1 elif pn[1].type == "+": return 0 class Nullable_Calls: func nullable_calls(self, node, nullables): self._nullables := nullables self._preorder(node) func _preorder(self, node): return self.get_slot("_t_" + node.name)(node) func _t_peg(self, pn): // peg ::= defn ( "NEWLINE" defn )* i := 0 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_defn(self, pn): // defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* i := 3 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_seq_expr(self, pn): // seq_expr ::= ( expr )+ for expr := pn.riter(): if self._preorder(expr) == 0: return 0 return 1 func _t_expr(self, pn): // expr ::= match // | ref // | predicate // | operator // | "(" seq_expr ")" // | choice if pn.len() == 1: return self._preorder(pn[0]) else: // pn ::= "(" seq_expr ")" return self._preorder(pn[1]) func _t_match(self, pn): // match ::= "STRING" // | "[" "STRING" "]" // | "." return 0 func _t_ref(self, pn): // ref ::= "ID" pn.rr := 1 if self._nullables.find(pn[0].value): return 1 else: return 0 func _t_predicate(self, pn): // predicate ::= "&" expr // | "!" expr return 1 func _t_operator(self, pn): // operator ::= expr "*" // | expr "?" // | expr "+" ndif pn[1].type == "*": self._preorder(pn[0]) return 1 elif pn[1].type == "?": self._preorder(pn[0]) return 1 elif pn[1].type == "+": self._preorder(pn[0]) return 0 class Left_Recursive_Rules: func lrrs(self, node, nullables): self._nullables := nullables self._lrr := Set{} self._preorder(node) return self._lrr func _preorder(self, node): return self.get_slot("_t_" + node.name)(node) func _t_peg(self, pn): // peg ::= defn ( "NEWLINE" defn )* i := 0 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_defn(self, pn): // defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* self._rule := pn[0].value i := 3 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_seq_expr(self, pn): // seq_expr ::= ( expr )+ for expr := pn.iter(): if self._preorder(expr) == 0: return 0 return 1 func _t_expr(self, pn): // expr ::= match // | ref // | predicate // | operator // | "(" seq_expr ")" // | choice if pn.len() == 1: return self._preorder(pn[0]) else: // pn ::= "(" seq_expr ")" return self._preorder(pn[1]) func _t_match(self, pn): // match ::= "STRING" // | "[" "STRING" "]" // | "." return 0 func _t_ref(self, pn): // ref ::= "ID" if self._rule == pn[0].value: self._lrr.add(self._rule) if self._nullables.find(pn[0].value): return 1 else: return 0 func _t_predicate(self, pn): // predicate ::= "&" expr // | "!" expr return 1 func _t_operator(self, pn): // operator ::= expr "*" // | expr "?" // | expr "+" ndif pn[1].type == "*": self._preorder(pn[0]) return 1 elif pn[1].type == "?": self._preorder(pn[0]) return 1 elif pn[1].type == "+": self._preorder(pn[0]) return 0 class Right_Recursive_Rules: func rrrs(self, node, nullables, lrrs): self._nullables := nullables self._lrrs := lrrs self._preorder(node) func _preorder(self, node): return self.get_slot("_t_" + node.name)(node) func _t_peg(self, pn): // peg ::= defn ( "NEWLINE" defn )* i := 0 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_defn(self, pn): // defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* self._rule := pn[0].value i := 3 while i < pn.len(): self._preorder(pn[i]) i += 2 func _t_seq_expr(self, pn): // seq_expr ::= ( expr )+ self._potential := 0 for expr := pn.riter(): if self._preorder(expr) == 0: return 0 return 1 func _t_expr(self, pn): // expr ::= match // | ref // | predicate // | operator // | "(" seq_expr ")" // | choice if pn.len() == 1: return self._preorder(pn[0]) else: // pn ::= "(" seq_expr ")" return self._preorder(pn[1]) func _t_match(self, pn): // match ::= "STRING" // | "[" "STRING" "]" // | "." return 0 func _t_ref(self, pn): // ref ::= "ID" if self._potential == 1 & self._rule == pn[0].value & self._lrrs.find(pn[0].value): CEI::error("Left-recursive rules can not contain potentially right-recursive calls", \ pn.src_infos) if self._nullables.find(pn[0].value): return 1 else: return 0 func _t_predicate(self, pn): // predicate ::= "&" expr // | "!" expr return 1 func _t_operator(self, pn): // operator ::= expr "*" // | expr "?" // | expr "+" ndif pn[1].type == "*": self._potential := 1 self._preorder(pn[0]) return 1 elif pn[1].type == "?": self._potential := 1 self._preorder(pn[0]) return 1 elif pn[1].type == "+": self._preorder(pn[0]) return 0 // // The basic way we translate PEGs is as follows. All matching occurs on the variables &s (the string // being matched) and &p (our current position in it). We maintain a list of the current parse tree // in the variable whose name comes from col_nm (note that col_nm is a *pointer* to a variable name, // not the variable itself - this is to allow for PEG nesting). If an expression is successful, it // advances &p and updates the value pointed to by col_nm. If it is unsuccessful, it sets &p to null. // Each subsequent part of the PEG checks to see if &p is null; if it is, it realises that an earlier // part of the match failed, and that it should fail itself. // // What this means is that we manually encode backtracking in a "flat" manner. It's not entirely // pretty, but it is efficient. // class Peg_Translator: func translate(self, node): self._defn_names := Set{} return [self._preorder(node), self._defn_names] func _preorder(self, node): return self.get_slot("_t_" + node.name)(node) func _col_preorder(self, node, col): return self.get_slot("_t_" + node.name)(node, col) func _t_peg(self, pn): // peg ::= defn ( "NEWLINE" defn )* defns := [] i := 0 while i < pn.len(): defns.append(self._preorder(pn[i])) i += 2 return defns func _t_defn(self, pn): // defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* self._defn_name := pn[0].value self._defn_names.add(self._defn_name) col_nm := CEI::fresh_name() suc_var := CEI::ivar(CEI::fresh_name()) exprs := [] i := 3 while i < pn.len(): exprs.append([| old_p := &p if $c{suc_var} == 0: $c{self._col_preorder(pn[i], col_nm)} if &p is null: &p := old_p else: $c{suc_var} := 1 |]) i += 2 return [| bound_func $c{CEI::ivar("_p_" + pn[0].value)}(&self, &s, &p): &orig_p := &p $c{CEI::ivar(col_nm)} := [] $c{suc_var} := 0 $c{exprs.flattened()} if $c{suc_var} == 0: return null else: val := Non_Term.new(${CEI::lift(pn[0].value)}, $c{CEI::ivar(col_nm)}) return [&p, val] |] func _t_seq_expr(self, pn, col_nm): // seq_expr ::= ( expr )+ sub_col_nm := CEI::fresh_name() exprs := [] for expr := pn.iter(): exprs.append([| if not &p is null: $c{self._col_preorder(expr, sub_col_nm)} |]) return [| $c{CEI::ivar(sub_col_nm)} := [] $c{exprs} if not &p is null: $c{CEI::ivar(col_nm)}.extend($c{CEI::ivar(sub_col_nm)}) |] func _t_expr(self, pn, col_nm): // expr ::= match // | ref // | predicate // | operator // | "(" seq_expr ")" // | choice if pn.len() == 1: return self._col_preorder(pn[0], col_nm) else: // pn ::= "(" seq_expr ")" return self._col_preorder(pn[1], col_nm) func _t_match(self, pn, col_nm): // match ::= "STRING" // | "[" "STRING" "]" // | "." if pn[0].type == "STRING": return [| if &p + ${CEI::lift(pn[0].value.len())} <= &s.len() & \ &s.prefixed_by(${CEI::lift(pn[0].value)}, &p): val := Term.new(${CEI::lift(pn[0].value)}, ${CEI::lift(pn[0].value)}) $c{CEI::ivar(col_nm)}.append(val) &p += ${CEI::lift(pn[0].value.len())} else: &p := null |] elif pn.len() == 3: // match ::= "[" "STRING" "]" cs := pn[1].value if cs.len() > 0 & cs[0] == "^": neg := 1 i := 1 else: neg := 0 i := 0 alts := [] while i < cs.len(): if i + 2 < cs.len() & cs[i + 1] == "-": alts.append([| &s[&p].int_val() >= ${CEI::lift(cs[i].int_val())} & \ &s[&p].int_val() <= ${CEI::lift(cs[i + 2].int_val())} |]) i += 3 else: alts.append([| &s[&p] == ${CEI::lift(cs[i])} |]) i += 1 if neg == 0: return [| if &p < &s.len() & $c{CEI::ialternation(alts)}: val := Term.new("[" + ${CEI::lift(pn[1].value)} + "]", &s[&p]) $c{CEI::ivar(col_nm)}.append(val) &p += 1 else: &p := null |] else: return [| if &p < &s.len() & not $c{CEI::ialternation(alts)}: val := Term.new("[" + ${CEI::lift(pn[1].value)} + "]", &s[&p]) $c{CEI::ivar(col_nm)}.append(val) &p += 1 else: &p := null |] else: // match ::= "." return [| if &p < &s.len(): val := Term.new(".", &s[&p]) $c{CEI::ivar(col_nm)}.append(val) &p += 1 else: &p := null |] func _t_ref(self, pn, col_nm): // ref ::= "ID" normal := [| rtn := &self.${"_p_" + pn[0].value}(&s, &p) if rtn is null: &p := null else: &p := rtn[0] $c{CEI::ivar(col_nm)}.append(rtn[1]) |] if self._defn_name != pn[0].value: return normal elif not pn.find_slot("rr"): return [| if seed := &self._growing[${CEI::lift(pn[0].value)}].find(&p): if seed is null: &p := null else: $c{CEI::ivar(col_nm)}.append(seed[1]) &p := seed[0] elif &orig_p == &p: &self._growing[${CEI::lift(pn[0].value)}][&p] := null while 1: rtn := &self.${"_p_" + self._defn_name}(&s, &p) if rtn is null | \ (not &self._growing[${CEI::lift(pn[0].value)}][&p] is null & \ rtn[0] <= &self._growing[${CEI::lift(pn[0].value)}][&p][0]): seed := &self._growing[${CEI::lift(pn[0].value)}][&p] &self._growing[${CEI::lift(pn[0].value)}].del(&p) if seed is null: &p := null break else: return seed &self._growing[${CEI::lift(pn[0].value)}][&p] := rtn else: if &self._limit.find(${CEI::lift(pn[0].value)}): &self._limit.del(${CEI::lift(pn[0].value)}) add := 1 else: add := 0 $c{normal} if add == 1: &self._limit.add(${CEI::lift(pn[0].value)}) |] else: return [| if &self._limit.find(${CEI::lift(pn[0].value)}): &p := null else: if &self._growing[${CEI::lift(pn[0].value)}].find(&orig_p): del := 1 &self._limit.add(${CEI::lift(pn[0].value)}) else: del := 0 $c{normal} if del == 1: &self._limit.del(${CEI::lift(pn[0].value)}) |] func _t_predicate(self, pn, col_nm): // predicate ::= "&" expr // | "!" expr ndif pn[0].type == "&": dummy_col_nm := CEI::fresh_name("dummy_col_nm") return [| old_p := &p $c{CEI::ivar(dummy_col_nm)} := [] $c{self._col_preorder(pn[1], dummy_col_nm)} if not &p is null: &p := old_p |] elif pn[0].type == "!": dummy_col_nm := CEI::fresh_name("dummy_col_nm") return [| old_p := &p $c{CEI::ivar(dummy_col_nm)} := [] $c{self._col_preorder(pn[1], dummy_col_nm)} if &p is null: &p := old_p else: &p := null |] func _t_operator(self, pn, col_nm): // operator ::= expr "*" // | expr "?" // | expr "+" ndif pn[1].type == "*": return [| while 1: old_p := &p $c{self._col_preorder(pn[0], col_nm)} if &p is null: &p := old_p break |] elif pn[1].type == "?": return [| old_p := &p $c{self._col_preorder(pn[0], col_nm)} if &p is null: &p := old_p |] elif pn[1].type == "+": return [| j := 0 while 1: old_p := &p $c{self._col_preorder(pn[0], col_nm)} if &p is null: if j > 0: &p := old_p break j += 1 |] calc := $<>: S <- Term Term <- Term "+" Term / Term "-" Term / Fact Fact <- Fact "*" Fact / Fact "/" Fact / Int Int <- ["0-9"]+ calc2 := $<>: S <- Term ! . Term <- Term "+" Fact / Term "-" Fact / Fact Fact <- Fact "*" Int / Fact "/" Int / Assign / List / Int Assign <- ID ":=" Term List <- "[" "]" / "[" Term "]" / "[" Term ( Entry )+ "]" Entry <- "," Term ID <- ["a-zA-Z_"]+ Int <- ["0-9"]+ // If you uncomment the following PEG, you will get a compile-time error as it is both potentially // left and and potentially right recursive. // //s3 := $<>: // expr <- "m"? expr "-" expr "M"? / num // num <- ["0-9"] func tryp(peg, s): rtn := peg.new().parse_with_tree(s) if not rtn is null: Sys::println(" ", rtn[1].pp(1)) else: Sys::println(" ") func main(): tryp(calc, "1-2-3") tryp(calc2, "1-2-3") tryp(calc2, "123+23") tryp(calc2, "123+45*67") tryp(calc2, "123*45+67") tryp(calc2, "x:=[1,2,3]+[1]") tryp(calc2, "x:=[1,2,3]+[z:=[2]]")