#! /usr/bin/env converge // Copyright (C)2009 Laurence Tratt http://tratt.net/laurie/ // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal in the Software without restriction, including without limitation the // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or // sell copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS // IN THE SOFTWARE. import Builtins, CEI, CPK::Earley::DSL, CPK::Traverser, Exceptions, Strings, Sys // // This file implements an ultra-simple PEG parser generator. It can't handle left-recursion (if // present, it will lead to stack overflow and a core dump); it's probably very slow. // // But it is fairly easy to understand. It uses fairly standard PEG syntax (as per Ford's original // papers) apart from: ordered choices can only appear at the top level of rule (not within // expressions); character classes are written ["..."] instead of [...] i.e. the characters // in the character class must be enclosed in quotes (this is due to a limitation in the Converge // tokenizer). And it produces parse trees so one can see what happened. // // // This parser is structured into several sub-rules to make it more convenient for processing during // the translation stage. // parse := $<>: peg ::= defn ( "NEWLINE" defn )* defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* seq_expr ::= ( expr )+ expr ::= match | ref | predicate | operator | "(" seq_expr ")" match ::= "STRING" | "[" "STRING" "]" | "." ref ::= "ID" predicate ::= "&" expr | "!" expr operator ::= expr "*" | expr "?" | expr "+" func peg(dsl_block, src_infos): pn := parse(dsl_block, src_infos) defns := Peg_Translator.new().translate(pn) it := [| class: $c{defns} func parse_with_tree(self, s): return self.$c{defns[0].name.name}(s, 0) |] Sys::println(CEI::elided_pp_itree(it, self_module.mod_id)) return it class Non_Term: func init(self, name, children): self.name := name self.children := children func to_str(self): return self.pp(0) func pp(self, indent): pp := [self.name] for c := self.children.iter(): pp.append("\n" + " " * (indent + 1) + "-> ") pp.append(c.pp(indent + 1)) return Strings::join(pp, "") class Term: func init(self, type, val): self.type := type self.val := val func to_str(self): return self.pp(0) func pp(self, indent): return "<" + self.type + " " + self.val + ">" // // The basic way we translate PEGs is as follows. All matching occurs on the variables &s (the string // being matched) and &i (our current position in it). We maintain a list of the current parse tree // in the variable whose name comes from col_nm (note that col_nm is a *pointer* to a variable name, // not the variable itself - this is to allow for PEG nesting). If an expression is successful, it // advances &i and updates the value pointed to by col_nm. If it is unsuccessful, it sets &i to null. // Each subsequent part of the PEG checks to see if &i is null; if it is, it realises that an earlier // part of the match failed, and that it should fail itself. // // What this means is that we manually encode backtracking in a "flat" manner. It's not entirely // pretty, but it is efficient. // class Peg_Translator: func translate(self, node): return self._preorder(node) func _preorder(self, node): return self.get_slot("_t_" + node.name)(node) func _col_preorder(self, node, col): return self.get_slot("_t_" + node.name)(node, col) func _t_peg(self, pn): // peg ::= defn ( "NEWLINE" defn )* defns := [] i := 0 while i < pn.len(): defns.append(self._preorder(pn[i])) i += 2 return defns func _t_defn(self, pn): // defn ::= "ID" "<" "-" seq_expr ( "/" seq_expr )* col_nm := CEI::fresh_name() suc_var := CEI::ivar(CEI::fresh_name()) exprs := [] i := 3 while i < pn.len(): exprs.append([| old_i := &i if $c{suc_var} == 0: $c{self._col_preorder(pn[i], col_nm)} if &i is null: &i := old_i else: $c{suc_var} := 1 |]) i += 2 return [| bound_func $c{CEI::ivar("_p_" + pn[0].value)}(&self, &s, &i): $c{CEI::ivar(col_nm)} := [] $c{suc_var} := 0 $c{exprs.flattened()} if $c{suc_var} == 0: return null else: val := Non_Term.new(${CEI::lift(pn[0].value)}, $c{CEI::ivar(col_nm)}) return [&i, val] |] func _t_seq_expr(self, pn, col_nm): // seq_expr ::= ( expr )+ sub_col_nm := CEI::fresh_name() exprs := [] for expr := pn.iter(): exprs.append([| if not &i is null: $c{self._col_preorder(expr, sub_col_nm)} |]) return [| $c{CEI::ivar(sub_col_nm)} := [] $c{exprs} if not &i is null: $c{CEI::ivar(col_nm)}.extend($c{CEI::ivar(sub_col_nm)}) |] func _t_expr(self, pn, col_nm): // expr ::= match // | ref // | predicate // | operator // | "(" seq_expr ")" if pn.len() == 1: return self._col_preorder(pn[0], col_nm) else: // pn ::= "(" seq_expr ")" return self._col_preorder(pn[1], col_nm) func _t_match(self, pn, col_nm): // match ::= "STRING" // | "[" "STRING" "]" // | "." if pn[0].type == "STRING": return [| if &i + ${CEI::lift(pn[0].value.len())} <= &s.len() & \ &s.prefixed_by(${CEI::lift(pn[0].value)}, &i): val := Term.new(${CEI::lift(pn[0].value)}, ${CEI::lift(pn[0].value)}) $c{CEI::ivar(col_nm)}.append(val) &i += ${CEI::lift(pn[0].value.len())} else: &i := null |] elif pn.len() == 3: // match ::= "[" "STRING" "]" cs := pn[1].value if cs.len() > 0 & cs[0] == "^": neg := 1 i := 1 else: neg := 0 i := 0 alts := [] while i < cs.len(): if i + 2 < cs.len() & cs[i + 1] == "-": alts.append([| &s[&i].int_val() >= ${CEI::lift(cs[i].int_val())} & \ &s[&i].int_val() <= ${CEI::lift(cs[i + 2].int_val())} |]) i += 3 else: alts.append([| &s[&i] == ${CEI::lift(cs[i])} |]) i += 1 if neg == 0: return [| if &i < &s.len() & $c{CEI::ialternation(alts)}: val := Term.new("[" + ${CEI::lift(pn[1].value)} + "]", &s[&i]) $c{CEI::ivar(col_nm)}.append(val) &i += 1 else: &i := null |] else: return [| if &i < &s.len() & not $c{CEI::ialternation(alts)}: val := Term.new("[" + ${CEI::lift(pn[1].value)} + "]", &s[&i]) $c{CEI::ivar(col_nm)}.append(val) &i += 1 else: &i := null |] else: // match ::= "." return [| if &i < &s.len(): val := Term.new(".", &s[&i]) $c{CEI::ivar(col_nm)}.append(val) &i += 1 else: &i := null |] func _t_ref(self, pn, col_nm): // ref ::= "ID" return [| rtn := &self.${"_p_" + pn[0].value}(&s, &i) if rtn is null: &i := null else: &i := rtn[0] $c{CEI::ivar(col_nm)}.append(rtn[1]) |] func _t_predicate(self, pn, col_nm): // predicate ::= "&" expr // | "!" expr ndif pn[0].type == "&": dummy_col_nm := CEI::fresh_name("dummy_col_nm") return [| old_i := &i $c{CEI::ivar(dummy_col_nm)} := [] $c{self._col_preorder(pn[1], dummy_col_nm)} if not &i is null: &i := old_i |] elif pn[0].type == "!": dummy_col_nm := CEI::fresh_name("dummy_col_nm") return [| old_i := &i $c{CEI::ivar(dummy_col_nm)} := [] $c{self._col_preorder(pn[1], dummy_col_nm)} if &i is null: &i := old_i else: &i := null |] func _t_operator(self, pn, col_nm): // operator ::= expr "*" // | expr "?" // | expr "+" ndif pn[1].type == "*": return [| while 1: old_i := &i $c{self._col_preorder(pn[0], col_nm)} if &i is null: &i := old_i break |] elif pn[1].type == "?": return [| old_i := &i $c{self._col_preorder(pn[0], col_nm)} if &i is null: &i := old_i |] elif pn[1].type == "+": return [| j := 0 while 1: old_i := &i $c{self._col_preorder(pn[0], col_nm)} if &i is null: if j > 0: &i := old_i break j += 1 |] calc := $<>: S <- Additive ! . Additive <- Multitative "+" Additive / Multitative Multitative <- Primary "*" Multitative / Primary Primary <- "(" Additive ")" / Decimal Decimal <- " "* ["0-9"]+ " "* func tryp(s): Sys::println(s) rtn := calc.new().parse_with_tree(s) if not rtn is null: Sys::println(" ", rtn[1].pp(1)) else: Sys::println(" ") func main(): tryp("123*33+44") tryp("123+33+44randomtext") tryp("2 + 3 * 4+(2*3)") tryp("(2)")