Commit e1e7942d authored by Vladimír Štill
Browse files

lib: Split-off regular grammar parser

in preparation for ANTLR-based validators, #9
parent 125cfec7
Pipeline #95269 failed with stage
in 19 seconds
...@@ -21,7 +21,7 @@ class RegGrammar: ...@@ -21,7 +21,7 @@ class RegGrammar:
self.terminals = terminals self.terminals = terminals
self.rules = rules self.rules = rules
self.init = init self.init = init
assert self.check() self.check()
@staticmethod @staticmethod
def from_cfg(cfg: CFG) -> RegGrammar: def from_cfg(cfg: CFG) -> RegGrammar:
...@@ -38,23 +38,31 @@ class RegGrammar: ...@@ -38,23 +38,31 @@ class RegGrammar:
return RegGrammar(cfg.nonterminals, cfg.terminals, reg_rules, cfg.init) return RegGrammar(cfg.nonterminals, cfg.terminals, reg_rules, cfg.init)
# exception # exception
def check(self) -> None:
# unused formal requirements check, regular grammars specific (rule variants)
def check(self) -> bool:
assert len(self.nonterminals) > 0, "empty grammar" assert len(self.nonterminals) > 0, "empty grammar"
has_eps_start = False
has_start_loop = False
for nonterminal in self.rules: for nonterminal in self.rules:
assert nonterminal in self.nonterminals, "unknown nonterminal " + nonterminal.name assert nonterminal in self.nonterminals, \
f"unknown nonterminal {nonterminal.name}"
for rule in self.rules[nonterminal]: for rule in self.rules[nonterminal]:
if isinstance(rule, Terminal): if isinstance(rule, Eps):
assert nonterminal == self.init, \
"ε can only appear for initial nonterminal, " \
f"appears for {nonterminal}"
has_eps_start = True
elif isinstance(rule, Terminal):
assert rule in self.terminals, "unknown terminal " + rule.name assert rule in self.terminals, "unknown terminal " + rule.name
elif not isinstance(rule, Eps): else:
assert rule[0] in self.terminals, "unknown terminal " + rule[0].name assert rule[0] in self.terminals, "unknown terminal " + rule[0].name
assert rule[1] in self.nonterminals, "unknown nonterminal " + rule[1].name assert rule[1] in self.nonterminals, "unknown nonterminal " + rule[1].name
has_start_loop |= rule[1] == self.init
assert self.init in self.nonterminals, "init not in nonterminals" assert self.init in self.nonterminals, "init not in nonterminals"
assert not has_eps_start or not has_start_loop, \
return True "since ε is present the start nonterminal must not appear on any " \
"right-hand side of a rule"
def reggrammar_to_nfa(self) -> NFA: def reggrammar_to_nfa(self) -> NFA:
states: Set[State] = set() states: Set[State] = set()
......
...@@ -4,15 +4,15 @@ grammar CFG; ...@@ -4,15 +4,15 @@ grammar CFG;
start: (onerule NEWLINE+)* onerule (NEWLINE+ | ) comment; start: (onerule NEWLINE+)* onerule (NEWLINE+ | ) comment;
onerule: nonterminal ARROW (rewrite DELIMITER)* rewrite; onerule: nonterm ARROW (rewrite DELIMITER)* rewrite;
rewrite: (term_or_nonterm+ | EPSILON); rewrite: (term_or_nonterm+ | EPSILON);
term_or_nonterm: (terminal | nonterminal); term_or_nonterm: (term | nonterm);
terminal: (TERMINAL | QUOTE anyvalue+ QUOTE); term: (TERMINAL | QUOTE anyvalue+ QUOTE);
nonterminal: (CAPS | (LEFT_ANGLE symbol+ RIGHT_ANGLE (APOSTROPHE*)) | (symbol APOSTROPHE+)); nonterm: (CAPS | (LEFT_ANGLE symbol+ RIGHT_ANGLE (APOSTROPHE*)) | (symbol APOSTROPHE+));
symbol: (TERMINAL | CAPS | UNDERSCORE); symbol: (TERMINAL | CAPS | UNDERSCORE);
...@@ -41,3 +41,7 @@ QUOTE : '"'; ...@@ -41,3 +41,7 @@ QUOTE : '"';
WS : [ \r\t]+ -> skip ; WS : [ \r\t]+ -> skip ;
ANYCHAR : .; ANYCHAR : .;
/*
vim: ft=antlr
*/
...@@ -44,21 +44,21 @@ class CFGListener(ParseTreeListener): ...@@ -44,21 +44,21 @@ class CFGListener(ParseTreeListener):
pass pass
# Enter a parse tree produced by CFGParser#terminal. # Enter a parse tree produced by CFGParser#term.
def enterTerminal(self, ctx:CFGParser.TerminalContext): def enterTerm(self, ctx:CFGParser.TermContext):
pass pass
# Exit a parse tree produced by CFGParser#terminal. # Exit a parse tree produced by CFGParser#term.
def exitTerminal(self, ctx:CFGParser.TerminalContext): def exitTerm(self, ctx:CFGParser.TermContext):
pass pass
# Enter a parse tree produced by CFGParser#nonterminal. # Enter a parse tree produced by CFGParser#nonterm.
def enterNonterminal(self, ctx:CFGParser.NonterminalContext): def enterNonterm(self, ctx:CFGParser.NontermContext):
pass pass
# Exit a parse tree produced by CFGParser#nonterminal. # Exit a parse tree produced by CFGParser#nonterm.
def exitNonterminal(self, ctx:CFGParser.NonterminalContext): def exitNonterm(self, ctx:CFGParser.NontermContext):
pass pass
......
...@@ -73,14 +73,14 @@ class CFGParser ( Parser ): ...@@ -73,14 +73,14 @@ class CFGParser ( Parser ):
RULE_onerule = 1 RULE_onerule = 1
RULE_rewrite = 2 RULE_rewrite = 2
RULE_term_or_nonterm = 3 RULE_term_or_nonterm = 3
RULE_terminal = 4 RULE_term = 4
RULE_nonterminal = 5 RULE_nonterm = 5
RULE_symbol = 6 RULE_symbol = 6
RULE_comment = 7 RULE_comment = 7
RULE_anyvalue = 8 RULE_anyvalue = 8
ruleNames = [ "start", "onerule", "rewrite", "term_or_nonterm", "terminal", ruleNames = [ "start", "onerule", "rewrite", "term_or_nonterm", "term",
"nonterminal", "symbol", "comment", "anyvalue" ] "nonterm", "symbol", "comment", "anyvalue" ]
EOF = Token.EOF EOF = Token.EOF
LEFT_ANGLE=1 LEFT_ANGLE=1
...@@ -222,8 +222,8 @@ class CFGParser ( Parser ): ...@@ -222,8 +222,8 @@ class CFGParser ( Parser ):
super().__init__(parent, invokingState) super().__init__(parent, invokingState)
self.parser = parser self.parser = parser
def nonterminal(self): def nonterm(self):
return self.getTypedRuleContext(CFGParser.NonterminalContext,0) return self.getTypedRuleContext(CFGParser.NontermContext,0)
def ARROW(self): def ARROW(self):
...@@ -269,7 +269,7 @@ class CFGParser ( Parser ): ...@@ -269,7 +269,7 @@ class CFGParser ( Parser ):
try: try:
self.enterOuterAlt(localctx, 1) self.enterOuterAlt(localctx, 1)
self.state = 40 self.state = 40
self.nonterminal() self.nonterm()
self.state = 41 self.state = 41
self.match(CFGParser.ARROW) self.match(CFGParser.ARROW)
self.state = 47 self.state = 47
...@@ -380,12 +380,12 @@ class CFGParser ( Parser ): ...@@ -380,12 +380,12 @@ class CFGParser ( Parser ):
super().__init__(parent, invokingState) super().__init__(parent, invokingState)
self.parser = parser self.parser = parser
def terminal(self): def term(self):
return self.getTypedRuleContext(CFGParser.TerminalContext,0) return self.getTypedRuleContext(CFGParser.TermContext,0)
def nonterminal(self): def nonterm(self):
return self.getTypedRuleContext(CFGParser.NonterminalContext,0) return self.getTypedRuleContext(CFGParser.NontermContext,0)
def getRuleIndex(self): def getRuleIndex(self):
...@@ -419,12 +419,12 @@ class CFGParser ( Parser ): ...@@ -419,12 +419,12 @@ class CFGParser ( Parser ):
la_ = self._interp.adaptivePredict(self._input,7,self._ctx) la_ = self._interp.adaptivePredict(self._input,7,self._ctx)
if la_ == 1: if la_ == 1:
self.state = 60 self.state = 60
self.terminal() self.term()
pass pass
elif la_ == 2: elif la_ == 2:
self.state = 61 self.state = 61
self.nonterminal() self.nonterm()
pass pass
...@@ -437,7 +437,7 @@ class CFGParser ( Parser ): ...@@ -437,7 +437,7 @@ class CFGParser ( Parser ):
return localctx return localctx
class TerminalContext(ParserRuleContext): class TermContext(ParserRuleContext):
__slots__ = 'parser' __slots__ = 'parser'
def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
...@@ -461,29 +461,29 @@ class CFGParser ( Parser ): ...@@ -461,29 +461,29 @@ class CFGParser ( Parser ):
def getRuleIndex(self): def getRuleIndex(self):
return CFGParser.RULE_terminal return CFGParser.RULE_term
def enterRule(self, listener:ParseTreeListener): def enterRule(self, listener:ParseTreeListener):
if hasattr( listener, "enterTerminal" ): if hasattr( listener, "enterTerm" ):
listener.enterTerminal(self) listener.enterTerm(self)
def exitRule(self, listener:ParseTreeListener): def exitRule(self, listener:ParseTreeListener):
if hasattr( listener, "exitTerminal" ): if hasattr( listener, "exitTerm" ):
listener.exitTerminal(self) listener.exitTerm(self)
def accept(self, visitor:ParseTreeVisitor): def accept(self, visitor:ParseTreeVisitor):
if hasattr( visitor, "visitTerminal" ): if hasattr( visitor, "visitTerm" ):
return visitor.visitTerminal(self) return visitor.visitTerm(self)
else: else:
return visitor.visitChildren(self) return visitor.visitChildren(self)
def terminal(self): def term(self):
localctx = CFGParser.TerminalContext(self, self._ctx, self.state) localctx = CFGParser.TermContext(self, self._ctx, self.state)
self.enterRule(localctx, 8, self.RULE_terminal) self.enterRule(localctx, 8, self.RULE_term)
self._la = 0 # Token type self._la = 0 # Token type
try: try:
self.enterOuterAlt(localctx, 1) self.enterOuterAlt(localctx, 1)
...@@ -524,7 +524,7 @@ class CFGParser ( Parser ): ...@@ -524,7 +524,7 @@ class CFGParser ( Parser ):
return localctx return localctx
class NonterminalContext(ParserRuleContext): class NontermContext(ParserRuleContext):
__slots__ = 'parser' __slots__ = 'parser'
def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1): def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
...@@ -554,29 +554,29 @@ class CFGParser ( Parser ): ...@@ -554,29 +554,29 @@ class CFGParser ( Parser ):
return self.getToken(CFGParser.APOSTROPHE, i) return self.getToken(CFGParser.APOSTROPHE, i)
def getRuleIndex(self): def getRuleIndex(self):
return CFGParser.RULE_nonterminal return CFGParser.RULE_nonterm
def enterRule(self, listener:ParseTreeListener): def enterRule(self, listener:ParseTreeListener):
if hasattr( listener, "enterNonterminal" ): if hasattr( listener, "enterNonterm" ):
listener.enterNonterminal(self) listener.enterNonterm(self)
def exitRule(self, listener:ParseTreeListener): def exitRule(self, listener:ParseTreeListener):
if hasattr( listener, "exitNonterminal" ): if hasattr( listener, "exitNonterm" ):
listener.exitNonterminal(self) listener.exitNonterm(self)
def accept(self, visitor:ParseTreeVisitor): def accept(self, visitor:ParseTreeVisitor):
if hasattr( visitor, "visitNonterminal" ): if hasattr( visitor, "visitNonterm" ):
return visitor.visitNonterminal(self) return visitor.visitNonterm(self)
else: else:
return visitor.visitChildren(self) return visitor.visitChildren(self)
def nonterminal(self): def nonterm(self):
localctx = CFGParser.NonterminalContext(self, self._ctx, self.state) localctx = CFGParser.NontermContext(self, self._ctx, self.state)
self.enterRule(localctx, 10, self.RULE_nonterminal) self.enterRule(localctx, 10, self.RULE_nonterm)
self._la = 0 # Token type self._la = 0 # Token type
try: try:
self.enterOuterAlt(localctx, 1) self.enterOuterAlt(localctx, 1)
......
// Grammar for regular grammars, built on top of the CFG grammar.
grammar RegG;
// Reuse the rules and tokens of the CFG grammar (term, nonterm, EPSILON, …).
import CFG;
// Right-hand side of a regular-grammar rule: a terminal followed by a
// nonterminal, a lone terminal, or ε.  This definition of ‹rewrite› is meant
// to take the place of the one imported from CFG (ANTLR 4 gives rules of the
// importing grammar precedence over imported rules of the same name).
rewrite: (term nonterm | term | EPSILON);
/*
vim: ft=antlr
*/
...@@ -18,6 +18,9 @@ from lib.parser.RegExVisitor import RegExVisitor ...@@ -18,6 +18,9 @@ from lib.parser.RegExVisitor import RegExVisitor
from lib.parser.CFGLexer import CFGLexer from lib.parser.CFGLexer import CFGLexer
from lib.parser.CFGParser import CFGParser from lib.parser.CFGParser import CFGParser
from lib.parser.CFGListener import CFGListener from lib.parser.CFGListener import CFGListener
from lib.parser.RegGLexer import RegGLexer
from lib.parser.RegGParser import RegGParser
from lib.parser.RegGListener import RegGListener
class ParsingError(Exception): class ParsingError(Exception):
...@@ -162,8 +165,9 @@ def cfg(string: str) -> CFG: ...@@ -162,8 +165,9 @@ def cfg(string: str) -> CFG:
def reggrammar(string: str) -> RegGrammar: def reggrammar(string: str) -> RegGrammar:
try: try:
cfg_ = cfg(string) builder = _common_parse(string, RegGLexer, RegGParser, RegGBuilder)
return RegGrammar.from_cfg(cfg_) return RegGrammar(builder.nonterminals, builder.terminals,
builder.rules, builder.init)
except Exception as e: except Exception as e:
raise ParsingError(e.args) raise ParsingError(e.args)
...@@ -224,7 +228,7 @@ def regex(string: str) -> RegEx: ...@@ -224,7 +228,7 @@ def regex(string: str) -> RegEx:
raise ParsingError(e.args) raise ParsingError(e.args)
class CommonVisitor: class CommonFABuilder(object):
def visitStatename(self, ctx: Any) -> str: def visitStatename(self, ctx: Any) -> str:
if ctx.QUOTE(): if ctx.QUOTE():
...@@ -248,8 +252,8 @@ class CommonVisitor: ...@@ -248,8 +252,8 @@ class CommonVisitor:
# The order of inheritance is important – we need the ‹exit*› functions from # The order of inheritance is important – we need the ‹exit*› functions from
# ‹CommonVisitor› to precede the empty versions from ‹DFAListener› # ‹CommonFABuilder› to precede the empty versions from ‹DFAListener›
class DFABuilder(CommonVisitor, DFAListener): class DFABuilder(CommonFABuilder, DFAListener):
def __init__(self) -> None: def __init__(self) -> None:
self.states = set() self.states = set()
...@@ -276,7 +280,7 @@ class DFABuilder(CommonVisitor, DFAListener): ...@@ -276,7 +280,7 @@ class DFABuilder(CommonVisitor, DFAListener):
self.first_state = state self.first_state = state
class NFABuilder(CommonVisitor, NFAListener): class NFABuilder(CommonFABuilder, NFAListener):
def __init__(self) -> None: def __init__(self) -> None:
self.states = set() self.states = set()
...@@ -392,7 +396,7 @@ class RegExBuilder(RegExVisitor): ...@@ -392,7 +396,7 @@ class RegExBuilder(RegExVisitor):
return None return None
class CFGBuilder(CFGListener): class CommonGrammarBuilder(object):
def __init__(self) -> None: def __init__(self) -> None:
self.terminals = set() self.terminals = set()
self.nonterminals = set() self.nonterminals = set()
...@@ -407,7 +411,7 @@ class CFGBuilder(CFGListener): ...@@ -407,7 +411,7 @@ class CFGBuilder(CFGListener):
else: else:
return '_' return '_'
def visitNonterminal(self, ctx: Any) -> None: def visitNonterm(self, ctx: Any) -> None:
if ctx.CAPS(): if ctx.CAPS():
name = str(ctx.CAPS()) name = str(ctx.CAPS())
elif ctx.LEFT_ANGLE(): elif ctx.LEFT_ANGLE():
...@@ -423,28 +427,18 @@ class CFGBuilder(CFGListener): ...@@ -423,28 +427,18 @@ class CFGBuilder(CFGListener):
self.nonterminals.add(nonterminal) self.nonterminals.add(nonterminal)
return nonterminal return nonterminal
def visitRewrite(self, ctx: Any) -> None: def visitTerm(self, term_ctx: Any) -> None:
sequence = [] if term_ctx.TERMINAL():
for subctx in ctx.term_or_nonterm(): name = str(term_ctx.TERMINAL())
if subctx.terminal(): elif term_ctx.QUOTE():
term_ctx = subctx.terminal() name = _unquote(term_ctx.getText())
if term_ctx.TERMINAL():
name = str(term_ctx.TERMINAL())
elif term_ctx.QUOTE():
name = _unquote(term_ctx.getText())
terminal = Terminal(name)
self.terminals.add(terminal)
sequence.append(terminal)
else:
sequence.append(self.visitNonterminal(
subctx.nonterminal()))
return sequence terminal = Terminal(name)
self.terminals.add(terminal)
return terminal
def exitOnerule(self, ctx: Any) -> None: def exitOnerule(self, ctx: Any) -> None:
nonterminal = self.visitNonterminal(ctx.nonterminal()) nonterminal = self.visitNonterm(ctx.nonterm())
self.nonterminals.add(nonterminal) self.nonterminals.add(nonterminal)
if self.init is None: if self.init is None:
self.init = nonterminal self.init = nonterminal
...@@ -454,12 +448,36 @@ class CFGBuilder(CFGListener): ...@@ -454,12 +448,36 @@ class CFGBuilder(CFGListener):
self.rules[nonterminal] = set() self.rules[nonterminal] = set()
for subctx in ctx.rewrite(): for subctx in ctx.rewrite():
if subctx.EPSILON(): self.rules[nonterminal].add(self.visitRewrite(subctx))
self.rules[nonterminal].add(Eps())
if subctx.term_or_nonterm():
sequence = self.visitRewrite(subctx)
self.rules[nonterminal].add(tuple(sequence))
# for future support of comments # for future support of comments
def exitComment(self, ctx: Any) -> None: def exitComment(self, ctx: Any) -> None:
return None return None
class CFGBuilder(CommonGrammarBuilder, CFGListener):
    """Grammar builder for context-free grammars (CFG parse trees)."""

    def visitRewrite(self, ctx: Any) \
            -> Union[Eps, Tuple[Union[Terminal, Nonterminal], ...]]:
        """Turn one ‹rewrite› alternative into a rule right-hand side.

        An EPSILON alternative yields ‹Eps()›; otherwise the result is a
        tuple with one symbol per ‹term_or_nonterm› child, in source order.
        """
        if ctx.EPSILON():
            return Eps()

        # each child is either a terminal or a nonterminal; dispatch to the
        # matching visitor, which also records the symbol in the builder
        return tuple(
            self.visitTerm(item.term()) if item.term()
            else self.visitNonterm(item.nonterm())
            for item in ctx.term_or_nonterm()
        )
class RegGBuilder(CommonGrammarBuilder, RegGListener):
    """Grammar builder for regular grammars (RegG parse trees)."""

    def visitRewrite(self, ctx: Any) \
            -> Union[Eps, Terminal, Tuple[Terminal, Nonterminal]]:
        """Turn one ‹rewrite› alternative into a rule right-hand side.

        Returns ‹Eps()› for an EPSILON alternative, the result of
        ‹visitTerm› alone when only a terminal is present, and a
        ‹(terminal, nonterminal)› pair when a nonterminal follows it.
        """
        if ctx.EPSILON():
            return Eps()

        # every alternative of this rule other than ε is expected to contain
        # a terminal; fail here rather than inside ‹visitTerm› if it does not
        assert ctx.term() is not None
        term = self.visitTerm(ctx.term())
        if ctx.nonterm():
            return (term, self.visitNonterm(ctx.nonterm()))
        return term
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment