Verified Commit 740d5989 authored by Vladimír Štill

CFL: Rework word generation

parent de4bc741
@@ -278,9 +278,7 @@ class CFG:
for src, prod in self.productions():
for new_prod in drop(prod):
if new_prod:
if src not in new_rules:
new_rules[src] = set()
new_rules[src].add(new_prod)
new_rules.setdefault(src, set()).add(new_prod)
if self.init in erasable:
new_init = Nonterminal("S")
while new_init in self.nonterminals:
@@ -308,9 +306,7 @@ class CFG:
for esrc in simple_to[src]:
for prod in self.rules.get(esrc, []):
if len(prod) != 1 or prod[0] in self.terminals:
if src not in new_rules:
new_rules[src] = set()
new_rules[src].add(prod)
new_rules.setdefault(src, set()).add(prod)
return CFG(self.nonterminals, self.terminals, new_rules, self.init)
def proper(self) -> CFG:
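The change in the two hunks above (and again in CachedCYK further down) replaces the explicit membership check with dict.setdefault, which behaves the same; a minimal standalone sketch using plain strings and tuples in place of the CFG types:

# Both variants ensure the key maps to a set before adding to it;
# setdefault does the lookup, the default insertion and the return of
# the stored set in a single expression.
rules = {}
if "S" not in rules:
    rules["S"] = set()
rules["S"].add(("a",))

compact = {}
compact.setdefault("S", set()).add(("a",))

assert rules == compact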
@@ -444,42 +440,6 @@ class CFG:
def all_terminal(sentence: CFG.Sentence) -> bool:
return all_of(lambda x: isinstance(x, Terminal), sentence)
def _generate(self, max_length: int) -> Iterable[CFG.Word]:
"""
Yields all words of the grammar up to {max_length} in length. If the
grammar is in CNF (Chomsky normal form), shorter words come first.
No words are repeated.
Requires the grammar to be in epsilon normal form; otherwise it might
fail to generate some words.
"""
seen = set()
queue: Deque[CFG.Sentence] = deque([(self.init,)])
# Walk in BFS order so that the sentences are explored from shorter
# to longer for CNF.
# As we yield a word immediately on finding it among the sentences
# (i.e., when we find a sentence with no nonterminals), we also yield
# words from shorter to longer for CNF grammars: a word of length N
# needs exactly N + (N - 1) derivation steps in CNF, and therefore
# shorter words precede longer ones in BFS order.
while queue:
sentence = queue.popleft()
if len(sentence) > max_length or sentence in seen:
continue
seen.add(sentence)
if CFG.all_terminal(sentence):
yield typing.cast(CFG.Word, sentence)
else:
for i in range(len(sentence)):
if isinstance(sentence[i], Nonterminal):
for p in self.rules.get(
typing.cast(Nonterminal, sentence[i]), []):
new_sentence = sentence[:i] + p + sentence[i + 1:]
queue.append(new_sentence)
break # it suffices to perform left derivations
def _recursive_nonterminals(self) -> Set[Nonterminal]:
rewritable_to: Dict[Nonterminal, Set[Nonterminal]] \
= {n: set() for n in self.nonterminals}
@@ -507,13 +467,19 @@ class CFG:
@staticmethod
def is_equivalent_test(left_ : CFG, right_ : CFG,
full_cmp_len: Optional[int] = None,
full_cmp_cnt: Optional[int] = None,
max_cmp_len: Optional[int] = None,
random_samples: int = 1000
) -> IsEquivalentResult:
left = left_.cnf()
right = right_.cnf()
left_gen = WordGenerator(left)
right_gen = WordGenerator(right)
left_rnd = CFGRandom(left)
right_rnd = CFGRandom(right)
left_ce: Optional[CFG.Word] = None
right_ce: Optional[CFG.Word] = None
@@ -545,12 +511,9 @@ class CFG:
return None
return word
if full_cmp_len is None:
alphabet_size = max(len(left.terminals), len(right.terminals))
max_full_compare = pow(2, 16)
full_cmp_len = math.floor(math.log(max_full_compare,
alphabet_size))
print(f"full_cmp_len = {full_cmp_len}")
if full_cmp_cnt is None:
full_cmp_cnt = pow(2, 12)
print(f"full_cmp_cnt = {full_cmp_cnt}")
if max_cmp_len is None:
max_cmp_len = min(max(pow(2, len(left.nonterminals) + 1),
@@ -558,39 +521,58 @@ class CFG:
25)
print(f"max_cmp_len = {max_cmp_len}")
if full_cmp_len > 0:
lenmap: Dict[int, Set[CFG.Word]] \
= {n: set() for n in range(full_cmp_len + 1)}
left_words = deepcopy(lenmap)
right_words = lenmap
last_checked_len = 0
if full_cmp_cnt > 0:
Lenmap = Dict[int, Set[CFG.Word]]
left_words: Lenmap = dict()
right_words: Lenmap = dict()
last_min_size = 0
for lword, rword in zip_fill(left._generate(full_cmp_len),
right._generate(full_cmp_len),
sentinel=True):
def total(words: Dict[int, Set[CFG.Word]]) -> int:
return sum(len(x) for k, x in words.items()
if k <= last_min_size)
def nxderivations(words: Dict[int, Set[CFG.Word]], rng: CFGRandom,
new_size: int) -> int:
return total(words) + \
sum(rng.derivations_count(l) for l in
range(last_min_size + 1, new_size + 1))
while True:
lword = left_gen.next()
rword = right_gen.next()
if lword is not None:
left_words[len(lword)].add(lword)
left_words.setdefault(len(lword), set()).add(lword)
if rword is not None:
right_words[len(rword)].add(rword)
right_words.setdefault(len(rword), set()).add(rword)
min_size = min((len(w) for w in [lword, rword]
if w is not None),
default=last_min_size + 1)
if last_min_size < min_size:
for sz in range(last_min_size, min_size):
fill_ces(left_words[sz], right_words[sz])
fill_ces(left_words.get(sz, set()),
right_words.get(sz, set()))
last_checked_len = sz
nxl = nxderivations(left_words, left_rnd, min_size)
nxr = nxderivations(right_words, right_rnd, min_size)
last_min_size = min_size
print(f"Full comparison for {last_min_size} done…")
print(f"Full comparison up to {last_checked_len} done…")
print(f"{min_size} {nxl}, {nxr}")
if max(nxl, nxr) > full_cmp_cnt:
print(f"Not going to generate {nxl}, {nxr} words")
break
if left_ce is not None and right_ce is not None:
return mkres()
left_rnd = CFGRandom(left)
left_cyk = CachedCYK(left)
right_rnd = CFGRandom(right)
right_cyk = CachedCYK(right)
for length in range(full_cmp_len + 1, max_cmp_len + 1):
for length in range(last_checked_len + 1, max_cmp_len + 1):
for _ in range(random_samples):
left_ce = try_word(left_ce, left_rnd, right_cyk, length)
right_ce = try_word(right_ce, right_rnd, left_cyk, length)
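The break above stops the exhaustive comparison once the estimated number of derivations for the next length range exceeds full_cmp_cnt. A rough standalone restatement of that estimate follows; the function name and the derivations_at callable are illustrative, only CFGRandom.derivations_count from the hunk above is assumed to exist in the real code.

from typing import Callable, Dict, Set

def estimated_enumeration_cost(words: Dict[int, Set[tuple]],
                               last_min_size: int,
                               new_size: int,
                               derivations_at: Callable[[int], int]) -> int:
    # Words of length <= last_min_size are already enumerated; for the
    # lengths still to be processed only the number of derivations is
    # known, which serves as an upper bound on the remaining work.
    # derivations_at stands in for CFGRandom.derivations_count here.
    done = sum(len(ws) for length, ws in words.items()
               if length <= last_min_size)
    todo = sum(derivations_at(length)
               for length in range(last_min_size + 1, new_size + 1))
    return done + todo

# Two words enumerated up to length 2 and about 2**n derivations per
# length: extending the full comparison to length 4 costs roughly
# 2 + 2**3 + 2**4 derivations.
assert estimated_enumeration_cost({1: {("a",)}, 2: {("a", "a")}},
                                  2, 4, lambda n: 2 ** n) == 2 + 8 + 16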
@@ -717,10 +699,7 @@ class CachedCYK:
for src, dst in self.cfg.productions():
if len(dst) <= 1:
dst = typing.cast(CFG.Word, dst)
if dst not in self.cache:
self.cache[dst] = {src}
else:
self.cache[dst].add(src)
self.cache.setdefault(dst, set()).add(src)
def generates(self, word: Union[str, Iterable[Terminal]]) -> bool:
if isinstance(word, str):
@@ -751,3 +730,48 @@ class CachedCYK:
self.cache[word] = out
return out
class WordGenerator:
"""
Generates all words of the grammar in order of non-decreasing length
(the grammar is converted to CNF first). No words are repeated.
"""
def __init__(self, cfg: CFG):
self.cfg = cfg.cnf()
self.seen: Set[CFG.Sentence] = set()
self.queue: Deque[CFG.Sentence] = deque([(self.cfg.init,)])
self.last: Optional[CFG.Word] = None
def get(self) -> Optional[CFG.Word]:
if self.last is None:
return self.next()
return self.last
def next(self) -> Optional[CFG.Word]:
# Walk in BFS order so that the sentences are explored from shorter
# to longer for CNF.
# As we yield a word immediately on finding it among the sentences
# (i.e., when we find a sentence with no nonterminals), we also yield
# words from shorter to longer for CNF grammars: a word of length N
# needs exactly N + (N - 1) derivation steps in CNF, and therefore
# shorter words precede longer ones in BFS order.
while self.queue:
sentence = self.queue.popleft()
if sentence in self.seen:
continue
self.seen.add(sentence)
if CFG.all_terminal(sentence):
self.last = typing.cast(CFG.Word, sentence)
return self.last
else:
for i in range(len(sentence)):
if isinstance(sentence[i], Nonterminal):
for p in self.cfg.rules.get(
typing.cast(Nonterminal, sentence[i]), []):
new_sentence = sentence[:i] + p + sentence[i + 1:]
self.queue.append(new_sentence)
break # it suffices to perform left derivations
return None
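A possible way to drive the class above (the helper is illustrative, not part of the commit): next() returns one new word per call and None once the language is exhausted, and because __init__ converts the grammar to CNF, words arrive in non-decreasing length order, so a length-bounded enumeration can stop at the first word that is too long.

def words_up_to(gen: "WordGenerator", max_length: int) -> list:
    # Keep pulling words while the generator produces them and they stay
    # within the bound; relies on the non-decreasing length order noted
    # in the comment inside next().
    out = []
    while True:
        word = gen.next()
        if word is None or len(word) > max_length:
            break
        out.append(word)
    return out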