Commit a6dae3fa authored by Vít Starý Novotný
Browse files

Fix regex in `Document.get_sentences()`

parent 757d1890
Loading
Loading
Loading
Loading
+7 −4
Original line number Diff line number Diff line
@@ -114,11 +114,14 @@ class Document:
            books[document.book_id].append(document)

        def get_sentences(text: str) -> Iterable[Sentence]:
            """Yield sentences extracted from ``text`` by a Unicode-aware regex.

            A sentence is only reported when it is framed by context on both
            sides: it must be preceded by a lowercase word ending in a period,
            and followed by a word that starts with an uppercase letter.
            Newlines inside a yielded sentence are collapsed to single spaces.

            Args:
                text: The raw text to search.

            Yields:
                Each matched sentence as a single-line string.
            """
            # End of the previous sentence: a non-letter, a lowercase word,
            # a period, and then one or more non-letters.
            prefix_regex = r'\P{L}\p{Ll}+\.\P{L}+'
            # Start of the sentence: an uppercase letter, optional lowercase
            # letters, a non-letter, and then as little text as possible.
            # The leading unnamed group is closed at the end of suffix_regex.
            match_regex = r'((?P<sentence_head>\p{Lu}\p{Ll}*\P{L}.*?)'
            # End of the sentence (lowercase word plus period) followed by the
            # capitalized first word of the next sentence. This part sits in a
            # lookahead, so it is not consumed and the prefix of the next
            # match can start on the tail of this sentence.
            suffix_regex = r'(?=(?P<sentence_tail>\P{L}\p{Ll}+\.)\P{L}+\p{Lu}\p{Ll}*(?:\P{L}|$)))'
            sentence_regex = f'{prefix_regex}{match_regex}{suffix_regex}'
            for match in regex.finditer(sentence_regex, text):
                # The tail is captured inside the lookahead, so the sentence
                # has to be reassembled from its two named groups.
                sentence_head = match.group('sentence_head')
                sentence_tail = match.group('sentence_tail')
                sentence = f'{sentence_head}{sentence_tail}'
                sentence = regex.sub(r'\n+', ' ', sentence)
                yield sentence