def get_sentences(text: str) -> Iterable[Sentence]:
    """Yield the sentences found in ``text``, one string per sentence.

    A sentence is recognised purely by its surroundings:

    * it must be preceded by the end of another sentence (a lowercase
      word followed by a period and at least one non-letter),
    * it starts with an uppercase letter, optionally followed by
      lowercase letters, and then a non-letter,
    * it ends with a lowercase word followed by a period, and
    * it must be followed by at least one non-letter and a capitalised
      word.

    As a consequence, a sentence with no sentence end before it (such as
    the very first one in ``text``) or no capitalised word after it (such
    as the very last one) is not yielded.

    Runs of newlines inside a yielded sentence are replaced by a single
    space.
    """
    # End of the previous sentence: non-letter, lowercase word, period,
    # then one or more non-letters.
    prefix_regex = r'\P{L}\p{Ll}+\.\P{L}+'
    # Start and body of the sentence, matched lazily so that the sentence
    # stops at the first position where the suffix fits.  The opening
    # parenthesis here is closed at the end of ``suffix_regex``.
    match_regex = r'((?P<sentence_head>\p{Lu}\p{Ll}*\P{L}.*?)'
    # The end of the sentence and the start of the next one are only
    # looked ahead at, not consumed.  The next search therefore resumes
    # right before this sentence's final word, which can then serve as the
    # prefix of the following sentence; consecutive sentences are found
    # even though ``finditer`` returns non-overlapping matches.
    suffix_regex = r'(?=(?P<sentence_tail>\P{L}\p{Ll}+\.)\P{L}+\p{Lu}\p{Ll}*(?:\P{L}|$)))'
    sentence_regex = f'{prefix_regex}{match_regex}{suffix_regex}'

    for match in regex.finditer(sentence_regex, text):
        # The tail lives inside the lookahead, so it is not part of the
        # consumed match and has to be glued back onto the head.
        sentence_head = match.group('sentence_head')
        sentence_tail = match.group('sentence_tail')
        sentence = f'{sentence_head}{sentence_tail}'
        sentence = regex.sub(r'\n+', ' ', sentence)
        yield sentence