Loading scripts/06_list_entities_in_vert_file.py +31 −21 Original line number Diff line number Diff line Loading @@ -136,6 +136,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I current_book = None current_page = None in_entity = False entity_words = [] entity_word_types = [] entity_languages = Counter() with input_file.open('rb') as rf: Loading @@ -159,11 +162,17 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I match = re.fullmatch(r'<entity type="(?P<type>.*)" norm="(?P<norm>.*)" display="(?P<display>.*)">', line) assert match, f'Failed to parse entity: {line}' in_entity = True if match.group('display') != 'true': continue words = match.group('norm').split() word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}')) for word, current_word_type in zip(words, word_types): assert len(entity_words) == 0 assert len(entity_word_types) == 0 if match.group('display') == 'true': entity_words = match.group('norm').split() entity_word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}')) entity_word_types = list(entity_word_types) elif line == '</entity>': in_entity = False if len(entity_words) > 0: (language, _), = entity_languages.most_common(1) for word, current_word_type in zip_equal(entity_words, entity_word_types): if current_word_type == 'B-PER': word = BPerson(word, language) elif current_word_type == 'I-PER': Loading @@ -175,8 +184,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I else: raise ValueError(f'Unknown word type: {current_word_type}') current_paragraph.append(word) elif line == '</entity>': in_entity = False entity_words.clear() entity_word_types.clear() entity_languages.clear() elif line.startswith('<doc '): match = re.match(r'<doc[^>]* id="(\d+)"', line) assert match is not None Loading @@ -192,11 +202,11 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I current_paragraph = [] continue if in_entity: continue # We skip the entity text from VERT file, because we already know if from <entity norm="..."> # Handle the current word word, *_, language = line.split('\t') if in_entity: entity_languages[language] += 1 else: word = Other(word) current_paragraph.append(word) Loading Loading
scripts/06_list_entities_in_vert_file.py +31 −21 Original line number Diff line number Diff line Loading @@ -136,6 +136,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I current_book = None current_page = None in_entity = False entity_words = [] entity_word_types = [] entity_languages = Counter() with input_file.open('rb') as rf: Loading @@ -159,11 +162,17 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I match = re.fullmatch(r'<entity type="(?P<type>.*)" norm="(?P<norm>.*)" display="(?P<display>.*)">', line) assert match, f'Failed to parse entity: {line}' in_entity = True if match.group('display') != 'true': continue words = match.group('norm').split() word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}')) for word, current_word_type in zip(words, word_types): assert len(entity_words) == 0 assert len(entity_word_types) == 0 if match.group('display') == 'true': entity_words = match.group('norm').split() entity_word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}')) entity_word_types = list(entity_word_types) elif line == '</entity>': in_entity = False if len(entity_words) > 0: (language, _), = entity_languages.most_common(1) for word, current_word_type in zip_equal(entity_words, entity_word_types): if current_word_type == 'B-PER': word = BPerson(word, language) elif current_word_type == 'I-PER': Loading @@ -175,8 +184,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I else: raise ValueError(f'Unknown word type: {current_word_type}') current_paragraph.append(word) elif line == '</entity>': in_entity = False entity_words.clear() entity_word_types.clear() entity_languages.clear() elif line.startswith('<doc '): match = re.match(r'<doc[^>]* id="(\d+)"', line) assert match is not None Loading @@ -192,11 +202,11 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I current_paragraph = [] continue if in_entity: continue # We skip the entity text from VERT file, because we already know if from <entity norm="..."> # Handle the current word word, *_, language = line.split('\t') if in_entity: entity_languages[language] += 1 else: word = Other(word) current_paragraph.append(word) Loading