Commit bb43570b authored by Vít Starý Novotný
Browse files

Update 06_list_entities_in_vert_file.py

parent 75730ad2
Loading
Loading
Loading
Loading
Loading
+31 −21
Original line number Diff line number Diff line
@@ -136,6 +136,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
    current_book = None
    current_page = None
    in_entity = False
    entity_words = []
    entity_word_types = []
    entity_languages = Counter()

    with input_file.open('rb') as rf:

@@ -159,11 +162,17 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
                    match = re.fullmatch(r'<entity type="(?P<type>.*)" norm="(?P<norm>.*)" display="(?P<display>.*)">', line)
                    assert match, f'Failed to parse entity: {line}'
                    in_entity = True
                    if match.group('display') != 'true':
                        continue
                    words = match.group('norm').split()
                    word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}'))
                    for word, current_word_type in zip(words, word_types):
                    assert len(entity_words) == 0
                    assert len(entity_word_types) == 0
                    if match.group('display') == 'true':
                        entity_words = match.group('norm').split()
                        entity_word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}'))
                        entity_word_types = list(entity_word_types)
                elif line == '</entity>':
                    in_entity = False
                    if len(entity_words) > 0:
                        (language, _), = entity_languages.most_common(1)
                        for word, current_word_type in zip_equal(entity_words, entity_word_types):
                            if current_word_type == 'B-PER':
                                word = BPerson(word, language)
                            elif current_word_type == 'I-PER':
@@ -175,8 +184,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
                            else:
                                raise ValueError(f'Unknown word type: {current_word_type}')
                            current_paragraph.append(word)
                elif line == '</entity>':
                    in_entity = False
                    entity_words.clear()
                    entity_word_types.clear()
                    entity_languages.clear()
                elif line.startswith('<doc '):
                    match = re.match(r'<doc[^>]* id="(\d+)"', line)
                    assert match is not None
@@ -192,11 +202,11 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
                    current_paragraph = []
                continue

            if in_entity:
                continue  # We skip the entity text from the VERT file, because we already know it from <entity norm="...">

            # Handle the current word
            word, *_, language = line.split('\t')
            if in_entity:
                entity_languages[language] += 1
            else:
                word = Other(word)
                current_paragraph.append(word)