Update 06_list_entities_in_vert_file.py (bb43570b) · Commits · nlp / ahisto-modules / Named Entity Experiments

scripts/06_list_entities_in_vert_file.py

+31 −21

Original line number	Diff line number	Diff line
		@@ -136,6 +136,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
		current_book = None
		current_page = None
		in_entity = False
		entity_words = []
		entity_word_types = []
		entity_languages = Counter()

		with input_file.open('rb') as rf:

		@@ -159,11 +162,17 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
		match = re.fullmatch(r'<entity type="(?P<type>.)" norm="(?P<norm>.)" display="(?P<display>.*)">', line)
		assert match, f'Failed to parse entity: {line}'
		in_entity = True
		if match.group('display') != 'true':
		continue
		words = match.group('norm').split()
		word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}'))
		for word, current_word_type in zip(words, word_types):
		assert len(entity_words) == 0
		assert len(entity_word_types) == 0
		if match.group('display') == 'true':
		entity_words = match.group('norm').split()
		entity_word_types = chain([f'B-{match.group("type")}'], repeat(f'I-{match.group("type")}'))
		entity_word_types = list(entity_word_types)
		elif line == '</entity>':
		in_entity = False
		if len(entity_words) > 0:
		(language, _), = entity_languages.most_common(1)
		for word, current_word_type in zip_equal(entity_words, entity_word_types):
		if current_word_type == 'B-PER':
		word = BPerson(word, language)
		elif current_word_type == 'I-PER':
		@@ -175,8 +184,9 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
		else:
		raise ValueError(f'Unknown word type: {current_word_type}')
		current_paragraph.append(word)
		elif line == '</entity>':
		in_entity = False
		entity_words.clear()
		entity_word_types.clear()
		entity_languages.clear()
		elif line.startswith('<doc '):
		match = re.match(r'<doc[^>]* id="(\d+)"', line)
		assert match is not None
		@@ -192,11 +202,11 @@ def read_vert_file(input_file: Path, num_input_lines: Optional[int] = None) -> I
		current_paragraph = []
		continue

		if in_entity:
		continue # We skip the entity text from the VERT file, because we already know it from <entity norm="...">

		# Handle the current word
		word, *_, language = line.split('\t')
		if in_entity:
		entity_languages[language] += 1
		else:
		word = Other(word)
		current_paragraph.append(word)