`PER-or-LOC` labels are not used during evaluation.
To use the `PER-or-LOC` labels during evaluation, the following changes need to be made:
diff --git a/ahisto_named_entity_search/recognition/model.py b/ahisto_named_entity_search/recognition/model.py
index 06db0c2..0d2776e 100644
--- a/ahisto_named_entity_search/recognition/model.py
+++ b/ahisto_named_entity_search/recognition/model.py
@@ -307,7 +307,7 @@ def load_ner_dataset(tagged_sentence_basename: str) -> Tuple[List[Sentence], Lis
assert suffix in ('PER', 'LOC')
ner_tag = f'{prefix}PER-or-LOC'
ner_tags.append(ner_tag)
- all_ner_tags.append(tagged_sentence.ner_tags)
+ all_ner_tags.append(' '.join(ner_tags))
return ner_texts, all_ner_tags
@@ -337,8 +337,8 @@ def get_label_weights(labels: Iterable[Label], dataset: Iterable[BioNerTags]) ->
label_counts = {label: 0 for label in labels}
for bio_ner_tags in dataset:
for label in bio_ner_tags.split():
- assert label in label_counts
- label_counts[label] += 1
+ if label in label_counts:
+ label_counts[label] += 1
label_weights = {
label: count**-1 if count > 0 else 0
for label, count
Furthermore, we would need to alter `LangModule` in the Adaptor library,
so that it passes `ignore_mismatched_sizes=True` to the
`AutoModelForTokenClassification.from_pretrained()` call in the
`load_head()` method.