Commit 1717b7fa authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files
parent faea9c48
Loading
Loading
Loading
Loading
Loading
+5 −24
Original line number Diff line number Diff line
@@ -12,8 +12,6 @@ from adaptor.objectives.MLM import MaskedLanguageModeling
 from adaptor.lang_module import LangModule
 from adaptor.schedules import SequentialSchedule
 from adaptor.utils import StoppingStrategy, AdaptationArguments
-from more_itertools import zip_equal
-import regex
 from transformers import AutoModelForTokenClassification
 
 from ..config import CONFIG as _CONFIG
@@ -56,32 +54,15 @@ class NerModel:
                                                val_texts_or_path=mlm_validation_texts)
 
         # Set up named entity recognition (NER) training
-        def load_ner_dataset_and_remove_tokens_with_only_punctuation(
-                tagged_sentence_basename: str) -> Tuple[List[Sentence], List[NerTags]]:
+        def load_ner_dataset(tagged_sentence_basename: str) -> Tuple[List[Sentence], List[NerTags]]:
             ner_texts, all_ner_tags = [], []
             for tagged_sentence in TaggedSentence.load(tagged_sentence_basename):
-                ner_token_list, ner_tag_list = [], []
-                for ner_token, ner_tag in zip_equal(tagged_sentence.sentence.split(),
-                                                    tagged_sentence.ner_tags.split()):
-                    if regex.fullmatch(r'\W+', ner_token):
-                        continue
-                    ner_token_list.append(ner_token)
-                    ner_tag_list.append(ner_tag)
-                if not ner_token_list:
-                    continue
-                ner_text = ' '.join(ner_token_list)
-                ner_tags = ' '.join(ner_tag_list)
-                ner_texts.append(ner_text)
-                all_ner_tags.append(ner_tags)
+                ner_texts.append(tagged_sentence.sentence)
+                all_ner_tags.append(tagged_sentence.ner_tags)
             return ner_texts, all_ner_tags
 
-        ner_training_texts, ner_training_labels = \
-            load_ner_dataset_and_remove_tokens_with_only_punctuation(
-                training_tagged_sentence_basename)
-
-        ner_validation_texts, ner_validation_labels = \
-            load_ner_dataset_and_remove_tokens_with_only_punctuation(
-                validation_tagged_sentence_basename)
+        ner_training_texts, ner_training_labels = load_ner_dataset(training_tagged_sentence_basename)
+        ner_validation_texts, ner_validation_labels = load_ner_dataset(validation_tagged_sentence_basename)
 
         ner_evaluators = [MeanFScore(decides_convergence=True)]
         ner_objective = TokenClassification(lang_module,
+1 −1
+2 −0
Original line number Diff line number Diff line
@@ -2,6 +2,8 @@ FROM pytorch/pytorch:latest
 ARG UNAME=testuser
 ARG UID=1000
 ARG GID=1000
+RUN apt -qy update
+RUN apt -qy --no-install-recommends install git
 COPY setup.py requirements.txt MANIFEST.in /ahisto_named_entity_search/
 COPY scripts/ /ahisto_named_entity_search/scripts
 COPY ahisto_named_entity_search /ahisto_named_entity_search/ahisto_named_entity_search
+2 −2
Original line number Diff line number Diff line
@@ -4,14 +4,14 @@
 set -e -o xtrace
 
 HOSTNAME=docker.apollo.fi.muni.cz
-IMAGE_NAME=nlp/named-entity-search:latest
+IMAGE_NAME=ahisto/named-entity-search:latest
 ROOT_PATH=/nlp/projekty/ahisto/public_html/named-entity-search/results/
 ANNOTATION_PATH=/nlp/projekty/ahisto/annotations/
 OCR_EVAL_PATH=/nlp/projekty/ahisto/ahisto-ocr-eval
 
 DOCKER_BUILDKIT=1 docker build --build-arg UID="$(id -u)" --build-arg GID="$(id -g)" --build-arg UNAME="$(id -u -n)" . -f scripts//03_train_ner_models.Dockerfile -t "$IMAGE_NAME"
 
-parallel --halt=soon,fail=100% --jobs=100% --resume-failed \
+parallel --halt=soon,fail=100% --jobs=100% --resume-failed --bar --delay 30 \
          --joblog=scripts/03_train_ner_models.joblog \
          --colsep ' +' \
          -- \