Commit 3d99edcc authored by Vít Novotný
Browse files

Separate NerModel training schedules to `recognition.schedule`

parent 63d085bd
......@@ -97,6 +97,7 @@ log_every_n_steps = 100
evaluate_every_n_steps = 10000
save_every_n_steps = 10000
number_of_training_epochs = 10
# Name of the training schedule; must be one of the identifiers accepted by
# get_schedule(): 'sequential', 'fair-sequential', or 'parallel'.
# (The previous value 'fair-sequential-schedule' matched none of them and
# would make the default schedule lookup raise ValueError.)
schedule = fair-sequential

[recognition.FairSequentialSchedule]
# Cap on the number of epochs one objective may train during a single turn.
maximum_number_of_training_epochs_per_objective = 1
......@@ -2,7 +2,16 @@ from .model import (
NerModel
)
# Re-export the schedule helpers alongside the model so callers can do
# `from ahisto_named_entity_search.recognition import get_schedule`.
from .schedule import (
    get_schedule,
    ScheduleName,
    Schedule,
)

# Public API of the recognition subpackage.
__all__ = [
    'get_schedule',
    'NerModel',
    'ScheduleName',
    'Schedule',
]
......@@ -2,47 +2,26 @@ from __future__ import annotations
from logging import getLogger
from pathlib import Path
from typing import Tuple, List, Iterable
from typing import Tuple, List, Optional
import comet_ml # noqa: F401
from adaptor.adapter import Adapter
from adaptor.objectives.objective_base import Objective
from adaptor.objectives.classification import TokenClassification
from adaptor.evaluators.token_classification import MeanFScore
from adaptor.objectives.MLM import MaskedLanguageModeling
from adaptor.lang_module import LangModule
from adaptor.schedules import SequentialSchedule
from adaptor.utils import StoppingStrategy, AdaptationArguments
from transformers import AutoModelForTokenClassification
from ..config import CONFIG as _CONFIG
from ..document import Document, Sentence
from ..search import TaggedSentence, NerTags
from .schedule import ScheduleName, get_schedule
# Module-level logger, named after the module for hierarchical log filtering.
LOGGER = getLogger(__name__)
class FairSequentialSchedule(SequentialSchedule):
    """Sequential schedule that caps how many epochs each objective may
    train for during one of its turns, so no objective starves the others.
    """

    CONFIG = _CONFIG['recognition.FairSequentialSchedule']
    MAX_NUM_TRAIN_EPOCHS = CONFIG.getint('maximum_number_of_training_epochs_per_objective')

    label = 'fair_sequential'

    def _sample_objectives(self, split: str) -> Iterable[Objective]:
        """Cycle over the objectives of `split`, yielding each one at most
        `dataset_length` times per turn and, for training, at most
        MAX_NUM_TRAIN_EPOCHS epochs beyond where its turn started.
        """
        while True:
            for objective in self.objectives[split].values():
                epoch_at_turn_start = objective.epoch
                for _ in range(objective.dataset_length[split]):
                    # Converged objectives are skipped unless they must
                    # still be logged.
                    converged_and_silent = (
                        objective in self.converged_objectives
                        and not self.args.log_converged_objectives)
                    if converged_and_silent:
                        continue
                    # During training, stop sampling once the objective has
                    # used up its per-turn epoch budget.
                    if (split == 'train'
                            and objective.epoch - epoch_at_turn_start
                            >= self.MAX_NUM_TRAIN_EPOCHS):
                        continue
                    yield objective
class NerModel:
CONFIG = _CONFIG['recognition.NerModel']
ROOT_PATH = Path(CONFIG['root_path'])
......@@ -53,6 +32,7 @@ class NerModel:
SAVE_STEPS = CONFIG.getint('save_every_n_steps')
LOGGING_STEPS = CONFIG.getint('log_every_n_steps')
NUM_TRAIN_EPOCHS = CONFIG.getint('number_of_training_epochs')
SCHEDULE_NAME = CONFIG['schedule']
    def __init__(self, model: AutoModelForTokenClassification):
        """Wrap a pretrained token-classification model for NER use."""
        self.model = model
......@@ -61,7 +41,11 @@ class NerModel:
def train_and_save(cls, model_checkpoint_basename: str, model_basename: str,
training_sentence_basename: str, validation_sentence_basename: str,
training_tagged_sentence_basename: str,
validation_tagged_sentence_basename: str) -> None:
validation_tagged_sentence_basename: str,
schedule_name: Optional[ScheduleName] = None) -> None:
if schedule_name is None:
schedule_name = cls.SCHEDULE_NAME
lang_module = LangModule(cls.BASE_MODEL)
......@@ -113,7 +97,7 @@ class NerModel:
fp16_full_eval=True,
)
schedule = FairSequentialSchedule([mlm_objective, ner_objective], adaptation_arguments)
schedule = get_schedule(schedule_name, [mlm_objective, ner_objective], adaptation_arguments)
adapter = Adapter(lang_module, schedule, adaptation_arguments)
adapter.train()
......
from typing import Iterable
from adaptor.objectives.objective_base import Objective
from adaptor.schedules import Schedule, SequentialSchedule, ParallelSchedule
from adaptor.utils import AdaptationArguments
from ..config import CONFIG as _CONFIG
# Type alias for the schedule identifier strings accepted by `get_schedule`
# ('sequential', 'fair-sequential', 'parallel').
ScheduleName = str
class FairSequentialSchedule(SequentialSchedule):
    """Sequential schedule that limits how many epochs each objective may
    train for during a single turn, so no objective starves the others.
    """

    # Per-turn epoch cap is read from the project configuration.
    CONFIG = _CONFIG['recognition.FairSequentialSchedule']
    MAX_NUM_TRAIN_EPOCHS = CONFIG.getint('maximum_number_of_training_epochs_per_objective')

    label = 'fair_sequential'

    def _sample_objectives(self, split: str) -> Iterable[Objective]:
        """Yield objectives for `split` in round-robin turns; during training,
        stop yielding an objective once it has advanced MAX_NUM_TRAIN_EPOCHS
        epochs within its current turn.
        """
        while True:
            for objective in self.objectives[split].values():
                # Remember the epoch counter at the start of this turn.
                starting_epoch = objective.epoch
                for _ in range(objective.dataset_length[split]):
                    # Skip converged objectives unless they should still be logged.
                    if objective in self.converged_objectives and not self.args.log_converged_objectives:
                        continue
                    if split == 'train':
                        num_train_epochs = objective.epoch - starting_epoch
                        # Enforce the per-turn training-epoch budget.
                        if num_train_epochs >= self.MAX_NUM_TRAIN_EPOCHS:
                            continue
                    yield objective
def get_schedule(schedule_name: str, objectives: Iterable[Objective],
                 adaptation_arguments: AdaptationArguments) -> Schedule:
    """Build the training schedule identified by `schedule_name`.

    :param schedule_name: One of 'sequential', 'fair-sequential', 'parallel'.
    :param objectives: Training objectives handed to the schedule.
    :param adaptation_arguments: Adaptor arguments forwarded to the schedule.
    :return: The instantiated schedule.
    :raises ValueError: If `schedule_name` is not a known identifier.
    """
    # Materialize once: schedules may iterate the objectives repeatedly.
    objectives = list(objectives)
    schedule_classes = {
        'sequential': SequentialSchedule,
        'fair-sequential': FairSequentialSchedule,
        'parallel': ParallelSchedule,
    }
    if schedule_name not in schedule_classes:
        raise ValueError(f'Unknown schedule "{schedule_name}"')
    return schedule_classes[schedule_name](objectives, adaptation_arguments)
import os
import sys
from ahisto_named_entity_search.recognition import NerModel
from ahisto_named_entity_search.recognition import NerModel, get_schedule
if __name__ == '__main__':
    # Usage: 03_train_ner_models.py SEARCH_METHOD CROSS_PAGE_BOUNDARIES
    #        ONLY_RELEVANT SCHEDULE_NAME
    assert len(sys.argv) == 5
    search_method = sys.argv[1]
    cross_page_boundaries = sys.argv[2]
    only_relevant = sys.argv[3]
    schedule_name = sys.argv[4]

    # Group Comet.ml experiments per search configuration.
    project_name = f'AHISTO NER: {search_method}, {cross_page_boundaries}, {only_relevant}'
    os.environ['COMET_PROJECT_NAME'] = project_name

    # Encode the schedule into the model name so runs with different
    # schedules do not overwrite one another.
    model_basename = f'model_ner_{search_method}_{cross_page_boundaries}_{only_relevant}_{schedule_name}'
    model_checkpoint_basename = f'{model_basename}_checkpoints'
    sentence_basename = f'dataset_mlm_{cross_page_boundaries}_{only_relevant}'

    # NOTE(review): the definitions of training_/validation_*_basename fall in
    # lines elided from this diff view.
    NerModel.train_and_save(model_checkpoint_basename, model_basename,
                            training_sentence_basename, validation_sentence_basename,
                            training_tagged_sentence_basename,
                            validation_tagged_sentence_basename, schedule_name)
......@@ -8,6 +8,7 @@ IMAGE_NAME=ahisto/named-entity-search:latest
# Shared locations on group storage, mounted into the training container.
ROOT_PATH=/nlp/projekty/ahisto/public_html/named-entity-search/results/
ANNOTATION_PATH=/nlp/projekty/ahisto/annotations/
OCR_EVAL_PATH=/nlp/projekty/ahisto/ahisto-ocr-eval
# Training schedule passed to scripts/03_train_ner_models.py; must be a name
# accepted by get_schedule() ('sequential', 'fair-sequential', 'parallel').
SCHEDULE_NAME=fair-sequential

# Build the training image, mapping the host user into the container.
DOCKER_BUILDKIT=1 docker build --build-arg UID="$(id -u)" --build-arg GID="$(id -g)" --build-arg UNAME="$(id -u -n)" . -f scripts//03_train_ner_models.Dockerfile -t "$IMAGE_NAME"
......@@ -15,5 +16,5 @@ parallel --halt=soon,fail=100% --jobs=100% --bar --delay 60 \
--colsep ' +' \
-- '
GPU_ID=$(nvidia-smi | grep -F -B 1 -- "0MiB / 15360MiB" | head -n 1 | awk "{ print \$2 }")
docker run --rm -u "$(id -u):$(id -g)" --hostname "'"$HOSTNAME"'" --runtime=nvidia -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" -e TOKENIZERS_PARALLELISM=false -e COMET_API_KEY -v "$PWD"/..:/workdir:rw -w /workdir/"${PWD##*/}" -v "'"$ROOT_PATH"'":"'"$ROOT_PATH"'":rw -v "'"$ANNOTATION_PATH"'":"'"$ANNOTATION_PATH"'":ro -v "'"$OCR_EVAL_PATH"'":"'"$OCR_EVAL_PATH"'":ro "'"$IMAGE_NAME"'" nice -n 19 python scripts/03_train_ner_models.py {1} {2} {3}
docker run --rm -u "$(id -u):$(id -g)" --hostname "'"$HOSTNAME"'" --runtime=nvidia -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" -e TOKENIZERS_PARALLELISM=false -e COMET_API_KEY -v "$PWD"/..:/workdir:rw -w /workdir/"${PWD##*/}" -v "'"$ROOT_PATH"'":"'"$ROOT_PATH"'":rw -v "'"$ANNOTATION_PATH"'":"'"$ANNOTATION_PATH"'":ro -v "'"$OCR_EVAL_PATH"'":"'"$OCR_EVAL_PATH"'":ro "'"$IMAGE_NAME"'" nice -n 19 python scripts/03_train_ner_models.py {1} {2} {3} '"$SCHEDULE_NAME"'
' :::: scripts/03_train_ner_models.tasks
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment