Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
nlp
ahisto-modules
Named Entity Recognition Experiments
Commits
3d99edcc
Commit
3d99edcc
authored
Aug 13, 2022
by
Vít Novotný
Browse files
Separate NerModel training schedules to `recognition.schedule`
parent
63d085bd
Changes
6
Hide whitespace changes
Inline
Side-by-side
ahisto_named_entity_search/default.ini
View file @
3d99edcc
...
...
@@ -97,6 +97,7 @@ log_every_n_steps = 100
evaluate_every_n_steps
=
10000
save_every_n_steps
=
10000
number_of_training_epochs
=
10
schedule
=
fair-sequential
[recognition.FairSequentialSchedule]
maximum_number_of_training_epochs_per_objective
=
1
ahisto_named_entity_search/recognition/__init__.py
View file @
3d99edcc
...
...
@@ -2,7 +2,16 @@ from .model import (
NerModel
)
from
.schedule
import
(
get_schedule
,
ScheduleName
,
Schedule
,
)
__all__
=
[
'get_schedule'
,
'NerModel'
,
'ScheduleName'
,
'Schedule'
,
]
ahisto_named_entity_search/recognition/model.py
View file @
3d99edcc
...
...
@@ -2,47 +2,26 @@ from __future__ import annotations
from
logging
import
getLogger
from
pathlib
import
Path
from
typing
import
Tuple
,
List
,
Iterable
from
typing
import
Tuple
,
List
,
Optional
import
comet_ml
# noqa: F401
from
adaptor.adapter
import
Adapter
from
adaptor.objectives.objective_base
import
Objective
from
adaptor.objectives.classification
import
TokenClassification
from
adaptor.evaluators.token_classification
import
MeanFScore
from
adaptor.objectives.MLM
import
MaskedLanguageModeling
from
adaptor.lang_module
import
LangModule
from
adaptor.schedules
import
SequentialSchedule
from
adaptor.utils
import
StoppingStrategy
,
AdaptationArguments
from
transformers
import
AutoModelForTokenClassification
from
..config
import
CONFIG
as
_CONFIG
from
..document
import
Document
,
Sentence
from
..search
import
TaggedSentence
,
NerTags
from
.schedule
import
ScheduleName
,
get_schedule
LOGGER
=
getLogger
(
__name__
)
class
FairSequentialSchedule
(
SequentialSchedule
):
CONFIG
=
_CONFIG
[
'recognition.FairSequentialSchedule'
]
MAX_NUM_TRAIN_EPOCHS
=
CONFIG
.
getint
(
'maximum_number_of_training_epochs_per_objective'
)
label
=
'fair_sequential'
def
_sample_objectives
(
self
,
split
:
str
)
->
Iterable
[
Objective
]:
while
True
:
for
objective
in
self
.
objectives
[
split
].
values
():
starting_epoch
=
objective
.
epoch
for
_
in
range
(
objective
.
dataset_length
[
split
]):
if
objective
in
self
.
converged_objectives
and
not
self
.
args
.
log_converged_objectives
:
continue
if
split
==
'train'
:
num_train_epochs
=
objective
.
epoch
-
starting_epoch
if
num_train_epochs
>=
self
.
MAX_NUM_TRAIN_EPOCHS
:
continue
yield
objective
class
NerModel
:
CONFIG
=
_CONFIG
[
'recognition.NerModel'
]
ROOT_PATH
=
Path
(
CONFIG
[
'root_path'
])
...
...
@@ -53,6 +32,7 @@ class NerModel:
SAVE_STEPS
=
CONFIG
.
getint
(
'save_every_n_steps'
)
LOGGING_STEPS
=
CONFIG
.
getint
(
'log_every_n_steps'
)
NUM_TRAIN_EPOCHS
=
CONFIG
.
getint
(
'number_of_training_epochs'
)
SCHEDULE_NAME
=
CONFIG
[
'schedule'
]
def
__init__
(
self
,
model
:
AutoModelForTokenClassification
):
self
.
model
=
model
...
...
@@ -61,7 +41,11 @@ class NerModel:
def
train_and_save
(
cls
,
model_checkpoint_basename
:
str
,
model_basename
:
str
,
training_sentence_basename
:
str
,
validation_sentence_basename
:
str
,
training_tagged_sentence_basename
:
str
,
validation_tagged_sentence_basename
:
str
)
->
None
:
validation_tagged_sentence_basename
:
str
,
schedule_name
:
Optional
[
ScheduleName
]
=
None
)
->
None
:
if
schedule_name
is
None
:
schedule_name
=
cls
.
SCHEDULE_NAME
lang_module
=
LangModule
(
cls
.
BASE_MODEL
)
...
...
@@ -113,7 +97,7 @@ class NerModel:
fp16_full_eval
=
True
,
)
schedule
=
FairSequentialSchedule
(
[
mlm_objective
,
ner_objective
],
adaptation_arguments
)
schedule
=
get_schedule
(
schedule_name
,
[
mlm_objective
,
ner_objective
],
adaptation_arguments
)
adapter
=
Adapter
(
lang_module
,
schedule
,
adaptation_arguments
)
adapter
.
train
()
...
...
ahisto_named_entity_search/recognition/schedule.py
0 → 100644
View file @
3d99edcc
from
typing
import
Iterable
from
adaptor.objectives.objective_base
import
Objective
from
adaptor.schedules
import
Schedule
,
SequentialSchedule
,
ParallelSchedule
from
adaptor.utils
import
AdaptationArguments
from
..config
import
CONFIG
as
_CONFIG
ScheduleName
=
str
class FairSequentialSchedule(SequentialSchedule):
    """Sequential schedule that caps how long one objective keeps its turn.

    Objectives are visited in a round-robin fashion. During its turn an
    objective is offered for as many steps as its dataset has items in the
    given split. On the ``'train'`` split the objective stops being yielded
    for the rest of its turn once its ``epoch`` counter has advanced by
    ``MAX_NUM_TRAIN_EPOCHS`` since the turn began.
    """

    CONFIG = _CONFIG['recognition.FairSequentialSchedule']
    # Upper bound on the epoch-counter advance allowed within a single turn.
    MAX_NUM_TRAIN_EPOCHS = CONFIG.getint('maximum_number_of_training_epochs_per_objective')

    label = 'fair_sequential'

    def _sample_objectives(self, split: str) -> Iterable[Objective]:
        """Endlessly yield the objectives registered for ``split``, in turns.

        :param split: Name of the dataset split; the epoch cap is only
            enforced when this equals ``'train'``.
        :return: A never-ending iterator over objectives.
        """
        is_training = split == 'train'
        while True:
            for current in self.objectives[split].values():
                # Remember where the objective was when its turn started.
                epoch_at_turn_start = current.epoch
                for _ in range(current.dataset_length[split]):
                    skip_converged = (
                        current in self.converged_objectives
                        and not self.args.log_converged_objectives
                    )
                    if skip_converged:
                        continue
                    # The epoch counter is only consulted for training, and
                    # only after the convergence check has passed.
                    if is_training and current.epoch - epoch_at_turn_start >= self.MAX_NUM_TRAIN_EPOCHS:
                        continue
                    yield current
def get_schedule(schedule_name: str, objectives: Iterable[Objective],
                 adaptation_arguments: AdaptationArguments) -> Schedule:
    """Build the training schedule registered under ``schedule_name``.

    :param schedule_name: One of ``'sequential'``, ``'fair-sequential'`` or
        ``'parallel'``.
    :param objectives: Objectives to schedule; materialized into a list
        before the schedule is constructed.
    :param adaptation_arguments: Arguments forwarded to the schedule class.
    :return: The constructed schedule.
    :raises ValueError: If ``schedule_name`` is not a known schedule.
    """
    objectives = list(objectives)
    known_schedules = (
        ('sequential', SequentialSchedule),
        ('fair-sequential', FairSequentialSchedule),
        ('parallel', ParallelSchedule),
    )
    for known_name, schedule_class in known_schedules:
        if schedule_name == known_name:
            return schedule_class(objectives, adaptation_arguments)
    raise ValueError(f'Unknown schedule "{schedule_name}"')
scripts/03_train_ner_models.py
View file @
3d99edcc
import
os
import
sys
from
ahisto_named_entity_search.recognition
import
NerModel
from
ahisto_named_entity_search.recognition
import
NerModel
,
get_schedule
if
__name__
==
'__main__'
:
assert
len
(
sys
.
argv
)
==
4
assert
len
(
sys
.
argv
)
==
5
search_method
=
sys
.
argv
[
1
]
cross_page_boundaries
=
sys
.
argv
[
2
]
only_relevant
=
sys
.
argv
[
3
]
schedule_name
=
sys
.
argv
[
4
]
project_name
=
f
'AHISTO NER:
{
search_method
}
,
{
cross_page_boundaries
}
,
{
only_relevant
}
'
os
.
environ
[
'COMET_PROJECT_NAME'
]
=
project_name
model_basename
=
f
'model_ner_
{
search_method
}
_
{
cross_page_boundaries
}
_
{
only_relevant
}
_
fair-sequential
'
model_basename
=
f
'model_ner_
{
search_method
}
_
{
cross_page_boundaries
}
_
{
only_relevant
}
_
{
schedule_name
}
'
model_checkpoint_basename
=
f
'
{
model_basename
}
_checkpoints'
sentence_basename
=
f
'dataset_mlm_
{
cross_page_boundaries
}
_
{
only_relevant
}
'
...
...
@@ -31,4 +32,4 @@ if __name__ == '__main__':
NerModel
.
train_and_save
(
model_checkpoint_basename
,
model_basename
,
training_sentence_basename
,
validation_sentence_basename
,
training_tagged_sentence_basename
,
validation_tagged_sentence_basename
)
validation_tagged_sentence_basename
,
schedule_name
)
scripts/03_train_ner_models.sh
View file @
3d99edcc
...
...
@@ -8,6 +8,7 @@ IMAGE_NAME=ahisto/named-entity-search:latest
ROOT_PATH
=
/nlp/projekty/ahisto/public_html/named-entity-search/results/
ANNOTATION_PATH
=
/nlp/projekty/ahisto/annotations/
OCR_EVAL_PATH
=
/nlp/projekty/ahisto/ahisto-ocr-eval
SCHEDULE_NAME
=
fair-sequential
DOCKER_BUILDKIT
=
1 docker build
--build-arg
UID
=
"
$(
id
-u
)
"
--build-arg
GID
=
"
$(
id
-g
)
"
--build-arg
UNAME
=
"
$(
id
-u
-n
)
"
.
-f
scripts//03_train_ner_models.Dockerfile
-t
"
$IMAGE_NAME
"
...
...
@@ -15,5 +16,5 @@ parallel --halt=soon,fail=100% --jobs=100% --bar --delay 60 \
--colsep
' +'
\
--
'
GPU_ID=$(nvidia-smi | grep -F -B 1 -- "0MiB / 15360MiB" | head -n 1 | awk "{ print \$2 }")
docker run --rm -u "$(id -u):$(id -g)" --hostname "'
"
$HOSTNAME
"
'" --runtime=nvidia -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" -e TOKENIZERS_PARALLELISM=false -e COMET_API_KEY -v "$PWD"/..:/workdir:rw -w /workdir/"${PWD##*/}" -v "'
"
$ROOT_PATH
"
'":"'
"
$ROOT_PATH
"
'":rw -v "'
"
$ANNOTATION_PATH
"
'":"'
"
$ANNOTATION_PATH
"
'":ro -v "'
"
$OCR_EVAL_PATH
"
'":"'
"
$OCR_EVAL_PATH
"
'":ro "'
"
$IMAGE_NAME
"
'" nice -n 19 python scripts/03_train_ner_models.py {1} {2} {3}
docker run --rm -u "$(id -u):$(id -g)" --hostname "'
"
$HOSTNAME
"
'" --runtime=nvidia -e CUDA_DEVICE_ORDER=PCI_BUS_ID -e NVIDIA_VISIBLE_DEVICES="$GPU_ID" -e TOKENIZERS_PARALLELISM=false -e COMET_API_KEY -v "$PWD"/..:/workdir:rw -w /workdir/"${PWD##*/}" -v "'
"
$ROOT_PATH
"
'":"'
"
$ROOT_PATH
"
'":rw -v "'
"
$ANNOTATION_PATH
"
'":"'
"
$ANNOTATION_PATH
"
'":ro -v "'
"
$OCR_EVAL_PATH
"
'":"'
"
$OCR_EVAL_PATH
"
'":ro "'
"
$IMAGE_NAME
"
'" nice -n 19 python scripts/03_train_ner_models.py {1} {2} {3}
'
"
$SCHEDULE_NAME
"
'
'
:::: scripts/03_train_ner_models.tasks
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment