Commit cc65efdc authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files

Add output-ocr-calamari.{log,accuracy-results}

parent 19e63942
Loading
Loading
Loading
Loading
+55 −22
Original line number Original line Diff line number Diff line
.PHONY: all import-sql-dump remount
.PHONY: all import-sql-dump remount setup-python


SHELL = /bin/bash
SHELL = /bin/bash


@@ -13,6 +13,12 @@ TESSERACT_DATA = $(SCRIPT_DIRNAME)/tessdata
TESSERACT_OUTPUT_FORMATS = hocr txt tsv makebox
TESSERACT_OUTPUT_FORMATS = hocr txt tsv makebox
TESSERACT_TIMEOUT = 120
TESSERACT_TIMEOUT = 120


# Calamari OCR configuration.
# CALAMARI_PREDICT_RUN is the command recipes invoke; it expands to the
# calamari-predict executable named by CALAMARI_PREDICT.
CALAMARI_PREDICT_RUN = $(CALAMARI_PREDICT)
CALAMARI_PREDICT = calamari-predict
# --checkpoint receives every line of $(CALAMARI_MODEL_FILENAMES) prefixed with
# "$(CALAMARI_MODELS)/". The $$(...) reaches the shell as $(...), so sed runs
# when the recipe executes, not when make parses this file.
CALAMARI_PREDICT_OPTIONS = --checkpoint $$(sed 's|^|$(CALAMARI_MODELS)/|' $(CALAMARI_MODEL_FILENAMES))
# Directory into which the calamari_models repository is cloned.
CALAMARI_MODELS = $(SCRIPT_DIRNAME)/calamari_models
# File listing the checkpoints to use, as paths relative to $(CALAMARI_MODELS).
CALAMARI_MODEL_FILENAMES = $(SCRIPT_DIRNAME)/calamari_models_filenames

CONVERT_RUN = $(CONVERT)
CONVERT_RUN = $(CONVERT)
CONVERT_OPTIONS = -deskew 45% -quality 100% +repage
CONVERT_OPTIONS = -deskew 45% -quality 100% +repage
CONVERT = convert
CONVERT = convert
@@ -57,23 +63,32 @@ INPUT_SQL_DUMP = $(DATA_DIRNAME)/CMS_archiv_SQL_20200528.sql.gz
INPUT_CSV_DUMP = $(DATA_DIRNAME)/'Obsah CMS Sources online - zakladni data a signatury.csv'
INPUT_CSV_DUMP = $(DATA_DIRNAME)/'Obsah CMS Sources online - zakladni data a signatury.csv'


OUTPUT_DESKEWED_SCRIPT_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed
OUTPUT_DESKEWED_SCRIPT_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed
OUTPUT_OCR3_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3
OUTPUT_OCR_TESSERACT3_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3
OUTPUT_DESKEWED_OCR3_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3
OUTPUT_DESKEWED_OCR_TESSERACT3_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3
OUTPUT_OCR34_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3+4
OUTPUT_OCR_TESSERACT34_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3+4
OUTPUT_DESKEWED_OCR34_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3+4
OUTPUT_DESKEWED_OCR_TESSERACT34_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3+4
OUTPUT_OCR4_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr4
OUTPUT_OCR_TESSERACT4_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr4
OUTPUT_DESKEWED_OCR4_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr4
OUTPUT_DESKEWED_OCR_TESSERACT4_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr4
OUTPUT_OCR_CALAMARI_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr-calamari
OUTPUT_GROUND_TRUTH_DIRNAME = $(SCRIPT_DIRNAME)/ground-truth
OUTPUT_GROUND_TRUTH_DIRNAME = $(SCRIPT_DIRNAME)/ground-truth
OUTPUT_GROUND_TRUTH_FILENAMES = $(SCRIPT_DIRNAME)/ground-truth_filenames
OUTPUT_GROUND_TRUTH_FILENAMES = $(SCRIPT_DIRNAME)/ground-truth_filenames


PYTHON = python3
PYTHON = python3
PYTHON_RUN = nice -n $(NICENESS) $(PYTHON) -m
PYTHON_RUN = nice -n $(NICENESS) $(PYTHON) -m


OUTPUT_OCRS = $(OUTPUT_OCR3_DIRNAME) $(OUTPUT_DESKEWED_OCR3_DIRNAME) $(OUTPUT_OCR34_DIRNAME) $(OUTPUT_DESKEWED_OCR34_DIRNAME) $(OUTPUT_OCR4_DIRNAME) $(OUTPUT_DESKEWED_OCR4_DIRNAME)
OUTPUT_OCRS = $(OUTPUT_OCR_TESSERACT3_DIRNAME) $(OUTPUT_DESKEWED_OCR_TESSERACT3_DIRNAME) $(OUTPUT_OCR_TESSERACT34_DIRNAME) $(OUTPUT_DESKEWED_OCR_TESSERACT34_DIRNAME) $(OUTPUT_OCR_TESSERACT4_DIRNAME) $(OUTPUT_DESKEWED_OCR_TESSERACT4_DIRNAME) $(OUTPUT_OCR_CALAMARI_DIRNAME)
OUTPUTS = $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(OUTPUT_OCRS) $(OUTPUT_GROUND_TRUTH_FILENAMES)
OUTPUTS = $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(OUTPUT_OCRS) $(OUTPUT_GROUND_TRUTH_FILENAMES)


all: $(OUTPUTS)
all: $(OUTPUTS)


# Set up the Python environment: install Miniconda, create the `tesseract`
# conda environment (Python 3, CUDA toolkit 10.1, pip), unpack cuDNN 7.6.5 and
# move its shared libraries into the environment, then install the packages
# from requirements.txt into that environment.
#
# Every recipe line runs in its own shell, so activating the environment and
# running pip must share one line; otherwise the activation is lost and pip
# installs into whatever Python happens to be first on PATH. `conda activate`
# also needs conda's shell hook, which is sourced from the conda base prefix.
#
# NOTE(review): the Miniconda installer is piped into bash unchanged -- confirm
# that it supports being run from stdin.
# NOTE(review): the cuDNN URL is under NVIDIA's `secure` path and may require
# an authenticated session, and the `mv` expects the archive to unpack into
# conda/lib64 -- confirm both against the actual download.
setup-python:
	curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh | bash
	conda create --name tesseract python=3 cudatoolkit=10.1 pip
	curl https://developer.nvidia.com/compute/machine-learning/cudnn/secure/7.6.5.32/Production/10.1_20191031/cudnn-10.1-linux-x64-v7.6.5.32.tgz | tar xzv
	mv conda/lib64/*.so ~/miniconda3/envs/tesseract/lib
	source "$$(conda info --base)/etc/profile.d/conda.sh" && conda activate tesseract && pip install -r requirements.txt

import-sql-dump:
import-sql-dump:
	(echo 'USE CMS_archiv;'; gzip -d < <(pv $(INPUT_SQL_DUMP))) | $(MYSQL_RUN) $(MYSQL_OPTIONS)
	(echo 'USE CMS_archiv;'; gzip -d < <(pv $(INPUT_SQL_DUMP))) | $(MYSQL_RUN) $(MYSQL_OPTIONS)


@@ -101,6 +116,9 @@ $(TESSERACT_DATA):
	git clone https://github.com/tesseract-ocr/tessdata.git $@
	git clone https://github.com/tesseract-ocr/tessdata.git $@
	git clone https://github.com/tesseract-ocr/tessconfigs.git $@/tessconfigs
	git clone https://github.com/tesseract-ocr/tessconfigs.git $@/tessconfigs


# Clone the Calamari-OCR/calamari_models repository into the directory named by
# $(CALAMARI_MODELS). The rule has no prerequisites, so it only runs while that
# directory does not exist yet.
$(CALAMARI_MODELS):
	git clone https://github.com/Calamari-OCR/calamari_models.git $@

$(INPUT_DIRNAMES):
$(INPUT_DIRNAMES):
	(cd $(INPUT_DIRNAME) && find -type d) > $@
	(cd $(INPUT_DIRNAME) && find -type d) > $@


@@ -127,13 +145,25 @@ mkdir -p $@
(cd $@ && $(PARALLEL_RUN) $(PARALLEL_SINGLE_NODE_OPTIONS) -- 'mkdir -p {}') < $(INPUT_DIRNAMES)
(cd $@ && $(PARALLEL_RUN) $(PARALLEL_SINGLE_NODE_OPTIONS) -- 'mkdir -p {}') < $(INPUT_DIRNAMES)
endef
endef


define ocr =
# Recipe fragment: delete the target ($@) and recreate it as a copy of the
# first prerequisite's ($<) directory tree in which files are symbolic links
# to the originals instead of copies (cp -as).
# NOTE(review): GNU cp -s only creates relative symlinks in the current
# directory, so this presumably needs $< to be an absolute path -- confirm.
define symlink-directories
# symlink directory structure
rm -rf $@
cp -as $< $@
endef

# Recipe fragment: run Calamari OCR inside the target directory.
#   $(1)  options passed to $(CALAMARI_PREDICT_RUN) before --files
# The target is first rebuilt as a symlinked copy of the first prerequisite.
# The names in $(INPUT_FILENAMES_FILTERED) are then turned into a NUL-separated
# list (tr maps CR and LF to NUL) and appended after --files by xargs, which
# runs from within the target directory and is timed. Standard output and
# standard error (|&, a bash feature) are shown and also saved to $@.log.
# NOTE(review): unless pipefail is enabled elsewhere, the recipe's exit status
# is that of tee, so a failing prediction run would presumably not fail the
# build -- confirm whether that is intended.
define ocr-calamari=
$(symlink-directories)
# run the ocr
tr '\r\n' '\0' < $(INPUT_FILENAMES_FILTERED) | (cd $@ && time xargs --null -- $(CALAMARI_PREDICT_RUN) $(1) --files) |& tee $@.log
endef

define ocr-tesseract =
$(create-directories)
$(create-directories)
# run the ocr
# run the ocr
$(PARALLEL_RUN) $(PARALLEL_MANY_NODES_OPTIONS) --timeout $(TESSERACT_TIMEOUT) -- '$(TESSERACT_RUN) $</{} $@/{.} $(1)' :::: $(INPUT_FILENAMES_FILTERED)
$(PARALLEL_RUN) $(PARALLEL_MANY_NODES_OPTIONS) --timeout $(TESSERACT_TIMEOUT) -- '$(TESSERACT_RUN) $</{} $@/{.} $(1)' :::: $(INPUT_FILENAMES_FILTERED)
endef
endef


define resume-ocr =
define resume-ocr-tesseract =
# resume the ocr
# resume the ocr
$(PARALLEL_RUN) $(PARALLEL_MANY_NODES_OPTIONS) --timeout $(TESSERACT_TIMEOUT) --resume-failed -- '$(TESSERACT_RUN) $</{} $@/{.} $(1)' :::: $(INPUT_FILENAMES_FILTERED)
$(PARALLEL_RUN) $(PARALLEL_MANY_NODES_OPTIONS) --timeout $(TESSERACT_TIMEOUT) --resume-failed -- '$(TESSERACT_RUN) $</{} $@/{.} $(1)' :::: $(INPUT_FILENAMES_FILTERED)
endef
endef
@@ -144,20 +174,23 @@ endef
%.speed-results: %.joblog
%.speed-results: %.joblog
	$(PYTHON_RUN) scripts.evaluate_speed $< > $@
	$(PYTHON_RUN) scripts.evaluate_speed $< > $@


$(OUTPUT_OCR3_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
$(OUTPUT_OCR_TESSERACT3_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr,$(TESSERACT_OPTIONS3))
	$(call ocr-tesseract,$(TESSERACT_OPTIONS3))

$(OUTPUT_OCR_TESSERACT34_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr-tesseract,$(TESSERACT_OPTIONS34))


$(OUTPUT_OCR34_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
$(OUTPUT_OCR_TESSERACT4_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr,$(TESSERACT_OPTIONS34))
	$(call ocr-tesseract,$(TESSERACT_OPTIONS4))


$(OUTPUT_OCR4_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
$(OUTPUT_OCR_CALAMARI_DIRNAME): $(INPUT_DIRNAME) $(CALAMARI_MODELS) $(CALAMARI_MODEL_FILENAMES) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr,$(TESSERACT_OPTIONS4))
	$(call ocr-calamari,$(CALAMARI_PREDICT_OPTIONS))


$(OUTPUT_DESKEWED_OCR3_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
$(OUTPUT_DESKEWED_OCR_TESSERACT3_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr,$(TESSERACT_OPTIONS3))
	$(call ocr-tesseract,$(TESSERACT_OPTIONS3))


$(OUTPUT_DESKEWED_OCR34_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
$(OUTPUT_DESKEWED_OCR_TESSERACT34_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr,$(TESSERACT_OPTIONS34))
	$(call ocr-tesseract,$(TESSERACT_OPTIONS34))


$(OUTPUT_DESKEWED_OCR4_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
$(OUTPUT_DESKEWED_OCR_TESSERACT4_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED)
	$(call ocr,$(TESSERACT_OPTIONS4))
	$(call ocr-tesseract,$(TESSERACT_OPTIONS4))
+6 −0
Original line number Original line Diff line number Diff line
antiqua_historical/4.ckpt
antiqua_historical_ligs/4.ckpt
antiqua_modern/4.ckpt
fraktur_historical/4.ckpt
fraktur_historical_ligs/4.ckpt
fraktur_19th_century/4.ckpt
+36093 −0

File added.

Preview size limit exceeded, changes collapsed.

+4 −2
Original line number Original line Diff line number Diff line
calamari-ocr~=1.0.5
edit_distance~=1.0.4
gensim~=3.8.3
gensim~=3.8.3
mysqlclient~=1.4.6
mysqlclient~=1.4.6
edit_distance~=1.0.4
tqdm~=4.46.1
numpy~=1.19.0
numpy~=1.19.0
scipy~=1.5.0
scipy~=1.5.0
tensorflow-gpu~=2.2.0
tqdm~=4.46.1
+8 −4
Original line number Original line Diff line number Diff line
@@ -30,7 +30,8 @@ OCR_FILENAMES = [
GROUND_TRUTH_FILENAMES_LIST_FILENAME = sys.argv[2]
GROUND_TRUTH_FILENAMES_LIST_FILENAME = sys.argv[2]
GROUND_TRUTH_FILENAMES = set(read_filenames(GROUND_TRUTH_FILENAMES_LIST_FILENAME))
GROUND_TRUTH_FILENAMES = set(read_filenames(GROUND_TRUTH_FILENAMES_LIST_FILENAME))
DEACCENTED_FILENAME_MAP = {
DEACCENTED_FILENAME_MAP = {
    deaccent(filename): filename
    deaccent(filename): '{}.pred.txt'.format(filename[:-4])
#   deaccent(filename): filename
    for filename
    for filename
    in OCR_FILENAMES
    in OCR_FILENAMES
    if deaccent(filename) in GROUND_TRUTH_FILENAMES
    if deaccent(filename) in GROUND_TRUTH_FILENAMES
@@ -45,8 +46,11 @@ assert len(DEACCENTED_FILENAME_MAP) == len(GROUND_TRUTH_FILENAMES)


def evaluate_worker(deaccented_filename):
def evaluate_worker(deaccented_filename):
    filename = DEACCENTED_FILENAME_MAP[deaccented_filename]
    filename = DEACCENTED_FILENAME_MAP[deaccented_filename]
    try:
        correct_text = (INPUT_ROOT / deaccented_filename).open('rt').read()
        correct_text = (INPUT_ROOT / deaccented_filename).open('rt').read()
        predicted_text = (OUTPUT_ROOT / filename).open('rt').read()
        predicted_text = (OUTPUT_ROOT / filename).open('rt').read()
    except IOError:
        return None
    character_error_rate = get_character_error_rate(correct_text, predicted_text)
    character_error_rate = get_character_error_rate(correct_text, predicted_text)
    word_error_rate = get_word_error_rate(correct_text, predicted_text)
    word_error_rate = get_word_error_rate(correct_text, predicted_text)
    return (character_error_rate, word_error_rate)
    return (character_error_rate, word_error_rate)
@@ -81,7 +85,7 @@ def evaluate():
def sanity_check():
def sanity_check():
    for deaccented_filename, filename in DEACCENTED_FILENAME_MAP.items():
    for deaccented_filename, filename in DEACCENTED_FILENAME_MAP.items():
        assert (INPUT_ROOT / deaccented_filename).exists(), (INPUT_ROOT / deaccented_filename)
        assert (INPUT_ROOT / deaccented_filename).exists(), (INPUT_ROOT / deaccented_filename)
        assert (OUTPUT_ROOT / filename).exists(), (OUTPUT_ROOT / filename)
        # assert (OUTPUT_ROOT / filename).exists(), (OUTPUT_ROOT / filename)




def main():
def main():