diff --git a/Makefile b/Makefile index cb8fcec99833b7fa01a0b3e69345f02edebfa243..ef97f326fb7174b5d983f986f6e66e30de23b78c 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,8 @@ SHELL = /bin/bash TESSERACT_RUN = $(TESSERACT) TESSERACT_OPTIONS3 = --oem 0 --psm 3 -l ces+deu+lat --tessdata-dir $(TESSERACT_DATA) $(TESSERACT_OUTPUT_FORMATS) -TESSERACT_OPTIONS4 = --oem 2 --psm 3 -l ces+deu+lat --tessdata-dir $(TESSERACT_DATA) $(TESSERACT_OUTPUT_FORMATS) +TESSERACT_OPTIONS4 = --oem 1 --psm 3 -l ces+deu+lat --tessdata-dir $(TESSERACT_DATA) $(TESSERACT_OUTPUT_FORMATS) +TESSERACT_OPTIONS34 = --oem 2 --psm 3 -l ces+deu+lat --tessdata-dir $(TESSERACT_DATA) $(TESSERACT_OUTPUT_FORMATS) TESSERACT = tesseract TESSERACT_DATA = $(SCRIPT_DIRNAME)/tessdata TESSERACT_OUTPUT_FORMATS = hocr txt tsv makebox @@ -45,7 +46,7 @@ SCRIPT_DIRNAME = /var/tmp/tesseract SCRIPT_DIRNAME_SOURCE = asteria04:$(SCRIPT_DIRNAME) DATA_DIRNAME = /var/tmp/ahisto-2020-06-26 DATA_DIRNAME_SOURCE = mir:/mnt/nvme-storage/ahisto-2020-06-26 -INPUT_DIRNAME = $(DATA_DIRNAME)/Knihovna/ +INPUT_DIRNAME = $(DATA_DIRNAME)/Knihovna INPUT_DIRNAMES = $(SCRIPT_DIRNAME)/input_dirnames INPUT_FILENAMES = $(SCRIPT_DIRNAME)/input_filenames INPUT_FILENAMES_FILTER = $(SCRIPT_DIRNAME)/input_filenames_filter @@ -54,16 +55,19 @@ INPUT_SQL_DUMP = $(DATA_DIRNAME)/CMS_archiv_SQL_20200528.sql.gz INPUT_CSV_DUMP = $(DATA_DIRNAME)/'Obsah CMS Sources online - zakladni data a signatury.csv' OUTPUT_DESKEWED_SCRIPT_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed -OUTPUT_DESKEWED_OCR3_SCRIPT_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3 -OUTPUT_DESKEWED_OCR4_SCRIPT_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr4 -OUTPUT_OCR3_SCRIPT_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3 +OUTPUT_OCR3_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3 +OUTPUT_DESKEWED_OCR3_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3 +OUTPUT_OCR34_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr3+4 +OUTPUT_DESKEWED_OCR34_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr3+4 +OUTPUT_OCR4_DIRNAME = $(SCRIPT_DIRNAME)/output-ocr4 +OUTPUT_DESKEWED_OCR4_DIRNAME = $(SCRIPT_DIRNAME)/output-deskewed-ocr4 OUTPUT_GROUND_TRUTH_DIRNAME = $(SCRIPT_DIRNAME)/ground-truth OUTPUT_GROUND_TRUTH_FILENAMES = $(SCRIPT_DIRNAME)/ground-truth_filenames PYTHON = python3 PYTHON_RUN = $(PYTHON) -m -OUTPUT_OCRS = $(OUTPUT_OCR3_SCRIPT_DIRNAME) $(OUTPUT_DESKEWED_OCR3_SCRIPT_DIRNAME) $(OUTPUT_DESKEWED_OCR4_SCRIPT_DIRNAME) +OUTPUT_OCRS = $(OUTPUT_OCR3_DIRNAME) $(OUTPUT_DESKEWED_OCR3_DIRNAME) $(OUTPUT_OCR34_DIRNAME) $(OUTPUT_DESKEWED_OCR34_DIRNAME) $(OUTPUT_OCR4_DIRNAME) $(OUTPUT_DESKEWED_OCR4_DIRNAME) OUTPUTS = $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(OUTPUT_OCRS) $(OUTPUT_GROUND_TRUTH_FILENAMES) all: $(OUTPUTS) @@ -132,11 +136,20 @@ define resume-ocr = $(PARALLEL_RUN) $(PARALLEL_MANY_NODES_OPTIONS) --timeout $(TESSERACT_TIMEOUT) --resume-failed -- '$(TESSERACT_RUN) $</{} $@/{.} $(1)' :::: $(INPUT_FILENAMES_FILTERED) endef -$(OUTPUT_DESKEWED_OCR3_SCRIPT_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) +$(OUTPUT_OCR3_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) $(call ocr,$(TESSERACT_OPTIONS3)) -$(OUTPUT_DESKEWED_OCR4_SCRIPT_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) +$(OUTPUT_OCR34_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) + $(call ocr,$(TESSERACT_OPTIONS34)) + +$(OUTPUT_OCR4_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) $(call ocr,$(TESSERACT_OPTIONS4)) -$(OUTPUT_OCR3_SCRIPT_DIRNAME): $(INPUT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) +$(OUTPUT_DESKEWED_OCR3_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) $(call ocr,$(TESSERACT_OPTIONS3)) + +$(OUTPUT_DESKEWED_OCR34_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) + $(call ocr,$(TESSERACT_OPTIONS34)) + +$(OUTPUT_DESKEWED_OCR4_DIRNAME): $(OUTPUT_DESKEWED_SCRIPT_DIRNAME) $(TESSERACT_DATA) $(INPUT_DIRNAMES) $(INPUT_FILENAMES_FILTERED) + $(call ocr,$(TESSERACT_OPTIONS4)) diff --git a/output-deskewed-ocr4.joblog b/output-deskewed-ocr3+4.joblog similarity index 100% rename from output-deskewed-ocr4.joblog rename to output-deskewed-ocr3+4.joblog diff --git a/output-ocr3+4.joblog b/output-ocr3+4.joblog new file mode 100644 index 0000000000000000000000000000000000000000..98abb72132a97e5823faad130794bb103f060699 Binary files /dev/null and b/output-ocr3+4.joblog differ diff --git a/output-ocr4.joblog b/output-ocr4.joblog new file mode 100644 index 0000000000000000000000000000000000000000..87d65451efbfcb7142393e306c5a8170243024b1 Binary files /dev/null and b/output-ocr4.joblog differ