From 40bf45c1a7e6567761d2adc6c875f6948693d95b Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Wed, 2 Mar 2022 22:03:26 +0000
Subject: [PATCH] Add scripts.combine_tesseract_with_google_docker

---
 .../combine_tesseract_with_google_docker.py   | 116 +++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 scripts/combine_tesseract_with_google_docker.py

diff --git a/scripts/combine_tesseract_with_google_docker.py b/scripts/combine_tesseract_with_google_docker.py
new file mode 100644
index 00000000..5f4c8860
--- /dev/null
+++ b/scripts/combine_tesseract_with_google_docker.py
@@ -0,0 +1,116 @@
+"""Combine Tesseract and Google Vision AI OCR outputs file by file.
+
+Positional command-line arguments:
+
+1. directory with the Tesseract outputs and a ``list.txt`` file,
+2. directory with the Google Vision AI outputs and a ``list.txt`` file,
+3. output directory, created when missing.
+
+The two ``list.txt`` files are paired line by line.  For every pair, the
+Tesseract ``.hocr``, ``.tsv``, ``.box``, and ``.page`` files and the Google
+``.json`` file are copied to the output directory, and a ``.txt`` file is
+written there: the Tesseract text when ``is_multicolumn`` returns a true value
+for the entry's hOCR file, the Google text otherwise.  The choice is recorded
+in a ``.ground-truth`` file as ``hocr`` or ``json``.
+"""
+
+import json
+from json import JSONDecodeError
+from pathlib import Path
+import shutil
+import sys
+
+from scripts.common import is_multicolumn
+
+
+# The paths are read at import time, so the module can only be loaded with
+# the three positional arguments present in ``sys.argv``.
+TESSERACT_PATH = Path(sys.argv[1])
+GOOGLE_VISION_AI_PATH = Path(sys.argv[2])
+OUTPUT_PATH = Path(sys.argv[3])
+
+# Pin the text encoding so that the results do not depend on the locale.
+_ENCODING = 'utf-8'
+
+
+def read_google_json(filelike) -> str:
+    """Return the recognized text stored in a Google Vision AI JSON file.
+
+    :param filelike: readable text file object holding the JSON document.
+    :return: the ``fullTextAnnotation.text`` of the first response, or an
+        empty string when the document is not valid JSON, has no responses,
+        or its first response is empty or carries no text annotation.
+    """
+    try:
+        ocr_output = json.load(filelike)
+    except JSONDecodeError:
+        return ''
+    if not isinstance(ocr_output, dict):
+        return ''
+    responses = ocr_output.get('responses')
+    if not responses or not responses[0]:
+        return ''
+    # A non-empty response may still lack the annotation, e.g. when it only
+    # reports an error; treat that as an entry without text.
+    return responses[0].get('fullTextAnnotation', {}).get('text', '')
+
+
+def _read_list(directory: Path) -> list:
+    """Return the lines of ``list.txt`` in *directory* as ``Path`` objects."""
+    with (directory / 'list.txt').open('rt', encoding=_ENCODING) as f:
+        return [Path(line.rstrip('\r\n')) for line in f]
+
+
+def main() -> None:
+    """Pair the listed entries and write the combined OCR output.
+
+    :raises ValueError: when the two ``list.txt`` files differ in length,
+        because pairing them by position would drop or misalign entries.
+    """
+    tesseract_filenames = _read_list(TESSERACT_PATH)
+    google_filenames = _read_list(GOOGLE_VISION_AI_PATH)
+    if len(tesseract_filenames) != len(google_filenames):
+        raise ValueError(
+            f'{TESSERACT_PATH} lists {len(tesseract_filenames)} files, but '
+            f'{GOOGLE_VISION_AI_PATH} lists {len(google_filenames)}'
+        )
+
+    OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
+
+    for tesseract_filename, google_filename in zip(tesseract_filenames, google_filenames):
+        # NOTE(review): ``with_suffix`` replaces everything after the last dot,
+        # so a stem that itself contains a dot loses its tail -- confirm that
+        # the listed names never do.
+        tesseract_base = TESSERACT_PATH / tesseract_filename.stem
+        google_base = GOOGLE_VISION_AI_PATH / google_filename.stem
+        output_base = OUTPUT_PATH / google_filename.stem
+
+        input_tesseract_filename = tesseract_base.with_suffix('.txt')
+        input_hocr_filename = tesseract_base.with_suffix('.hocr')
+        input_google_filename = google_base.with_suffix('.json')
+        output_filename = output_base.with_suffix('.txt')
+        decision_filename = output_base.with_suffix('.ground-truth')
+
+        # Keep the raw OCR outputs next to the combined text.
+        for suffix in ('.hocr', '.tsv', '.box', '.page'):
+            shutil.copy(tesseract_base.with_suffix(suffix), output_base.with_suffix(suffix))
+        shutil.copy(input_google_filename, output_base.with_suffix('.json'))
+
+        # A blank hOCR file yields a blank text file and no decision file.
+        if not input_hocr_filename.read_text(encoding=_ENCODING).strip():
+            output_filename.write_text('\n', encoding=_ENCODING)
+            continue
+
+        if is_multicolumn(input_hocr_filename):
+            shutil.copy(input_tesseract_filename, output_filename)
+            decision = 'hocr'
+        else:
+            with input_google_filename.open('rt', encoding=_ENCODING) as f:
+                text = read_google_json(f)
+            output_filename.write_text(f'{text}\n', encoding=_ENCODING)
+            decision = 'json'
+        decision_filename.write_text(f'{decision}\n', encoding=_ENCODING)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab