From 40bf45c1a7e6567761d2adc6c875f6948693d95b Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Wed, 2 Mar 2022 22:03:26 +0000
Subject: [PATCH] Add scripts.combine_tesseract_with_google_docker

---
 .../combine_tesseract_with_google_docker.py   | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 scripts/combine_tesseract_with_google_docker.py

diff --git a/scripts/combine_tesseract_with_google_docker.py b/scripts/combine_tesseract_with_google_docker.py
new file mode 100644
index 00000000..5f4c8860
--- /dev/null
+++ b/scripts/combine_tesseract_with_google_docker.py
@@ -0,0 +1,72 @@
+import json
+from json import JSONDecodeError
+from pathlib import Path
+import shutil
+import sys
+
+from scripts.common import is_multicolumn
+
+
+TESSERACT_PATH = Path(sys.argv[1])  # argv[1]: directory of Tesseract outputs
+GOOGLE_VISION_AI_PATH = Path(sys.argv[2])  # argv[2]: directory of Google JSON outputs
+OUTPUT_PATH = Path(sys.argv[3])  # argv[3]: directory receiving the combined results
+
+
+def read_google_json(filelike) -> str:
+    """Return the OCR text of a Google Vision AI JSON response, or '' if it has none."""
+    try:
+        responses = json.load(filelike)['responses']
+    except JSONDecodeError:  # unparseable file, e.g. an empty one
+        return ''
+    first = (responses or [{}])[0] or {}  # empty list or empty reply -> no text
+    return (first.get('fullTextAnnotation') or {}).get('text', '')  # error-only reply
+
+
+def main() -> None:
+    """Combine per-page Tesseract and Google Vision AI OCR outputs in OUTPUT_PATH.
+
+    Pages are paired by position in the two `list.txt` files. Multi-column
+    pages keep the Tesseract text, all others take the Google text, and the
+    choice ('hocr' or 'json') is recorded in `<stem>.ground-truth`.
+
+    Raises:
+        ValueError: If the two `list.txt` files list different page counts.
+    """
+    utf8 = {'encoding': 'utf-8'}  # do not depend on the locale's default encoding
+    with (TESSERACT_PATH / 'list.txt').open('rt', **utf8) as tf, \
+            (GOOGLE_VISION_AI_PATH / 'list.txt').open('rt', **utf8) as gf:
+        tesseract_stems = [Path(line.rstrip('\r\n')).stem for line in tf]
+        google_stems = [Path(line.rstrip('\r\n')).stem for line in gf]
+    if len(tesseract_stems) != len(google_stems):
+        # zip() below would silently drop the unpaired tail of the longer list.
+        raise ValueError('Tesseract and Google list.txt differ in length')
+
+    for tesseract_stem, google_stem in zip(tesseract_stems, google_stems):
+        # Build names with f-strings: Path.with_suffix() would replace the last
+        # dotted part of a stem such as `scan.001` instead of appending to it.
+        hocr_filename = TESSERACT_PATH / f'{tesseract_stem}.hocr'
+        output_filename = OUTPUT_PATH / f'{google_stem}.txt'
+        decision_filename = OUTPUT_PATH / f'{google_stem}.ground-truth'
+
+        # Keep the raw OCR artifacts next to the combined text.
+        for suffix in ('hocr', 'tsv', 'box', 'page'):
+            shutil.copy(TESSERACT_PATH / f'{tesseract_stem}.{suffix}',
+                        OUTPUT_PATH / f'{google_stem}.{suffix}')
+        shutil.copy(GOOGLE_VISION_AI_PATH / f'{google_stem}.json',
+                    OUTPUT_PATH / f'{google_stem}.json')
+
+        if not hocr_filename.read_text(**utf8).strip():
+            output_filename.write_text('\n', **utf8)  # blank page, no decision
+            continue
+
+        if is_multicolumn(hocr_filename):
+            shutil.copy(TESSERACT_PATH / f'{tesseract_stem}.txt', output_filename)
+            decision_filename.write_text('hocr\n', **utf8)
+        else:
+            with (GOOGLE_VISION_AI_PATH / f'{google_stem}.json').open('rt', **utf8) as rf:
+                output_filename.write_text(read_google_json(rf) + '\n', **utf8)
+            decision_filename.write_text('json\n', **utf8)
+
+
+if __name__ == '__main__':  # run the combination only when executed as a script
+    main()
-- 
GitLab