Skip to content
Snippets Groups Projects
Commit 40bf45c1 authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files

Add scripts.combine_tesseract_with_google_docker

parent 07f1f065
No related branches found
No related tags found
No related merge requests found
Pipeline #
import json
from json import JSONDecodeError
from pathlib import Path
import shutil
import sys
from scripts.common import is_multicolumn
# Command-line configuration, read once at import time:
#   argv[1] -- directory with the Tesseract outputs and their 'list.txt'
#   argv[2] -- directory with the Google Vision AI outputs and their 'list.txt'
#   argv[3] -- directory that receives the combined outputs
# Fewer than three arguments raises IndexError as soon as the module is loaded.
TESSERACT_PATH = Path(sys.argv[1])
GOOGLE_VISION_AI_PATH = Path(sys.argv[2])
OUTPUT_PATH = Path(sys.argv[3])
def read_google_json(filelike) -> str:
    """Extract the full recognized text from a Google Vision AI JSON output.

    Args:
        filelike: A readable text stream containing the JSON document.

    Returns:
        The value of ``responses[0].fullTextAnnotation.text``. An empty string
        is returned when the stream is not valid JSON, when there is no
        (non-empty) first response, or when that response carries no full-text
        annotation (for example a response that only holds an ``error``).
    """
    try:
        ocr_output = json.load(filelike)
    except JSONDecodeError:
        return ''

    # Walk the expected structure defensively: any missing or mistyped level
    # is treated the same way as an empty response instead of raising
    # KeyError / IndexError / TypeError.
    if not isinstance(ocr_output, dict):
        return ''
    responses = ocr_output.get('responses')
    if not isinstance(responses, list) or not responses:
        return ''
    first_response = responses[0]
    if not isinstance(first_response, dict):
        return ''
    annotation = first_response.get('fullTextAnnotation')
    if not isinstance(annotation, dict):
        return ''
    text = annotation.get('text')
    return text if isinstance(text, str) else ''
def main() -> None:
    """Combine per-page Tesseract and Google Vision AI outputs in OUTPUT_PATH.

    The two ``list.txt`` files (one in TESSERACT_PATH, one in
    GOOGLE_VISION_AI_PATH) are read line by line and paired positionally.
    For every pair:

    * the Tesseract ``.hocr``, ``.tsv``, ``.box`` and ``.page`` files and the
      Google ``.json`` file are copied to OUTPUT_PATH under the Google stem;
    * if the Tesseract hOCR file is blank, a ``.txt`` holding a single newline
      is written and no ``.ground-truth`` file is produced;
    * otherwise, if ``is_multicolumn`` accepts the hOCR file, the Tesseract
      ``.txt`` is copied as the result and ``hocr`` is recorded in the
      ``.ground-truth`` file; else the text extracted by
      ``read_google_json`` is written and ``json`` is recorded.

    OCR content (hOCR, Google JSON, and the text derived from it) is read and
    written as UTF-8 explicitly, so the result does not depend on the
    platform's default encoding.
    """
    # The list files are opened exactly as before (platform default encoding).
    with (TESSERACT_PATH / 'list.txt').open('rt') as tf, \
            (GOOGLE_VISION_AI_PATH / 'list.txt').open('rt') as gf:
        tesseract_lines = list(tf)
        google_lines = list(gf)

    # NOTE(review): zip() stops at the shorter list, so surplus entries in the
    # longer list.txt are silently ignored -- confirm that this is intended.
    for tesseract_line, google_line in zip(tesseract_lines, google_lines):
        tesseract_filename = Path(tesseract_line.rstrip('\r\n'))
        google_filename = Path(google_line.rstrip('\r\n'))

        # Extension-less base paths; every concrete file is derived from one
        # of these with with_suffix().
        tesseract_base = TESSERACT_PATH / tesseract_filename.stem
        google_base = GOOGLE_VISION_AI_PATH / google_filename.stem
        output_base = OUTPUT_PATH / google_filename.stem

        input_tesseract_filename = tesseract_base.with_suffix('.txt')
        input_hocr_filename = tesseract_base.with_suffix('.hocr')
        input_google_filename = google_base.with_suffix('.json')
        output_filename = output_base.with_suffix('.txt')
        decision_filename = output_base.with_suffix('.ground-truth')

        # Carry the auxiliary files of both engines over to the output
        # directory, all named after the Google stem.
        for suffix in ('hocr', 'tsv', 'box', 'page'):
            shutil.copy(
                tesseract_base.with_suffix(f'.{suffix}'),
                output_base.with_suffix(f'.{suffix}'),
            )
        for suffix in ('json', ):
            shutil.copy(
                google_base.with_suffix(f'.{suffix}'),
                output_base.with_suffix(f'.{suffix}'),
            )

        with input_hocr_filename.open('rt', encoding='utf-8') as f:
            hocr_content = f.read()

        # Blank hOCR: emit a text file with just a newline and skip the
        # engine decision (no .ground-truth file is written for this page).
        if not hocr_content.strip():
            with output_filename.open('wt') as f:
                print(file=f)
            continue

        if is_multicolumn(input_hocr_filename):
            # Tesseract's text is taken verbatim (byte copy).
            shutil.copy(input_tesseract_filename, output_filename)
            decision = 'hocr'
        else:
            with input_google_filename.open('rt', encoding='utf-8') as rf, \
                    output_filename.open('wt', encoding='utf-8') as wf:
                print(read_google_json(rf), file=wf)
            decision = 'json'

        # Record which engine's text was chosen for this page.
        with decision_filename.open('wt') as f:
            print(decision, file=f)
if __name__ == '__main__':
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment