Loading ahisto_ocr/ocr.py +40 −26 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from pathlib import Path from .config import CONFIG as _CONFIG from .util import ( create_temporary_docker_volume, create_temporary_docker_container, run_docker_container, extract_text_file_from_container, add_text_file_to_container, Loading Loading @@ -151,13 +152,21 @@ def combine_results(client, tesseract_volume, google_vision_ai_volume, output_vo from scripts.common import is_multicolumn with open('/input/list.txt', 'rt') as rf, open('/output/detected-layout.txt', 'wt') as wf: for line in rf: with open('/input/list.txt', 'rt') as f: lines = list(f) for line in lines: filename = Path(line.rstrip('\r\n')) input_tesseract_filename = Path('/input') / f'{filename.stem}.txt' input_google_filename = Path('/google-vision-ai') / f'{filename.stem}.txt' output_filename = Path('/output') / f'{filename.stem}.txt' decision_filename = Path('/output') / f'{filename.stem}.ground-truth' input_tesseract_filename = (Path('/input') / filename.stem).with_suffix('.txt') input_google_filename = (Path('/google-vision-ai') / filename.stem).with_suffix('.txt') output_filename = (Path('/output') / filename.stem).with_suffix('.txt') decision_filename = (Path('/output') / filename.stem).with_suffix('.ground-truth') for suffix in ('hocr', 'txt', 'tsv', 'box', 'page'): shutil.copy( (Path('/input') / filename.stem).with_suffix(f'.{suffix}'), (Path('/output') / filename.stem).with_suffix(f'.{suffix}'), ) with input_tesseract_filename.with_suffix('.hocr').open('rt') as f: hocr_content = f.read() Loading @@ -180,3 +189,8 @@ def combine_results(client, tesseract_volume, google_vision_ai_volume, output_vo """, ] run_docker_container(client, 'ahisto/ocr-eval', command=command, volumes=volumes) with create_temporary_docker_container(client, 'ahisto/empty', command='cmd', volumes=volumes) as container: text = extract_text_file_from_container(container, '/input/list.txt') add_text_file_to_container(container, '/output/list.txt', text) ahisto_ocr/volume.py +1 −1 Original line number Diff line number Diff line from typing import List from typing import List, Optional from pathlib import Path from logging import getLogger from tempfile import TemporaryFile Loading Loading
ahisto_ocr/ocr.py +40 −26 Original line number Diff line number Diff line Loading @@ -4,6 +4,7 @@ from pathlib import Path from .config import CONFIG as _CONFIG from .util import ( create_temporary_docker_volume, create_temporary_docker_container, run_docker_container, extract_text_file_from_container, add_text_file_to_container, Loading Loading @@ -151,13 +152,21 @@ def combine_results(client, tesseract_volume, google_vision_ai_volume, output_vo from scripts.common import is_multicolumn with open('/input/list.txt', 'rt') as rf, open('/output/detected-layout.txt', 'wt') as wf: for line in rf: with open('/input/list.txt', 'rt') as f: lines = list(f) for line in lines: filename = Path(line.rstrip('\r\n')) input_tesseract_filename = Path('/input') / f'{filename.stem}.txt' input_google_filename = Path('/google-vision-ai') / f'{filename.stem}.txt' output_filename = Path('/output') / f'{filename.stem}.txt' decision_filename = Path('/output') / f'{filename.stem}.ground-truth' input_tesseract_filename = (Path('/input') / filename.stem).with_suffix('.txt') input_google_filename = (Path('/google-vision-ai') / filename.stem).with_suffix('.txt') output_filename = (Path('/output') / filename.stem).with_suffix('.txt') decision_filename = (Path('/output') / filename.stem).with_suffix('.ground-truth') for suffix in ('hocr', 'txt', 'tsv', 'box', 'page'): shutil.copy( (Path('/input') / filename.stem).with_suffix(f'.{suffix}'), (Path('/output') / filename.stem).with_suffix(f'.{suffix}'), ) with input_tesseract_filename.with_suffix('.hocr').open('rt') as f: hocr_content = f.read() Loading @@ -180,3 +189,8 @@ def combine_results(client, tesseract_volume, google_vision_ai_volume, output_vo """, ] run_docker_container(client, 'ahisto/ocr-eval', command=command, volumes=volumes) with create_temporary_docker_container(client, 'ahisto/empty', command='cmd', volumes=volumes) as container: text = extract_text_file_from_container(container, '/input/list.txt') add_text_file_to_container(container, '/output/list.txt', text)
ahisto_ocr/volume.py +1 −1 Original line number Diff line number Diff line from typing import List from typing import List, Optional from pathlib import Path from logging import getLogger from tempfile import TemporaryFile Loading