Loading ahisto_ocr/cli.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -85,10 +85,10 @@ def run_ocr(input_images: List[Path], output_dir: Path, google_vision_ai: bool, run_tesseract(client, postprocessing_volume, tesseract_volume) run_tesseract(client, postprocessing_volume, tesseract_volume) if google_vision_ai: if google_vision_ai: LOGGER.info('Skipping Google Vision AI') run_google_vision_ai(client, postprocessing_volume, google_vision_ai_volume) run_google_vision_ai(client, postprocessing_volume, google_vision_ai_volume) combine_results(client, tesseract_volume, google_vision_ai_volume, output_volume) combine_results(client, tesseract_volume, google_vision_ai_volume, output_volume) else: else: LOGGER.info('Skipping Google Vision AI') output_volume = tesseract_volume output_volume = tesseract_volume copy_output_from(client, output_volume, input_images, output_dir) copy_output_from(client, output_volume, input_images, output_dir) ahisto_ocr/ocr.py +1 −2 Original line number Original line Diff line number Diff line from logging import getLogger from logging import getLogger from pathlib import Path from .config import CONFIG as _CONFIG from .config import CONFIG as _CONFIG from .util import ( from .util import ( Loading Loading @@ -103,7 +102,7 @@ def run_tesseract_second_pass(client, preprocessing_volume, first_pass_volume, run_docker_container(client, 'ahisto/ocr-fileformat', command=command, volumes=volumes) run_docker_container(client, 'ahisto/ocr-fileformat', command=command, volumes=volumes) def run_google_vision_ai(client, postprocessing_volume, google_vision_ai_volume) -> None: def run_google_vision_ai(client, preprocessing_volume, google_vision_ai_volume) -> None: LOGGER.info('Running Google Vision AI') LOGGER.info('Running Google Vision AI') volumes = { volumes = { Loading ahisto_ocr/util.py +3 −1 Original line number Original line Diff line number Diff line Loading @@ -9,6 +9,7 @@ from contextlib import closing from io import BytesIO from io import BytesIO import os import os import sys import sys from pathlib import Path from docker.types import DeviceRequest from docker.types import DeviceRequest from docker.errors import ContainerError from docker.errors import ContainerError Loading Loading @@ -77,5 +78,6 @@ def extract_text_file_from_container(container, filename: str) -> str: for chunk in bits: for chunk in bits: tf.write(chunk) tf.write(chunk) tf.seek(0) tf.seek(0) text = tf.read().decode('utf-8') with tarfile.open(fileobj=tf, mode='r') as tar: text = tar.extractfile(Path(filename).name).read().decode('utf-8') return text return text ahisto_ocr/volume.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -4,7 +4,7 @@ from logging import getLogger from tempfile import TemporaryFile from tempfile import TemporaryFile import tarfile import tarfile from .util import create_temporary_docker_container, add_text_file_to_tarfile from .util import create_temporary_docker_container, add_text_file_to_tarfile, extract_text_file_from_container LOGGER = getLogger(__name__) LOGGER = getLogger(__name__) Loading Loading
ahisto_ocr/cli.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -85,10 +85,10 @@ def run_ocr(input_images: List[Path], output_dir: Path, google_vision_ai: bool, run_tesseract(client, postprocessing_volume, tesseract_volume) run_tesseract(client, postprocessing_volume, tesseract_volume) if google_vision_ai: if google_vision_ai: LOGGER.info('Skipping Google Vision AI') run_google_vision_ai(client, postprocessing_volume, google_vision_ai_volume) run_google_vision_ai(client, postprocessing_volume, google_vision_ai_volume) combine_results(client, tesseract_volume, google_vision_ai_volume, output_volume) combine_results(client, tesseract_volume, google_vision_ai_volume, output_volume) else: else: LOGGER.info('Skipping Google Vision AI') output_volume = tesseract_volume output_volume = tesseract_volume copy_output_from(client, output_volume, input_images, output_dir) copy_output_from(client, output_volume, input_images, output_dir)
ahisto_ocr/ocr.py +1 −2 Original line number Original line Diff line number Diff line from logging import getLogger from logging import getLogger from pathlib import Path from .config import CONFIG as _CONFIG from .config import CONFIG as _CONFIG from .util import ( from .util import ( Loading Loading @@ -103,7 +102,7 @@ def run_tesseract_second_pass(client, preprocessing_volume, first_pass_volume, run_docker_container(client, 'ahisto/ocr-fileformat', command=command, volumes=volumes) run_docker_container(client, 'ahisto/ocr-fileformat', command=command, volumes=volumes) def run_google_vision_ai(client, postprocessing_volume, google_vision_ai_volume) -> None: def run_google_vision_ai(client, preprocessing_volume, google_vision_ai_volume) -> None: LOGGER.info('Running Google Vision AI') LOGGER.info('Running Google Vision AI') volumes = { volumes = { Loading
ahisto_ocr/util.py +3 −1 Original line number Original line Diff line number Diff line Loading @@ -9,6 +9,7 @@ from contextlib import closing from io import BytesIO from io import BytesIO import os import os import sys import sys from pathlib import Path from docker.types import DeviceRequest from docker.types import DeviceRequest from docker.errors import ContainerError from docker.errors import ContainerError Loading Loading @@ -77,5 +78,6 @@ def extract_text_file_from_container(container, filename: str) -> str: for chunk in bits: for chunk in bits: tf.write(chunk) tf.write(chunk) tf.seek(0) tf.seek(0) text = tf.read().decode('utf-8') with tarfile.open(fileobj=tf, mode='r') as tar: text = tar.extractfile(Path(filename).name).read().decode('utf-8') return text return text
ahisto_ocr/volume.py +1 −1 Original line number Original line Diff line number Diff line Loading @@ -4,7 +4,7 @@ from logging import getLogger from tempfile import TemporaryFile from tempfile import TemporaryFile import tarfile import tarfile from .util import create_temporary_docker_container, add_text_file_to_tarfile from .util import create_temporary_docker_container, add_text_file_to_tarfile, extract_text_file_from_container LOGGER = getLogger(__name__) LOGGER = getLogger(__name__) Loading