Commit b5188a20 authored by Vít Starý Novotný
Browse files

Finish ocr.combine_results()

parent 4deab09f
Loading
Loading
Loading
Loading
Loading
+40 −26
Original line number Diff line number Diff line
@@ -4,6 +4,7 @@ from pathlib import Path
from .config import CONFIG as _CONFIG
from .util import (
    create_temporary_docker_volume,
    create_temporary_docker_container,
    run_docker_container,
    extract_text_file_from_container,
    add_text_file_to_container,
@@ -151,13 +152,21 @@ def combine_results(client, tesseract_volume, google_vision_ai_volume, output_vo
        from scripts.common import is_multicolumn


        with open('/input/list.txt', 'rt') as rf, open('/output/detected-layout.txt', 'wt') as wf:
            for line in rf:
        with open('/input/list.txt', 'rt') as f:
            lines = list(f)

        for line in lines:
            filename = Path(line.rstrip('\r\n'))
                input_tesseract_filename = Path('/input') / f'{filename.stem}.txt'
                input_google_filename = Path('/google-vision-ai') / f'{filename.stem}.txt'
                output_filename = Path('/output') / f'{filename.stem}.txt'
                decision_filename = Path('/output') / f'{filename.stem}.ground-truth'
            input_tesseract_filename = (Path('/input') / filename.stem).with_suffix('.txt')
            input_google_filename = (Path('/google-vision-ai') / filename.stem).with_suffix('.txt')
            output_filename = (Path('/output') / filename.stem).with_suffix('.txt')
            decision_filename = (Path('/output') / filename.stem).with_suffix('.ground-truth')

            for suffix in ('hocr', 'txt', 'tsv', 'box', 'page'):
                shutil.copy(
                    (Path('/input') / filename.stem).with_suffix(f'.{suffix}'),
                    (Path('/output') / filename.stem).with_suffix(f'.{suffix}'),
                )

            with input_tesseract_filename.with_suffix('.hocr').open('rt') as f:
                hocr_content = f.read()
@@ -180,3 +189,8 @@ def combine_results(client, tesseract_volume, google_vision_ai_volume, output_vo
        """,
    ]
    run_docker_container(client, 'ahisto/ocr-eval', command=command, volumes=volumes)

    with create_temporary_docker_container(client, 'ahisto/empty', command='cmd',
                                           volumes=volumes) as container:
        text = extract_text_file_from_container(container, '/input/list.txt')
        add_text_file_to_container(container, '/output/list.txt', text)
+1 −1
Original line number Diff line number Diff line
from typing import List
from typing import List, Optional
from pathlib import Path
from logging import getLogger
from tempfile import TemporaryFile