Commit 4316ca30 authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files

Add code for sampling relevant pages

parent da99d036
Loading
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -822,6 +822,17 @@ def _read_page_languages_hocr(f):
    return languages


def iterate_lines_hocr(f):
    """Yield ``(language_code, words)`` for every OCR line in an hOCR file.

    Parses the hOCR document in *f* with a lenient (huge-tree) HTML parser,
    visits each paragraph carrying a ``lang`` attribute, and yields one tuple
    per ``ocr_line`` span inside it.  ``words`` is a tuple containing the text
    of the line's ``ocrx_word`` spans; the language code is normalized with
    ``normalize_language_code`` first.
    """
    lenient_parser = etree.HTMLParser(huge_tree=True)
    document = etree.parse(f, lenient_parser)
    for paragraph in document.xpath('//p[@lang]'):
        normalized_code = normalize_language_code(paragraph.attrib['lang'])
        for ocr_line in paragraph.xpath('span[@class="ocr_line"]'):
            word_texts = tuple(
                word.text
                for word in ocr_line.xpath('span[@class="ocrx_word"]')
            )
            yield (normalized_code, word_texts)


def get_max_key(dictionary):
    if not dictionary:
        return None
+80 −0
Original line number Diff line number Diff line
# -*- coding:utf-8 -*-

import logging


LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'


if __name__ == '__main__':
    # Configure the root logger before the imports below run, so that any
    # log records emitted at import time already use LOGGING_FORMAT.
    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)


import json
from pathlib import Path
import random
import shutil
import sys

from .common import read_facts, iterate_lines_hocr, image_filename_to_book


# Positional command-line arguments, parsed at import time.
INPUT_FILENAME = Path(sys.argv[1])  # JSON file listing (book_id, page_ids) of relevant pages
INPUT_ROOT = Path(sys.argv[2])  # directory containing the input .hocr files
OUTPUT_ROOT_INPUT = Path(sys.argv[3])  # destination directory for copied .hocr files
OUTPUT_ROOT_OUTPUT = Path(sys.argv[4])  # destination directory for generated .md files
SAMPLE_SIZE = int(sys.argv[5])  # number of pages to sample
INPUT_UPSCALED_FILENAMES = sys.argv[6]  # facts file mapping output filenames to input basenames (presumably; verify against read_facts)

# Fixed seed so the random sample is reproducible between runs.
SEED = 21


def fence(text):
    """Wrap *text* in a backtick fence long enough not to clash with it.

    The fence starts at a single backtick and grows until a run of that many
    consecutive backticks no longer occurs anywhere inside *text*.
    """
    length = 1
    while '`' * length in text:
        length += 1
    ticks = '`' * length
    return f'{ticks}{text}{ticks}'


def main(seed=SEED, sample_size=SAMPLE_SIZE):
    """Sample *sample_size* relevant pages and export them as hOCR + Markdown.

    Reads the facts file to learn which pages have input files, intersects
    that with the relevant pages listed in INPUT_FILENAME, draws a seeded
    random sample, and for each sampled page copies its .hocr file to
    OUTPUT_ROOT_INPUT and writes a Markdown rendition of its OCR lines to
    OUTPUT_ROOT_OUTPUT.

    :param seed: Random seed used for the sample (default SEED).
    :param sample_size: Number of pages to draw (default SAMPLE_SIZE).
    """
    logger = logging.getLogger('main')

    # Map (book_id, page_id) -> basename of the corresponding input file.
    known_pages = {
        image_filename_to_book(output_filename): input_basename
        for output_filename, input_basename
        in read_facts(INPUT_UPSCALED_FILENAMES)
    }

    with INPUT_FILENAME.open('rt') as f:
        # Keep only the relevant pages we actually have input files for.
        # Fix: intersect with the dict's key view — `set & dict` raises
        # TypeError because a dict is not a set.
        relevant_pages = {
            (book_id, page_id)
            for book_id, page_ids in json.load(f)
            for page_id in page_ids
        } & known_pages.keys()

    random.seed(seed)
    # Fix: sort before sampling — random.sample() rejects sets since
    # Python 3.11, and sampling from an unordered set would not be
    # reproducible for a fixed seed anyway.
    sample = random.sample(sorted(relevant_pages), sample_size)

    OUTPUT_ROOT_INPUT.mkdir(exist_ok=True)
    OUTPUT_ROOT_OUTPUT.mkdir(exist_ok=True)
    for book_id, page_id in sample:
        input_basename = known_pages[book_id, page_id]
        input_filename = (INPUT_ROOT / input_basename).with_suffix('.hocr')
        output_basename = f'{book_id}-{page_id}'
        output_hocr_filename = (OUTPUT_ROOT_INPUT / output_basename).with_suffix('.hocr')
        output_md_filename = (OUTPUT_ROOT_OUTPUT / output_basename).with_suffix('.md')
        shutil.copy(str(input_filename), str(output_hocr_filename))
        with output_md_filename.open('wt') as wf:
            with input_filename.open('rb') as rf:
                for language_code, words in iterate_lines_hocr(rf):
                    # One fenced line of words, optionally tagged with its
                    # language code in braces (Pandoc-style attribute).
                    line = fence(' '.join(words))
                    if language_code is not None:
                        line = f'{line}{{{language_code}}}'
                    print(line, file=wf)

    message = 'Successfully produced a sample of {} relevant pages from {} to {} and to {}'
    logger.info(message.format(sample_size, INPUT_ROOT, OUTPUT_ROOT_INPUT, OUTPUT_ROOT_OUTPUT))


if __name__ == '__main__':
    # Script entry point; relies on the module-level sys.argv parsing above.
    main()