def iterate_lines_hocr(f):
    """Yield ``(language_code, words)`` for every OCR line in an hOCR file.

    Parameters
    ----------
    f : file-like object (binary)
        An open hOCR (HTML) document, as produced by the OCR pipeline.

    Yields
    ------
    tuple[str, tuple]
        The normalized language code of the enclosing ``<p lang="...">``
        paragraph and a tuple with the text of each ``ocrx_word`` span on
        the line.  NOTE(review): ``word.text`` is ``None`` for empty spans —
        callers that join the words should confirm the input never has them.
    """
    html5_parser = etree.HTMLParser(huge_tree=True)
    xml_document = etree.parse(f, html5_parser)
    for paragraph in xml_document.xpath('//p[@lang]'):
        language_code = paragraph.attrib['lang']
        language_code = normalize_language_code(language_code)
        for line in paragraph.xpath('span[@class="ocr_line"]'):
            words = tuple(word.text for word in line.xpath('span[@class="ocrx_word"]'))
            yield (language_code, words)


# -*- coding:utf-8 -*-

import logging

LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)

import json
from pathlib import Path
import random
import shutil
import sys

from .common import read_facts, iterate_lines_hocr, image_filename_to_book

INPUT_FILENAME = Path(sys.argv[1])          # JSON list of [book_id, [page_id, ...]]
INPUT_ROOT = Path(sys.argv[2])              # directory with the input .hocr files
OUTPUT_ROOT_INPUT = Path(sys.argv[3])       # where the sampled .hocr copies go
OUTPUT_ROOT_OUTPUT = Path(sys.argv[4])      # where the rendered .md files go
SAMPLE_SIZE = int(sys.argv[5])
INPUT_UPSCALED_FILENAMES = sys.argv[6]      # facts file mapping output -> input basenames
SEED = 21


def fence(text):
    """Wrap *text* in the shortest run of backticks not contained in it.

    This mirrors Markdown inline-code fencing: the separator must be a
    backtick run strictly longer than any run inside the text.
    """
    separator_length = 1
    while ('`' * separator_length) in text:
        separator_length += 1
    separator = '`' * separator_length
    return separator + text + separator


def main(seed=SEED, sample_size=SAMPLE_SIZE):
    """Sample `sample_size` relevant pages and emit .hocr copies and .md renditions."""
    logger = logging.getLogger('main')
    # Map (book_id, page_id) -> input basename for every page we have hOCR for.
    known_pages = {
        image_filename_to_book(output_filename): input_basename
        for output_filename, input_basename in read_facts(INPUT_UPSCALED_FILENAMES)
    }
    with INPUT_FILENAME.open('rt') as f:
        # FIX: `set & dict` raises TypeError — intersect with the keys view instead.
        relevant_pages = {
            (book_id, page_id)
            for book_id, page_ids in json.load(f)
            for page_id in page_ids
        } & known_pages.keys()
    random.seed(seed)
    # FIX: random.sample() on a set is deprecated in 3.9 and removed in 3.11,
    # and set iteration order is hash-randomized, so the fixed seed did not
    # actually make the sample reproducible.  Sort first for determinism.
    sample = random.sample(sorted(relevant_pages), sample_size)
    OUTPUT_ROOT_INPUT.mkdir(exist_ok=True)
    OUTPUT_ROOT_OUTPUT.mkdir(exist_ok=True)
    for book_id, page_id in sample:
        input_basename = known_pages[book_id, page_id]
        input_filename = (INPUT_ROOT / input_basename).with_suffix('.hocr')
        output_basename = f'{book_id}-{page_id}'
        output_hocr_filename = (OUTPUT_ROOT_INPUT / output_basename).with_suffix('.hocr')
        output_md_filename = (OUTPUT_ROOT_OUTPUT / output_basename).with_suffix('.md')
        # Copy the raw hOCR next to a Markdown rendition of its text lines.
        shutil.copy(str(input_filename), str(output_hocr_filename))
        with output_md_filename.open('wt') as wf:
            with input_filename.open('rb') as rf:
                for language_code, words in iterate_lines_hocr(rf):
                    line = fence(' '.join(words))
                    if language_code is not None:
                        # Append a pandoc-style language attribute: `text`{code}
                        line = f'{line}{{{language_code}}}'
                    print(line, file=wf)
    message = 'Successfully produced a sample of {} relevant pages from {} to {} and to {}'
    logger.info(message.format(sample_size, INPUT_ROOT, OUTPUT_ROOT_INPUT, OUTPUT_ROOT_OUTPUT))


if __name__ == '__main__':
    main()
def iterate_lines_hocr(f):
    """Walk an hOCR document and yield one ``(language_code, words)`` pair per OCR line.

    The language code comes from the enclosing ``<p lang="...">`` paragraph,
    normalized via ``normalize_language_code``; ``words`` is a tuple of the
    ``ocrx_word`` span texts on the line.  NOTE(review): a span with no text
    yields ``None`` in the tuple — confirm callers tolerate that.
    """
    parser = etree.HTMLParser(huge_tree=True)
    document = etree.parse(f, parser)
    for para in document.xpath('//p[@lang]'):
        code = normalize_language_code(para.attrib['lang'])
        for ocr_line in para.xpath('span[@class="ocr_line"]'):
            word_spans = ocr_line.xpath('span[@class="ocrx_word"]')
            yield (code, tuple(span.text for span in word_spans))
# -*- coding:utf-8 -*-

import logging

LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)

import json
from pathlib import Path
import random
import shutil
import sys

from .common import read_facts, iterate_lines_hocr, image_filename_to_book

INPUT_FILENAME = Path(sys.argv[1])          # JSON list of [book_id, [page_id, ...]]
INPUT_ROOT = Path(sys.argv[2])              # directory with the input .hocr files
OUTPUT_ROOT_INPUT = Path(sys.argv[3])       # where the sampled .hocr copies go
OUTPUT_ROOT_OUTPUT = Path(sys.argv[4])      # where the rendered .md files go
SAMPLE_SIZE = int(sys.argv[5])
INPUT_UPSCALED_FILENAMES = sys.argv[6]      # facts file mapping output -> input basenames
SEED = 21


def fence(text):
    """Wrap *text* in the shortest run of backticks not contained in it.

    This mirrors Markdown inline-code fencing: the separator must be a
    backtick run strictly longer than any run inside the text.
    """
    separator_length = 1
    while ('`' * separator_length) in text:
        separator_length += 1
    separator = '`' * separator_length
    return separator + text + separator


def main(seed=SEED, sample_size=SAMPLE_SIZE):
    """Sample `sample_size` relevant pages and emit .hocr copies and .md renditions."""
    logger = logging.getLogger('main')
    # Map (book_id, page_id) -> input basename for every page we have hOCR for.
    known_pages = {
        image_filename_to_book(output_filename): input_basename
        for output_filename, input_basename in read_facts(INPUT_UPSCALED_FILENAMES)
    }
    with INPUT_FILENAME.open('rt') as f:
        # FIX: `set & dict` raises TypeError — intersect with the keys view instead.
        relevant_pages = {
            (book_id, page_id)
            for book_id, page_ids in json.load(f)
            for page_id in page_ids
        } & known_pages.keys()
    random.seed(seed)
    # FIX: random.sample() on a set is deprecated in 3.9 and removed in 3.11,
    # and set iteration order is hash-randomized, so the fixed seed did not
    # actually make the sample reproducible.  Sort first for determinism.
    sample = random.sample(sorted(relevant_pages), sample_size)
    OUTPUT_ROOT_INPUT.mkdir(exist_ok=True)
    OUTPUT_ROOT_OUTPUT.mkdir(exist_ok=True)
    for book_id, page_id in sample:
        input_basename = known_pages[book_id, page_id]
        input_filename = (INPUT_ROOT / input_basename).with_suffix('.hocr')
        output_basename = f'{book_id}-{page_id}'
        output_hocr_filename = (OUTPUT_ROOT_INPUT / output_basename).with_suffix('.hocr')
        output_md_filename = (OUTPUT_ROOT_OUTPUT / output_basename).with_suffix('.md')
        # Copy the raw hOCR next to a Markdown rendition of its text lines.
        shutil.copy(str(input_filename), str(output_hocr_filename))
        with output_md_filename.open('wt') as wf:
            with input_filename.open('rb') as rf:
                for language_code, words in iterate_lines_hocr(rf):
                    line = fence(' '.join(words))
                    if language_code is not None:
                        # Append a pandoc-style language attribute: `text`{code}
                        line = f'{line}{{{language_code}}}'
                    print(line, file=wf)
    message = 'Successfully produced a sample of {} relevant pages from {} to {} and to {}'
    logger.info(message.format(sample_size, INPUT_ROOT, OUTPUT_ROOT_INPUT, OUTPUT_ROOT_OUTPUT))


if __name__ == '__main__':
    main()