diff --git a/scripts/common.py b/scripts/common.py
index 2bd3c1fe431215686152050eef5800edb7d106aa..e1feb4ffeb0be5b59379bfc0d084f23464deb1f6 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -750,7 +750,7 @@ def l1_normalize(dictionary):
     }


-def read_page_languages(basename, limit=None):
+def read_page_languages(basename, limit=None, **kwargs):
     readers = {
         '.hocr': _read_page_languages_hocr,
         '.json': _read_page_languages_json,
@@ -760,7 +760,7 @@ def read_page_languages(basename, limit=None):
             f = Path('{}{}'.format(basename, suffix)).open('rt')
         except IOError:
             continue
-        languages = reader(f)
+        languages = reader(f, **kwargs)
         f.close()
         languages = normalize_language_codes(languages)
         if limit is not None:
@@ -770,7 +770,7 @@ def read_page_languages(basename, limit=None):
     raise IOError('Found no file with basename {} containing detected languages'.format(basename))


-def _read_page_languages_json(f):
+def _read_page_languages_json(f, **kwargs):
     try:
         document = json.load(f)
     except json.decoder.JSONDecodeError:
@@ -792,16 +792,36 @@ def _read_page_languages_json(f):
     return languages


-def _read_page_languages_hocr(f):
+def _read_page_languages_hocr(f, algorithm='NLDA', **kwargs):
+    content = f.read()
+    if not content.strip():  # the file is empty
+        return dict()
     html5_parser = etree.HTMLParser(huge_tree=True)
-    xml_document = etree.parse(f, html5_parser)
-    languages = dict()
+    xml_document = etree.fromstring(content.encode('utf-8'), html5_parser)
+
+    languages = defaultdict(lambda: 0.0)
+
+    def get_confidence(element):
+        return float(len(''.join(element.itertext()).strip()))
+
+    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word')
     for paragraph in xml_document.xpath('//p[@lang]'):
-        language_code = paragraph.attrib['lang']
-        confidence = float(len(''.join(paragraph.itertext())))
-        if language_code not in languages:
-            languages[language_code] = 0.0
-        languages[language_code] += confidence
+        paragraph_language_code = paragraph.attrib['lang']
+        paragraph_confidence = get_confidence(paragraph)
+        if algorithm == 'OLDA' or algorithm == 'paragraph':
+            languages[paragraph_language_code] += paragraph_confidence
+        elif algorithm == 'NLDA' or algorithm == 'word':
+            if not paragraph_confidence:
+                continue
+            for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
+                word_language_code = word.attrib['lang']
+                word_confidence = get_confidence(word)
+                if not word_confidence:
+                    continue
+                languages[word_language_code] += word_confidence
+                paragraph_confidence -= word_confidence
+            assert paragraph_confidence >= 0.0
+            languages[paragraph_language_code] += paragraph_confidence
     return languages


diff --git a/scripts/extract_detected_languages.py b/scripts/extract_detected_languages.py
index cd8e20683334de040ccdfc5aab9a78fa4375f6e5..8becd4b587585892068714aa6bf7cfdd02317481 100644
--- a/scripts/extract_detected_languages.py
+++ b/scripts/extract_detected_languages.py
@@ -29,7 +29,7 @@ THRESHOLD = float(sys.argv[5]) / 100.0
 def get_languages_worker(filename):
     basename = str(INPUT_OCR_ROOT / filename.parent / filename.stem)
     try:
-        languages = read_page_languages(basename, DETECTED_LANGUAGES)
+        languages = read_page_languages(basename, DETECTED_LANGUAGES, algorithm='OLDA')
     except IOError:
         return 'not-exists'
     return (filename, languages)
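
For context, a minimal sketch of how the two detection modes introduced by this patch might be exercised. Only `read_page_languages` and the new `algorithm` keyword come from the diff; the basename, limit value, and the assumption that `scripts/` is on the import path are hypothetical.

```python
# Hypothetical usage sketch; paths and limit are illustrative, not from the patch.
from common import read_page_languages  # assumes scripts/ is on the import path

# Default NLDA / 'word' mode: word-level lang spans are credited to their own
# languages, and only the leftover paragraph text counts toward the paragraph lang.
nlda_languages = read_page_languages('ocr-output/page-0001', limit=2)

# OLDA / 'paragraph' mode, as extract_detected_languages.py now requests:
# the whole paragraph's text length is credited to the paragraph-level lang.
olda_languages = read_page_languages('ocr-output/page-0001', limit=2, algorithm='OLDA')
```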