From 3df50968ddb769f701c7c2191c223422ed90b981 Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Thu, 11 Nov 2021 15:39:15 +0100
Subject: [PATCH] Add support for annotated HOCR files

---
 scripts/common.py | 49 +++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/scripts/common.py b/scripts/common.py
index 9aafa36e..da3111ab 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -813,24 +813,41 @@ def _read_page_languages_hocr(f, algorithm='NLDA', **kwargs):
     def get_confidence(element):
         return float(len(get_element_text(element)))

-    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word')
-    for paragraph in xml_document.xpath('//p[@lang]'):
-        paragraph_language_code = paragraph.attrib['lang']
-        paragraph_confidence = get_confidence(paragraph)
-        if algorithm == 'OLDA' or algorithm == 'paragraph':
-            languages[paragraph_language_code] += paragraph_confidence
-        elif algorithm == 'NLDA' or algorithm == 'word':
-            if not paragraph_confidence:
+    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word', 'annotated')
+    if algorithm == 'annotated':
+        # The annotated HOCR files produced by
+        # https://gitlab.fi.muni.cz/nlp/ahisto-language-detection do not have
+        # paragraph-level language annotations, so we will only consider
+        # word-level annotations.
+        for word in xml_document.xpath('//span[@class="ocrx_word" and @lang]'):
+            word_language_code = word.attrib['lang']
+            word_confidence = get_confidence(word)
+            if not word_confidence:
                 continue
-            for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
-                word_language_code = word.attrib['lang']
-                word_confidence = get_confidence(word)
-                if not word_confidence:
+            languages[word_language_code] += word_confidence
+    else:
+        for paragraph in xml_document.xpath('//p[@lang]'):
+            paragraph_language_code = paragraph.attrib['lang']
+            paragraph_confidence = get_confidence(paragraph)
+            if algorithm == 'OLDA' or algorithm == 'paragraph':
+                # With the old paragraph-only algorithm, we only take
+                # paragraph-level language annotations into account.
+                languages[paragraph_language_code] += paragraph_confidence
+            elif algorithm == 'NLDA' or algorithm == 'word':
+                # With the new paragraph-and-word algorithm, we take
+                # both paragraph-level and word-level language annotations
+                # into account.
+                if not paragraph_confidence:
                     continue
-                languages[word_language_code] += word_confidence
-                paragraph_confidence -= word_confidence
-            assert paragraph_confidence >= 0.0
-            languages[paragraph_language_code] += paragraph_confidence
+                for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
+                    word_language_code = word.attrib['lang']
+                    word_confidence = get_confidence(word)
+                    if not word_confidence:
+                        continue
+                    languages[word_language_code] += word_confidence
+                    paragraph_confidence -= word_confidence
+                assert paragraph_confidence >= 0.0
+                languages[paragraph_language_code] += paragraph_confidence
     return languages


--
GitLab
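
Note: the following is not part of the patch. It is a minimal, self-contained sketch of what the new 'annotated' branch computes: it tallies, per language code, the total text length of all word-level annotations in an HOCR document. The sample HOCR snippet and the helper name read_page_languages_annotated are illustrative only, and lxml's text_content() stands in for the repository's get_element_text helper.

from collections import defaultdict

from lxml import html

HOCR = b"""
<html><body>
  <p class="ocr_par">
    <span class="ocrx_word" lang="lat">Anno</span>
    <span class="ocrx_word" lang="lat">Domini</span>
    <span class="ocrx_word" lang="ces">leta</span>
    <span class="ocrx_word" lang="ger"></span>
  </p>
</body></html>
"""

def read_page_languages_annotated(document):
    """Sum word-level confidences (text lengths) per language code."""
    languages = defaultdict(float)
    for word in document.xpath('//span[@class="ocrx_word" and @lang]'):
        # Confidence is the length of the word's text, as in get_confidence().
        confidence = float(len(word.text_content()))
        if not confidence:
            continue  # skip empty words, as the patched code does ('ger' here)
        languages[word.attrib['lang']] += confidence
    return languages

print(dict(read_page_languages_annotated(html.fromstring(HOCR))))
# {'lat': 10.0, 'ces': 4.0}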