From c9796cbbabe142fc6dcbe1d337f78a0a71b14fbe Mon Sep 17 00:00:00 2001 From: Vit Novotny <witiko@mail.muni.cz> Date: Fri, 11 Jun 2021 18:38:34 +0200 Subject: [PATCH] Take word-level language annotations in HOCR output into account (cont.) --- scripts/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/common.py b/scripts/common.py index b90710a7..9d4472d7 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -696,16 +696,17 @@ def _read_page_languages_hocr(f): languages = defaultdict(lambda: 0.0) def get_confidence(element): - return float(len(''.join(element.itertext()))) + return float(len(''.join(element.itertext()).strip())) for paragraph in xml_document.xpath('//p[@lang]'): paragraph_language_code = paragraph.attrib['lang'] paragraph_confidence = get_confidence(paragraph) - for word in paragraph.xpath('//span[@class="ocrx_word" and @lang]'): + for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'): word_language_code = word.attrib['lang'] word_confidence = get_confidence(word) languages[word_language_code] += word_confidence paragraph_confidence -= word_confidence + assert paragraph_confidence >= 0.0 languages[paragraph_language_code] += paragraph_confidence return languages -- GitLab