From d721be26eeead81710573cb793ef1e30e5ca82f4 Mon Sep 17 00:00:00 2001 From: Vit Novotny <witiko@mail.muni.cz> Date: Fri, 11 Jun 2021 18:44:38 +0200 Subject: [PATCH] Guard against zero probabilities of languages --- scripts/common.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scripts/common.py b/scripts/common.py index 9d4472d7..e9e4be00 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -645,6 +645,7 @@ def l1_normalize(dictionary): key: float(value) / value_sum for key, value in dictionary.items() + if value > 0.0 } @@ -701,9 +702,13 @@ def _read_page_languages_hocr(f): for paragraph in xml_document.xpath('//p[@lang]'): paragraph_language_code = paragraph.attrib['lang'] paragraph_confidence = get_confidence(paragraph) + if not paragraph_confidence: + continue for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'): word_language_code = word.attrib['lang'] word_confidence = get_confidence(word) + if not word_confidence: + continue languages[word_language_code] += word_confidence paragraph_confidence -= word_confidence assert paragraph_confidence >= 0.0 -- GitLab