diff --git a/scripts/common.py b/scripts/common.py index 9d4472d701592d6b699af49cd57dff60d428d7e9..e9e4be00b1da9c25b85ea26e9b402e4fbeebda1b 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -645,6 +645,7 @@ def l1_normalize(dictionary): key: float(value) / value_sum for key, value in dictionary.items() + if value > 0.0 } @@ -701,9 +702,13 @@ def _read_page_languages_hocr(f): for paragraph in xml_document.xpath('//p[@lang]'): paragraph_language_code = paragraph.attrib['lang'] paragraph_confidence = get_confidence(paragraph) + if not paragraph_confidence: + continue for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'): word_language_code = word.attrib['lang'] word_confidence = get_confidence(word) + if not word_confidence: + continue languages[word_language_code] += word_confidence paragraph_confidence -= word_confidence assert paragraph_confidence >= 0.0