From d721be26eeead81710573cb793ef1e30e5ca82f4 Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Fri, 11 Jun 2021 18:44:38 +0200
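Subject: [PATCH] Guard against zero probabilities of languages

Keep only entries with a positive value in the dictionary returned by
l1_normalize, and make _read_page_languages_hocr skip paragraphs and
words whose confidence is falsy (for example zero) before their
confidence is added to the per-language totals.
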
---
 scripts/common.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scripts/common.py b/scripts/common.py
index 9d4472d7..e9e4be00 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -645,6 +645,7 @@ def l1_normalize(dictionary):
         key: float(value) / value_sum
         for key, value
         in dictionary.items()
+        if value > 0.0
     }
 
 
@@ -701,9 +702,13 @@ def _read_page_languages_hocr(f):
     for paragraph in xml_document.xpath('//p[@lang]'):
         paragraph_language_code = paragraph.attrib['lang']
         paragraph_confidence = get_confidence(paragraph)
+        if not paragraph_confidence:
+            continue
         for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
             word_language_code = word.attrib['lang']
             word_confidence = get_confidence(word)
+            if not word_confidence:
+                continue
             languages[word_language_code] += word_confidence
             paragraph_confidence -= word_confidence
         assert paragraph_confidence >= 0.0
-- 
GitLab