From 3df50968ddb769f701c7c2191c223422ed90b981 Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Thu, 11 Nov 2021 15:39:15 +0100
Subject: [PATCH] Add support for annotated HOCR files

---
 scripts/common.py | 49 +++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/scripts/common.py b/scripts/common.py
index 9aafa36e..da3111ab 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -813,24 +813,41 @@ def _read_page_languages_hocr(f, algorithm='NLDA', **kwargs):
     def get_confidence(element):
         return float(len(get_element_text(element)))
 
-    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word')
-    for paragraph in xml_document.xpath('//p[@lang]'):
-        paragraph_language_code = paragraph.attrib['lang']
-        paragraph_confidence = get_confidence(paragraph)
-        if algorithm == 'OLDA' or algorithm == 'paragraph':
-            languages[paragraph_language_code] += paragraph_confidence
-        elif algorithm == 'NLDA' or algorithm == 'word':
-            if not paragraph_confidence:
+    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word', 'annotated')
+    if algorithm == 'annotated':
+        # The annotated HOCR files produced by
+        # https://gitlab.fi.muni.cz/nlp/ahisto-language-detection do not have
+        # paragraph-level language annotations, so we will only consider
+        # word-level annotations.
+        for word in xml_document.xpath('//span[@class="ocrx_word" and @lang]'):
+            word_language_code = word.attrib['lang']
+            word_confidence = get_confidence(word)
+            if not word_confidence:
                 continue
-            for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
-                word_language_code = word.attrib['lang']
-                word_confidence = get_confidence(word)
-                if not word_confidence:
+            languages[word_language_code] += word_confidence
+    else:
+        for paragraph in xml_document.xpath('//p[@lang]'):
+            paragraph_language_code = paragraph.attrib['lang']
+            paragraph_confidence = get_confidence(paragraph)
+            if algorithm == 'OLDA' or algorithm == 'paragraph':
+                # With the old paragraph-only algorithm, we only take
+                # paragraph-level language annotations into account.
+                languages[paragraph_language_code] += paragraph_confidence
+            elif algorithm == 'NLDA' or algorithm == 'word':
+                # With the new paragraph-and-word algorithm, each annotated
+                # word counts towards its own language, and only the
+                # remaining paragraph text counts towards the paragraph
+                if not paragraph_confidence:
                     continue
-                languages[word_language_code] += word_confidence
-                paragraph_confidence -= word_confidence
-            assert paragraph_confidence >= 0.0
-            languages[paragraph_language_code] += paragraph_confidence
+                for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
+                    word_language_code = word.attrib['lang']
+                    word_confidence = get_confidence(word)
+                    if not word_confidence:
+                        continue
+                    languages[word_language_code] += word_confidence
+                    paragraph_confidence -= word_confidence
+                assert paragraph_confidence >= 0.0
+                languages[paragraph_language_code] += paragraph_confidence
     return languages
 
 
-- 
GitLab
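
For reference, a minimal usage sketch of the new mode (not part of the
patch; the module path, file mode, and sample file name are assumptions
based on the function signature and semantics shown above):

    from scripts.common import _read_page_languages_hocr  # path assumed

    # With algorithm='annotated', only word-level lang attributes on
    # <span class="ocrx_word"> elements contribute, each weighted by the
    # character length of the word's text.
    with open('page.hocr', 'rb') as f:  # binary mode assumed
        languages = _read_page_languages_hocr(f, algorithm='annotated')

    # languages maps language codes to total annotated text length;
    # pick the code with the largest total.
    best_language = max(languages, key=languages.get)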