diff --git a/scripts/common.py b/scripts/common.py
index 2bd3c1fe431215686152050eef5800edb7d106aa..e1feb4ffeb0be5b59379bfc0d084f23464deb1f6 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -750,7 +750,7 @@ def l1_normalize(dictionary):
     }
 
 
-def read_page_languages(basename, limit=None):
+def read_page_languages(basename, limit=None, **kwargs):
     readers = {
         '.hocr': _read_page_languages_hocr,
         '.json': _read_page_languages_json,
@@ -760,7 +760,7 @@ def read_page_languages(basename, limit=None):
             f = Path('{}{}'.format(basename, suffix)).open('rt')
         except IOError:
             continue
-        languages = reader(f)
+        languages = reader(f, **kwargs)
         f.close()
         languages = normalize_language_codes(languages)
         if limit is not None:
@@ -770,7 +770,7 @@ def read_page_languages(basename, limit=None):
     raise IOError('Found no file with basename {} containing detected languages'.format(basename))
 
 
-def _read_page_languages_json(f):
+def _read_page_languages_json(f, **kwargs):
     try:
         document = json.load(f)
     except json.decoder.JSONDecodeError:
@@ -792,16 +792,40 @@ def _read_page_languages_json(f):
     return languages
 
 
-def _read_page_languages_hocr(f):
+def _read_page_languages_hocr(f, algorithm='NLDA', **kwargs):
+    content = f.read()
+    if not content.strip():  # the file is empty or whitespace-only
+        return dict()
     html5_parser = etree.HTMLParser(huge_tree=True)
-    xml_document = etree.parse(f, html5_parser)
-    languages = dict()
+    xml_document = etree.fromstring(content.encode('utf-8'), html5_parser)
+
+    languages = defaultdict(float)
+
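+    # Confidence is the length of an element's text, ignoring surrounding whitespace.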
+    def get_confidence(element):
+        return float(len(''.join(element.itertext()).strip()))
+
+    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word')
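+    # 'OLDA'/'paragraph' credit each paragraph's full text to its language;
+    # 'NLDA'/'word' credit word-level spans to their own languages first and
+    # only the remaining characters to the paragraph language.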
     for paragraph in xml_document.xpath('//p[@lang]'):
-        language_code = paragraph.attrib['lang']
-        confidence = float(len(''.join(paragraph.itertext())))
-        if language_code not in languages:
-            languages[language_code] = 0.0
-        languages[language_code] += confidence
+        paragraph_language_code = paragraph.attrib['lang']
+        paragraph_confidence = get_confidence(paragraph)
+        if algorithm in ('OLDA', 'paragraph'):
+            languages[paragraph_language_code] += paragraph_confidence
+        elif algorithm in ('NLDA', 'word'):
+            if not paragraph_confidence:
+                continue
+            for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
+                word_language_code = word.attrib['lang']
+                word_confidence = get_confidence(word)
+                if not word_confidence:
+                    continue
+                languages[word_language_code] += word_confidence
+                paragraph_confidence -= word_confidence
+            assert paragraph_confidence >= 0.0
+            languages[paragraph_language_code] += paragraph_confidence
     return languages
 
 
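A minimal usage sketch of the new keyword pass-through (the module import and page basename are hypothetical; `scripts/common.py` is assumed to be on the import path):

```python
from common import read_page_languages

# The extra keyword is forwarded to whichever reader handles the file:
# the JSON reader accepts and ignores it via **kwargs, while the hOCR
# reader dispatches on it.
word_level = read_page_languages('page-001', limit=2, algorithm='NLDA')
paragraph_level = read_page_languages('page-001', limit=2, algorithm='OLDA')
```
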
diff --git a/scripts/extract_detected_languages.py b/scripts/extract_detected_languages.py
index cd8e20683334de040ccdfc5aab9a78fa4375f6e5..8becd4b587585892068714aa6bf7cfdd02317481 100644
--- a/scripts/extract_detected_languages.py
+++ b/scripts/extract_detected_languages.py
@@ -29,7 +29,7 @@ THRESHOLD = float(sys.argv[5]) / 100.0
 def get_languages_worker(filename):
     basename = str(INPUT_OCR_ROOT / filename.parent / filename.stem)
     try:
-        languages = read_page_languages(basename, DETECTED_LANGUAGES)
+        languages = read_page_languages(basename, DETECTED_LANGUAGES, algorithm='OLDA')
     except IOError:
         return 'not-exists'
     return (filename, languages)
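
For intuition, a self-contained sketch of the two weightings on a toy hOCR fragment (hypothetical data, mirroring the logic added to `_read_page_languages_hocr` rather than importing it): NLDA credits the five characters of the English `ocrx_word` to `en` and only the remaining twelve characters to the paragraph's `cs`, whereas OLDA would credit all seventeen characters to `cs`.

```python
from collections import defaultdict

from lxml import etree

hocr = b'''<html><body>
<p lang="cs">Ahoj <span class="ocrx_word" lang="en">world</span> svete!</p>
</body></html>'''

root = etree.fromstring(hocr, etree.HTMLParser(huge_tree=True))
languages = defaultdict(float)
for p in root.xpath('//p[@lang]'):
    remaining = float(len(''.join(p.itertext()).strip()))  # 17 characters
    for w in p.xpath('.//span[@class="ocrx_word" and @lang]'):
        weight = float(len(''.join(w.itertext()).strip()))  # 5 characters
        languages[w.attrib['lang']] += weight
        remaining -= weight
    languages[p.attrib['lang']] += remaining

print(dict(languages))  # {'en': 5.0, 'cs': 12.0}; OLDA would yield {'cs': 17.0}
```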