diff --git a/scripts/common.py b/scripts/common.py
index 5821a26ddc70d143d3a33f8d7dd97142d54dfe75..06c18339ef194b7abc19c7f2dd48cd18a5cbcb7b 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -696,7 +696,7 @@ def _read_page_languages_hocr(f):
     if not content.strip():  # the file is empty
         return dict()
     html5_parser = etree.HTMLParser(huge_tree=True)
-    xml_document = etree.fromstring(content, html5_parser)
+    xml_document = etree.fromstring(content.encode('utf-8'), html5_parser)
 
     languages = defaultdict(lambda: 0.0)