From 77a950cc32fc407df49293e4ef5eaaa7eda786ea Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Sat, 26 Jun 2021 02:00:58 +0200
Subject: [PATCH] Fix an encoding error in _read_page_languages_hocr

---
 scripts/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/common.py b/scripts/common.py
index 5821a26d..06c18339 100644
--- a/scripts/common.py
+++ b/scripts/common.py
@@ -696,7 +696,7 @@ def _read_page_languages_hocr(f):
     if not content.strip():  # the file is empty
         return dict()
     html5_parser = etree.HTMLParser(huge_tree=True)
-    xml_document = etree.fromstring(content, html5_parser)
+    xml_document = etree.fromstring(content.encode('utf-8'), html5_parser)
 
     languages = defaultdict(lambda: 0.0)
 
-- 
GitLab