From 77a950cc32fc407df49293e4ef5eaaa7eda786ea Mon Sep 17 00:00:00 2001 From: Vit Novotny <witiko@mail.muni.cz> Date: Sat, 26 Jun 2021 02:00:58 +0200 Subject: [PATCH] Fix an encoding error --- scripts/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/common.py b/scripts/common.py index 5821a26d..06c18339 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -696,7 +696,7 @@ def _read_page_languages_hocr(f): if not content.strip(): # the file is empty return dict() html5_parser = etree.HTMLParser(huge_tree=True) - xml_document = etree.fromstring(content, html5_parser) + xml_document = etree.fromstring(content.encode('utf-8'), html5_parser) languages = defaultdict(lambda: 0.0) -- GitLab