diff --git a/scripts/common.py b/scripts/common.py index 5821a26ddc70d143d3a33f8d7dd97142d54dfe75..06c18339ef194b7abc19c7f2dd48cd18a5cbcb7b 100644 --- a/scripts/common.py +++ b/scripts/common.py @@ -696,7 +696,7 @@ def _read_page_languages_hocr(f): if not content.strip(): # the file is empty return dict() html5_parser = etree.HTMLParser(huge_tree=True) - xml_document = etree.fromstring(content, html5_parser) + xml_document = etree.fromstring(content.encode('utf-8'), html5_parser) languages = defaultdict(lambda: 0.0)