Skip to content
Snippets Groups Projects
Commit c9796cbb authored by Vít Starý Novotný
Browse files

Take word-level language annotations in HOCR output into account (cont.)

parent 63fdca1a
No related branches found
No related tags found
No related merge requests found
...@@ -696,16 +696,17 @@ def _read_page_languages_hocr(f): ...@@ -696,16 +696,17 @@ def _read_page_languages_hocr(f):
languages = defaultdict(lambda: 0.0) languages = defaultdict(lambda: 0.0)
def get_confidence(element): def get_confidence(element):
return float(len(''.join(element.itertext()))) return float(len(''.join(element.itertext()).strip()))
for paragraph in xml_document.xpath('//p[@lang]'): for paragraph in xml_document.xpath('//p[@lang]'):
paragraph_language_code = paragraph.attrib['lang'] paragraph_language_code = paragraph.attrib['lang']
paragraph_confidence = get_confidence(paragraph) paragraph_confidence = get_confidence(paragraph)
for word in paragraph.xpath('//span[@class="ocrx_word" and @lang]'): for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
word_language_code = word.attrib['lang'] word_language_code = word.attrib['lang']
word_confidence = get_confidence(word) word_confidence = get_confidence(word)
languages[word_language_code] += word_confidence languages[word_language_code] += word_confidence
paragraph_confidence -= word_confidence paragraph_confidence -= word_confidence
assert paragraph_confidence >= 0.0
languages[paragraph_language_code] += paragraph_confidence languages[paragraph_language_code] += paragraph_confidence
return languages return languages
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment