Commit de23e382 authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files

Add support for annotated HOCR files

parent 778e33c1
Loading
Loading
Loading
Loading
Loading
+33 −16
Original line number Diff line number Diff line
@@ -822,24 +822,41 @@ def _read_page_languages_hocr(f, algorithm='NLDA', **kwargs):
    def get_confidence(element):
        # Use the length of the element's text content as its
        # confidence weight: longer spans of text count for more.
        element_text = get_element_text(element)
        return float(len(element_text))

    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word')
    for paragraph in xml_document.xpath('//p[@lang]'):
        paragraph_language_code = paragraph.attrib['lang']
        paragraph_confidence = get_confidence(paragraph)
        if algorithm == 'OLDA' or algorithm == 'paragraph':
            languages[paragraph_language_code] += paragraph_confidence
        elif algorithm == 'NLDA' or algorithm == 'word':
            if not paragraph_confidence:
    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word', 'annotated')
    if algorithm == 'annotated':
        # The annotated HOCR files produced by
        # https://gitlab.fi.muni.cz/nlp/ahisto-language-detection do not have
        # paragraph-level language annotations, so we will only consider
        # word-level annotations.
        for word in xml_document.xpath('//span[@class="ocrx_word" and @lang]'):
            word_language_code = word.attrib['lang']
            word_confidence = get_confidence(word)
            if not word_confidence:
                continue
            for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
                word_language_code = word.attrib['lang']
                word_confidence = get_confidence(word)
                if not word_confidence:
            languages[word_language_code] += word_confidence
    else:
        for paragraph in xml_document.xpath('//p[@lang]'):
            paragraph_language_code = paragraph.attrib['lang']
            paragraph_confidence = get_confidence(paragraph)
            if algorithm == 'OLDA' or algorithm == 'paragraph':
                # With the old paragraph-only algorithm, we only take
                # paragraph-level language annotations into account.
                languages[paragraph_language_code] += paragraph_confidence
            elif algorithm == 'NLDA' or algorithm == 'word':
                # With the new paragraph-and-word algorithm, we take
                # both paragraph-level and word-level language annotations
                # into account.
                if not paragraph_confidence:
                    continue
                languages[word_language_code] += word_confidence
                paragraph_confidence -= word_confidence
            assert paragraph_confidence >= 0.0
            languages[paragraph_language_code] += paragraph_confidence
                for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
                    word_language_code = word.attrib['lang']
                    word_confidence = get_confidence(word)
                    if not word_confidence:
                        continue
                    languages[word_language_code] += word_confidence
                    paragraph_confidence -= word_confidence
                assert paragraph_confidence >= 0.0
                languages[paragraph_language_code] += paragraph_confidence
    return languages