scripts/common.py  +33 −16

@@ -822,24 +822,41 @@ def _read_page_languages_hocr(f, algorithm='NLDA', **kwargs):
 
     def get_confidence(element):
         return float(len(get_element_text(element)))
 
-    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word')
-    for paragraph in xml_document.xpath('//p[@lang]'):
-        paragraph_language_code = paragraph.attrib['lang']
-        paragraph_confidence = get_confidence(paragraph)
-        if algorithm == 'OLDA' or algorithm == 'paragraph':
-            languages[paragraph_language_code] += paragraph_confidence
-        elif algorithm == 'NLDA' or algorithm == 'word':
-            if not paragraph_confidence:
+    assert algorithm in ('OLDA', 'NLDA', 'paragraph', 'word', 'annotated')
+    if algorithm == 'annotated':
+        # The annotated HOCR files produced by
+        # https://gitlab.fi.muni.cz/nlp/ahisto-language-detection do not have
+        # paragraph-level language annotations, so we will only consider
+        # word-level annotations.
+        for word in xml_document.xpath('//span[@class="ocrx_word" and @lang]'):
+            word_language_code = word.attrib['lang']
+            word_confidence = get_confidence(word)
+            if not word_confidence:
                 continue
-            for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
-                word_language_code = word.attrib['lang']
-                word_confidence = get_confidence(word)
-                if not word_confidence:
+            languages[word_language_code] += word_confidence
+    else:
+        for paragraph in xml_document.xpath('//p[@lang]'):
+            paragraph_language_code = paragraph.attrib['lang']
+            paragraph_confidence = get_confidence(paragraph)
+            if algorithm == 'OLDA' or algorithm == 'paragraph':
+                # With the old paragraph-only algorithm, we only take
+                # paragraph-level language annotations into account.
+                languages[paragraph_language_code] += paragraph_confidence
+            elif algorithm == 'NLDA' or algorithm == 'word':
+                # With the new paragraph-and-word algorithm, we take
+                # both paragraph-level and word-level language annotations
+                # into account.
+                if not paragraph_confidence:
                     continue
-                languages[word_language_code] += word_confidence
-                paragraph_confidence -= word_confidence
-            assert paragraph_confidence >= 0.0
-            languages[paragraph_language_code] += paragraph_confidence
+                for word in paragraph.xpath('.//span[@class="ocrx_word" and @lang]'):
+                    word_language_code = word.attrib['lang']
+                    word_confidence = get_confidence(word)
+                    if not word_confidence:
+                        continue
+                    languages[word_language_code] += word_confidence
+                    paragraph_confidence -= word_confidence
+                assert paragraph_confidence >= 0.0
+                languages[paragraph_language_code] += paragraph_confidence
 
     return languages
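For reference, the NLDA (paragraph-and-word) branch credits each word-level annotation with the character length of the word and credits whatever character mass is left over to the paragraph-level language. A minimal self-contained sketch of that bookkeeping on invented toy numbers (the language codes and lengths below are made up for illustration; only the accounting mirrors the diff):

from collections import defaultdict

# Toy paragraph annotated as Czech ('cs') at paragraph level, 20 characters of
# text in total, containing one 6-character German ('de') word and one
# 5-character Latin ('la') word with word-level annotations.
paragraph_confidence = 20.0                     # get_confidence(paragraph)
word_annotations = [('de', 6.0), ('la', 5.0)]   # (lang, get_confidence(word))

languages = defaultdict(float)
for word_language_code, word_confidence in word_annotations:
    languages[word_language_code] += word_confidence
    paragraph_confidence -= word_confidence

# Characters not claimed by any word-level annotation are credited to the
# paragraph-level language.
assert paragraph_confidence >= 0.0
languages['cs'] += paragraph_confidence

print(dict(languages))  # {'de': 6.0, 'la': 5.0, 'cs': 9.0}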
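And a hedged sketch of how the new 'annotated' mode might be called. The import path, file name, and open mode are assumptions not shown in this hunk; only the keyword value and the shape of the return value (a mapping from language codes to summed character-length confidences) follow from the code above:

# Hypothetical usage of the 'annotated' algorithm added by this change.
from scripts.common import _read_page_languages_hocr  # assumed import path

with open('page.hocr', 'rb') as f:  # placeholder file name and mode
    # Only word-level annotations are considered, as in the annotated hOCR
    # files from https://gitlab.fi.muni.cz/nlp/ahisto-language-detection
    languages = _read_page_languages_hocr(f, algorithm='annotated')

# `languages` maps language codes to aggregate character-length confidences.
if languages:
    best_language = max(languages, key=languages.get)
    print(best_language, languages[best_language])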