Commit 62b106a1 authored by Vít Starý Novotný
Browse files

Add 122 annotated pages for language detection

parent de23e382
Loading
Loading
Loading
Loading
Loading
+13 −1
Original line number Diff line number Diff line
@@ -6,6 +6,9 @@ NICENESS = 10

# Language codes, joined with "+", that are passed as the `model` parameter
# in TESSERACT_OPTIONS_OCRD below.
DETECTED_LANGUAGES = ces+deu+lat

# Directory into which the ahisto-language-detection repository is cloned.
LANGUAGE_DETECTION_CODE = $(SCRIPT_DIRNAME)/language_detection
# Target of the rule that runs scripts.unflatten_directory_annotated.
LANGUAGE_DETECTION_ANNOTATIONS = $(SCRIPT_DIRNAME)/language_detection_annotations

# Command and options for Tesseract: page segmentation mode 3, the
# traineddata directory, and the requested output formats.
TESSERACT_RUN = $(TESSERACT)
TESSERACT_OPTIONS = --psm 3 --tessdata-dir $(TESSERACT_DATA) $(TESSERACT_OUTPUT_FORMATS)
# NOTE(review): the "␣" characters look like placeholders for real spaces that
# are substituted where this value is consumed -- confirm against the user.
TESSERACT_OPTIONS_OCRD = -P␣sparse_text␣false␣-P␣model␣$(DETECTED_LANGUAGES)
@@ -225,6 +228,7 @@ THRESHOLDS = 00 05 10 15 20 25 50 75 100

# Python interpreter executable.
PYTHON = python3
# Prefix for running a Python module under `nice` with priority $(NICENESS);
# the module name and its arguments are appended by the recipes.
PYTHON_RUN = nice -n $(NICENESS) $(PYTHON) -m
# Host-specific virtual-environment directory name. Simply expanded (:=) so
# that `hostname` is forked once when the makefile is parsed instead of once
# per reference to the variable.
PYTHON_VENV := virtualenv-$(shell hostname)

# Pulls the Docker image named by $(OCRD_IMAGE).
# Declared phony because it is a command, not a file: without this, a file
# called `install-ocrd` in the working directory would make the target look
# up to date and the pull would never run.
.PHONY: install-ocrd
install-ocrd:
	docker pull $(OCRD_IMAGE)
@@ -274,8 +278,16 @@ $(INPUT_UPSCALED_LOW_CONFIDENCE_FILENAMES): $(INPUT_FILENAMES_FILTERED) $(DOWNSC
# Rule for every file listed in INPUT_UPSCALED_HIGH_CONFIDENCE_FILENAMES.
# Runs scripts.upscale_downscaled_high_confidence with, in order: the input
# directory and filtered file list, the downscaled input directory and
# filtered file list, the Tesseract 4 OCR output directory, the JSON and CSV
# dumps, and finally the target path ($@).
# NOTE(review): $(INPUT_CSV_DUMP) is passed to the script but is not listed
# as a prerequisite -- confirm it is always produced together with
# $(INPUT_JSON_DUMP).
$(INPUT_UPSCALED_HIGH_CONFIDENCE_FILENAMES): $(INPUT_FILENAMES_FILTERED) $(DOWNSCALED_INPUT_FILENAMES_FILTERED) $(OUTPUT_OCR_TESSERACT4_DIRNAME) $(INPUT_JSON_DUMP)
	$(PYTHON_RUN) scripts.upscale_downscaled_high_confidence $(INPUT_DIRNAME) $(INPUT_FILENAMES_FILTERED) $(DOWNSCALED_INPUT_DIRNAME) $(DOWNSCALED_INPUT_FILENAMES_FILTERED) $(OUTPUT_OCR_TESSERACT4_DIRNAME) $(INPUT_JSON_DUMP) $(INPUT_CSV_DUMP) $@

# Checks out the language-detection code into $@ and prepares it:
#   1. clone the ahisto-language-detection repository into the target directory,
#   2. create a host-specific virtual environment inside the checkout,
#   3. activate it, install requirements.txt and run
#      scripts.create_annotated_hocr from within the checkout.
# The environment is activated with the POSIX `.` builtin rather than
# `source`: make runs recipes with /bin/sh unless SHELL is overridden, and
# `source` is not available in shells such as dash.
# Step 3 is a single `&&` chain because every recipe line runs in its own
# shell, so `cd` and the activation would not carry over to a later line.
# NOTE(review): the target has no prerequisites, so if a step after the clone
# fails, the directory already exists and make will treat it as built.
$(LANGUAGE_DETECTION_CODE):
	git clone https://gitlab.fi.muni.cz/nlp/ahisto-language-detection.git $@
	$(PYTHON_RUN) venv $@/$(PYTHON_VENV)
	cd $@ && . $(PYTHON_VENV)/bin/activate && pip install -r requirements.txt && $(PYTHON_RUN) scripts.create_annotated_hocr

# Runs scripts.unflatten_directory_annotated with three arguments: the
# `output` subdirectory of the language-detection checkout, the list of
# upscaled high-confidence filenames, and the target path ($@).
# $< is the first prerequisite, i.e. $(LANGUAGE_DETECTION_CODE); the order of
# the prerequisites therefore matters for this recipe.
# NOTE(review): $</output is presumably written by
# scripts.create_annotated_hocr during the checkout rule -- confirm.
$(LANGUAGE_DETECTION_ANNOTATIONS): $(LANGUAGE_DETECTION_CODE) $(INPUT_UPSCALED_HIGH_CONFIDENCE_FILENAMES)
	$(PYTHON_RUN) scripts.unflatten_directory_annotated $</output $(INPUT_UPSCALED_HIGH_CONFIDENCE_FILENAMES) $@

# Fetches the Tesseract traineddata repository (with submodules) into the
# directory named by the target.
# The repository is cloned exactly once, into $@. A second clone into a
# hard-coded `tessdata` directory either fails (when TESSERACT_DATA is
# `tessdata`, because the destination already exists) or downloads a copy
# that no rule refers to.
$(TESSERACT_DATA):
	git clone --recurse-submodules https://github.com/tesseract-ocr/tessdata.git $@

$(CALAMARI_MODELS):
	mkdir -p $@
+1 −0
Original line number Diff line number Diff line
<html><body><div class="ocr_page"><p class="ocr_par"><span class="ocr_line"><span class="ocrx_word" lang="ces">&#269;&#237;s.</span> <span class="ocrx_word" lang="ces">280,</span> <span class="ocrx_word" lang="ces">281.</span> <span class="ocrx_word" lang="ces">1419</span> <span class="ocrx_word" lang="ces">&#345;&#237;jen</span> <span class="ocrx_word" lang="ces">26</span> <span class="ocrx_word" lang="ces">&#8212;</span> <span class="ocrx_word" lang="ces">1420</span> <span class="ocrx_word" lang="ces">&#250;nor</span> <span class="ocrx_word" lang="ces">19.</span> <span class="ocrx_word" lang="ces">173</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Old&#345;icha</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">Tecku</span> <span class="ocrx_word" lang="ces">(von</span> <span class="ocrx_word" lang="ces">Teggk),</span> <span class="ocrx_word" lang="ces">Friedricha</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">Helfensteinu,</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Albrechta</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">Rechbergu.</span> <span class="ocrx_word" lang="ces">Hanu&#353;e</span> <span class="ocrx_word" lang="ces">ze</span> <span class="ocrx_word" lang="ces">Staaden</span> <span class="ocrx_word" lang="ces">(von</span> <span class="ocrx_word" lang="ces">Stadgen)</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">a</span> <span class="ocrx_word" lang="ces">jin&#253;ch</span> <span class="ocrx_word" lang="ces">wiirtenbersk&#253;ch</span> <span class="ocrx_word" lang="ces">velmo&#382;&#367;,</span> <span class="ocrx_word" lang="ces">Zikmunda,</span> <span class="ocrx_word" lang="ces">kr&#225;le</span> <span class="ocrx_word" lang="ces">&#345;&#237;m-</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">sk&#233;ho</span> <span class="ocrx_word" 
lang="ces">a</span> <span class="ocrx_word" lang="ces">kr&#225;le</span> <span class="ocrx_word" lang="ces">uhersk&#233;ho</span> <span class="ocrx_word" lang="ces">i</span> <span class="ocrx_word" lang="ces">&#269;esk&#233;ho,</span> <span class="ocrx_word" lang="ces">za</span> <span class="ocrx_word" lang="ces">vrchn&#237;ho</span> <span class="ocrx_word" lang="ces">p&#225;na</span> <span class="ocrx_word" lang="ces">v&#353;ech</span></span><span class="ocr_line"><span class="ocrx_word" error="error">vz</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">jejich</span> <span class="ocrx_word" lang="ces">&#345;&#237;&#353;sk&#253;ch</span> <span class="ocrx_word" lang="ces">a</span> <span class="ocrx_word" lang="ces">&#269;esk&#253;ch</span> <span class="ocrx_word" lang="ces">lenn&#237;ch</span> <span class="ocrx_word" lang="ces">statk&#367;.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Orig.</span> <span class="ocrx_word" lang="ces">perg.</span> <span class="ocrx_word" lang="ces">40x26&#8212;5</span> <span class="ocrx_word" lang="ces">cm,</span> <span class="ocrx_word" lang="ces">n&#283;m.</span> <span class="ocrx_word" lang="ces">K</span> <span class="ocrx_word" lang="ces">listin&#283;</span> <span class="ocrx_word" lang="ces">jest</span> <span class="ocrx_word" lang="ces">p&#345;i-</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">v&#283;&#353;ena</span> <span class="ocrx_word" lang="ces">na</span> <span class="ocrx_word" lang="ces">pergamenov&#233;m</span> <span class="ocrx_word" lang="ces">prou&#382;ku</span> <span class="ocrx_word" lang="ces">kulat&#225;</span> <span class="ocrx_word" lang="ces">pe&#269;e&#357;</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Rudolfa</span> <span class="ocrx_word" lang="ces">ze</span> <span class="ocrx_word" lang="ces">Sulzu</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">vosku</span> <span 
class="ocrx_word" lang="ces">zelen&#233;</span> <span class="ocrx_word" lang="ces">barvy.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Na</span> <span class="ocrx_word" lang="ces">rubu:</span> <span class="ocrx_word" lang="ces">Wirtenberg.</span> <span class="ocrx_word" lang="ces">&#8212;</span> <span class="ocrx_word" lang="ces">M</span> <span class="ocrx_word" lang="ces">ceec</span> <span class="ocrx_word" lang="ces">xix.</span> <span class="ocrx_word" lang="ces">&#8212;</span> <span class="ocrx_word" lang="ces">Pi&#237;ips&#225;n</span> <span class="ocrx_word" lang="ces">a</span> <span class="ocrx_word" lang="ces">corigovan.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">&#8212;</span> <span class="ocrx_word" lang="ces">T.</span> <span class="ocrx_word" lang="ces">B.</span> <span class="ocrx_word" lang="ces">Fol.</span> <span class="ocrx_word" lang="ces">135.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Knihy</span> <span class="ocrx_word" lang="ces">priv.</span> <span class="ocrx_word" lang="ces">A</span> <span class="ocrx_word" lang="ces">IL,</span> <span class="ocrx_word" lang="ces">fol.</span> <span class="ocrx_word" lang="ces">135pv&#8212;136,</span> <span class="ocrx_word" lang="ces">B</span> <span class="ocrx_word" lang="ces">IL,</span> <span class="ocrx_word" lang="ces">fol.</span> <span class="ocrx_word" lang="ces">135pv&#8212;136.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Rosenthal</span> <span class="ocrx_word" lang="ces">KA,</span> <span class="ocrx_word" lang="ces">lit.</span> <span class="ocrx_word" lang="ces">A,</span> <span class="ocrx_word" lang="ces">&#269;.</span> <span class="ocrx_word" lang="ces">176.</span> <span class="ocrx_word" lang="ces">&#8212;</span> <span class="ocrx_word" lang="ces">V&#237;de&#328;sk&#233;</span> <span class="ocrx_word" lang="ces">rep.</span> <span class="ocrx_word" lang="ces">&#269;.</span> <span 
class="ocrx_word" lang="ces">1221.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">L&#252;nig,</span> <span class="ocrx_word" lang="ces">Codex</span> <span class="ocrx_word" lang="ces">Germaniae</span> <span class="ocrx_word" lang="ces">I.,</span> <span class="ocrx_word" lang="ces">str.</span> <span class="ocrx_word" lang="ces">1431,</span> <span class="ocrx_word" lang="ces">&#269;.</span> <span class="ocrx_word" lang="ces">CCCL.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">281.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">1420,</span> <span class="ocrx_word" lang="ces">&#250;nor</span> <span class="ocrx_word" lang="ces">4.</span> <span class="ocrx_word" lang="ces">Vratislav.</span></span><span class="ocr_line"><span class="ocrx_word" lang="lat">(Datum</span> <span class="ocrx_word" lang="lat">Wratislavie</span> <span class="ocrx_word" lang="lat">anno</span> <span class="ocrx_word" lang="lat">Domini</span> <span class="ocrx_word" lang="lat">millesimo</span></span><span class="ocr_line"><span class="ocrx_word" lang="lat">quadringentesimo</span> <span class="ocrx_word" lang="lat">vigesimo</span> <span class="ocrx_word" lang="lat">quarta</span> <span class="ocrx_word" lang="lat">die</span> <span class="ocrx_word" lang="lat">Februarii,</span></span><span class="ocr_line"><span class="ocrx_word" lang="lat">regnorum</span> <span class="ocrx_word" lang="lat">nostrorum</span> <span class="ocrx_word" lang="lat">anno</span> <span class="ocrx_word" lang="lat">Hungarie</span> <span class="ocrx_word" lang="lat">tricesimo</span></span><span class="ocr_line"><span class="ocrx_word" lang="lat">tercio,</span> <span class="ocrx_word" lang="lat">Romanorum</span> <span class="ocrx_word" lang="lat">vero</span> <span class="ocrx_word" lang="lat">decimo.)</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Zikmund,</span> <span class="ocrx_word" lang="ces">kr&#225;l</span> <span 
class="ocrx_word" lang="ces">&#345;&#237;msk&#253;,</span> <span class="ocrx_word" lang="ces">uhersk&#253;</span> <span class="ocrx_word" lang="ces">a</span> <span class="ocrx_word" lang="ces">&#269;esk&#253;,</span> <span class="ocrx_word" lang="ces">na&#345;izuje</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">V&#225;clavovi</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">Dub&#233;</span> <span class="ocrx_word" lang="ces">a</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">Le&#353;tn&#233;,</span> <span class="ocrx_word" lang="ces">podkomo&#345;&#237;mu</span> <span class="ocrx_word" lang="ces">kr&#225;lovstv&#237;</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">&#268;esk&#233;ho,</span> <span class="ocrx_word" lang="ces">aby</span> <span class="ocrx_word" lang="ces">na</span> <span class="ocrx_word" lang="ces">&#250;&#345;ad</span> <span class="ocrx_word" lang="ces">not&#225;&#345;e</span> <span class="ocrx_word" lang="ces">pra&#382;sk&#233;ho</span> <span class="ocrx_word" lang="ces">ungeltu</span> <span class="ocrx_word" lang="ces">uvedl</span> <span class="ocrx_word" lang="ces">po</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Erhardovi,</span> <span class="ocrx_word" lang="ces">pra&#382;sk&#233;m</span> <span class="ocrx_word" lang="ces">m&#283;&#353;&#357;anovi,</span> <span class="ocrx_word" lang="ces">Jana</span> <span class="ocrx_word" lang="ces">Ulmannova,</span> <span class="ocrx_word" lang="ces">rovn&#283;&#382;</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">m&#283;&#353;&#357;ana</span> <span class="ocrx_word" lang="ces">pra&#382;sk&#233;ho.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Orig.</span> <span class="ocrx_word" lang="ces">perg.</span> <span class="ocrx_word" lang="ces">vlhkem</span> <span class="ocrx_word" lang="ces">poSkozen</span> <span 
class="ocrx_word" lang="ces">37Xx19&#8212;7</span> <span class="ocrx_word" lang="ces">cm,</span> <span class="ocrx_word" lang="ces">lat.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Klistin&#233;</span> <span class="ocrx_word" lang="ces">jest</span> <span class="ocrx_word" lang="ces">pfiv&#233;Sena</span> <span class="ocrx_word" lang="ces">na</span> <span class="ocrx_word" lang="ces">pergamenov&#233;m</span> <span class="ocrx_word" lang="ces">prou&#380;-</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">ku</span> <span class="ocrx_word" lang="ces">kulat&#225;,</span> <span class="ocrx_word" lang="ces">uprost&#345;ed</span> <span class="ocrx_word" lang="ces">praskl&#225;</span> <span class="ocrx_word" lang="ces">majest&#225;tn&#237;</span> <span class="ocrx_word" lang="ces">pe&#269;e&#357;</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">kr&#225;le</span> <span class="ocrx_word" lang="ces">Zikmunda</span> <span class="ocrx_word" lang="ces">z</span> <span class="ocrx_word" lang="ces">vosku</span> <span class="ocrx_word" lang="ces">p&#345;irozen&#233;</span> <span class="ocrx_word" lang="ces">barvy.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Na</span> <span class="ocrx_word" lang="ces">plice:</span><span class="ocrx_word" lang="lat">Ad</span> <span class="ocrx_word" lang="lat">mandatum.</span> <span class="ocrx_word" lang="lat">d.</span> <span class="ocrx_word" lang="lat">regis</span> <span class="ocrx_word" lang="lat">d.</span> <span class="ocrx_word" lang="lat">Jo.</span> <span class="ocrx_word" lang="lat">episcopo</span> <span class="ocrx_word" lang="lat">Luthomislensi</span></span><span class="ocr_line"><span class="ocrx_word" lang="lat">referente</span> <span class="ocrx_word" lang="lat">Michael</span> <span class="ocrx_word" lang="lat">de</span> <span class="ocrx_word" lang="lat">Priest,</span> <span class="ocrx_word" lang="lat">canonicus</span> <span 
class="ocrx_word" lang="lat">Pragensis.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Na</span> <span class="ocrx_word" lang="ces">rubu:</span><span class="ocrx_word" lang="lat">Rta.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Rosenthal</span> <span class="ocrx_word" lang="ces">MR,</span> <span class="ocrx_word" lang="ces">&#269;.</span> <span class="ocrx_word" lang="ces">30.</span> <span class="ocrx_word" lang="ces">&#8212;</span> <span class="ocrx_word" lang="ces">V&#237;de&#328;sk&#233;</span> <span class="ocrx_word" lang="ces">rep.</span> <span class="ocrx_word" lang="ces">&#269;.</span> <span class="ocrx_word" lang="ces">1222.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">Regesta</span> <span class="ocrx_word" lang="ces">imperii</span> <span class="ocrx_word" lang="ces">XI.,</span> <span class="ocrx_word" lang="ces">str.</span> <span class="ocrx_word" lang="ces">281,</span> <span class="ocrx_word" lang="ces">&#269;.</span> <span class="ocrx_word" lang="ces">4002.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">282.</span></span><span class="ocr_line"><span class="ocrx_word" lang="ces">1420,</span> <span class="ocrx_word" lang="ces">&#250;nor</span> <span class="ocrx_word" lang="ces">19</span></span><span class="ocr_line"><span class="ocrx_word" lang="deu">(Geben</span> <span class="ocrx_word" lang="deu">am</span> <span class="ocrx_word" lang="deu">montage</span> <span class="ocrx_word" lang="deu">noch</span> <span class="ocrx_word" lang="deu">deme</span> <span class="ocrx_word" lang="deu">sontage,</span> <span class="ocrx_word" lang="deu">als</span> <span class="ocrx_word" lang="deu">man</span></span><span class="ocr_line"><span class="ocrx_word" lang="deu">in</span> <span class="ocrx_word" lang="deu">der</span> <span class="ocrx_word" lang="deu">kirchen</span> <span class="ocrx_word" lang="deu">Gotis</span> <span class="ocrx_word" lang="deu">singet</span> <span 
class="ocrx_word" lang="deu">Esto</span> <span class="ocrx_word" lang="deu">mishi</span> <span class="ocrx_word" lang="deu">in</span> <span class="ocrx_word" lang="deu">deum</span></span><span class="ocr_line"><span class="ocrx_word" lang="deu">etc.</span> <span class="ocrx_word" lang="deu">noch</span> <span class="ocrx_word" lang="deu">Cristi</span> <span class="ocrx_word" lang="deu">gebort</span> <span class="ocrx_word" lang="deu">firczenhundert</span> <span class="ocrx_word" lang="deu">jar</span> <span class="ocrx_word" lang="deu">und</span></span><span class="ocr_line"><span class="ocrx_word" lang="deu">dornoch</span> <span class="ocrx_word" lang="deu">in</span> <span class="ocrx_word" lang="deu">deme</span> <span class="ocrx_word" lang="deu">czwenczigsten</span> <span class="ocrx_word" lang="deu">jar.)</span></span></p></div></body></html>
 No newline at end of file
+1 −0

File added.

Preview size limit exceeded, changes collapsed.

+1 −0

File added.

Preview size limit exceeded, changes collapsed.

+1 −0

File added.

Preview size limit exceeded, changes collapsed.

Loading