From 7f8afe0446979c39f6170c21e6d993274d22b09a Mon Sep 17 00:00:00 2001 From: Vit Novotny <witiko@mail.muni.cz> Date: Mon, 6 Dec 2021 19:23:18 +0100 Subject: [PATCH] Subdivide LINDAT dataset into smaller archives --- Makefile | 5 +- dataset/LICENSE | 121 ++++++++++++++++++++++++++++++ dataset/README.md | 65 ++++++++++++++++ scripts/produce_lindat_dataset.py | 2 +- 4 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 dataset/LICENSE create mode 100644 dataset/README.md diff --git a/Makefile b/Makefile index 79121640..2a0d7d9d 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,6 @@ OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_TESSERACT = $(OUTPUT_OCR_WAIFU2X_HIGH_NOISE OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_GOOGLE = $(OUTPUT_OCR_GOOGLE_LOWRES_DIRNAME) OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_COMBINED = $(OUTPUT_OCR_WAIFU2X_HIGH_NOISE_TESSERACT4_GOOGLE_LOWRES_DIRNAME) OUTPUT_LINDAT_DATASET_ROOT = $(SCRIPT_DIRNAME)/dataset -OUTPUT_LINDAT_DATASET = $(SCRIPT_DIRNAME)/dataset.zip LANGUAGE_DETECTION_CODE = $(SCRIPT_DIRNAME)/language_detection LANGUAGE_DETECTION_ANNOTATIONS = $(SCRIPT_DIRNAME)/language_detection_annotations @@ -250,10 +249,10 @@ remount-nymfe: %.zip: % rm -f $@ - (cd $< && zip -r $@ .) + (cd $< && zip -q -r $@ .) 
$(OUTPUT_LINDAT_DATASET_ROOT): $(INPUT_UPSCALED_HIGH_CONFIDENCE_FILENAMES) $(OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_TESSERACT) $(OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_GOOGLE) $(OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_COMBINED) $(INPUT_HUMAN_JUDGEMENTS_DIRNAME) $(INPUT_HUMAN_JUDGEMENTS_UPSCALED_HIGH_CONFIDENCE_FILENAMES_WITH_COLUMNS) $(INPUT_HUMAN_JUDGEMENTS_UPSCALED_HIGH_CONFIDENCE_FILENAMES_WITHOUT_COLUMNS) $(LANGUAGE_DETECTION_ANNOTATIONS) - rm -rf $@ + rm -rf $@/*/ $(PYTHON_RUN) scripts.produce_lindat_dataset $(INPUT_DIRNAME) $^ $(OUTPUT_LINDAT_INPUT_RIGHTS) $@ $(INPUT_JSON_DUMP): diff --git a/dataset/LICENSE b/dataset/LICENSE new file mode 100644 index 00000000..0e259d42 --- /dev/null +++ b/dataset/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). 
+ +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. 
database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. diff --git a/dataset/README.md b/dataset/README.md new file mode 100644 index 00000000..be2ee02c --- /dev/null +++ b/dataset/README.md @@ -0,0 +1,65 @@ +# A Human-Annotated Dataset of Scanned Images and OCR Texts from Medieval Documents + +This is an open dataset of scanned images and OCR texts from 19th and +20th century letterpress reprints of documents from the Hussite era. The +dataset contains human annotations for layout analysis, OCR evaluation, +and language identification. + +## Contents + +The dataset is structured as follows: + +- The archive `scanned-images.zip` contains 51,351 high-resolution scanned images. +- The archive `ocr-texts.zip` contains 51,351 OCR texts in three formats: + + 1. HOCR documents from the Tesseract 4 OCR engine. + 2. JSON documents from the Google Vision AI OCR engine. + 3. TXT documents that combine Tesseract and Google outputs + to achieve maximum accuracy on different types of layout. + +- The archive `annotations-ocr.zip` contains 120 annotations for the evaluation + of OCR. The directory is divided into two subdirectories for the evaluation + of layout analysis: + + 1. The subdirectory `with-columns` contains annotations for 17 multi-column pages. + 2. The subdirectory `without-columns` contains annotations for 103 single-column pages. + +- The archive `annotations-language-identification.zip` contains 122 annotations + for the evaluation of language identification. 
+ +## Citing + +If you use our dataset in your work, please cite the following article: + +> Novotný, V., Seidlová, K., Vrabcová, T., Horák, A.: When Tesseract Brings +> Friends: Layout Analysis, Language Identification, and Super-Resolution in +> the Optical Character Recognition of Medieval Texts. In: Horák, A., Rychlý, +> P., Rambousek, A. (eds.) Proceedings of Recent Advances in Slavonic Natural +> Language Processing, RASLAN 2021. pp. 91–100. ISSN 2336-4289. +> ISBN 978-80-263-1600-8. Tribun EU (2021). +> Available also from WWW: <https://nlp.fi.muni.cz/raslan/2021/paper10.pdf>. + +If you use LaTeX, you can use the following BibTeX entry: + +``` bibtex +@inproceedings{novotny2020when, + title = {When Tesseract Brings Friends: Layout Analysis, Language + Identification, and Super-Resolution in the Optical Character + Recognition of Medieval Texts}, + author = {Vít Novotný and Kristýna Seidlová and Tereza Vrabcová and + Aleš Horák}, + editor = {Aleš Horák and Pavel Rychlý and Adam Rambousek}, + booktitle = {Proceedings of Recent Advances in Slavonic Natural Language + Processing, {RASLAN} 2021}, + publisher = {Tribun {EU}}, + pages = {91-100}, + year = {2021}, + issn = {2336-4289}, + isbn = {978-80-263-1600-8}, + url = {https://nlp.fi.muni.cz/raslan/2021/paper10.pdf}, +} +``` + +## Acknowledgements + +This work was funded by TAČR Éta, project number TL03000365. diff --git a/scripts/produce_lindat_dataset.py b/scripts/produce_lindat_dataset.py index 564b80d2..988a5673 100644 --- a/scripts/produce_lindat_dataset.py +++ b/scripts/produce_lindat_dataset.py @@ -203,7 +203,7 @@ def produce_scanned_images_and_ocr_texts(images_directory, ocr_texts_directory): def main(directory=OUTPUT_ROOT): - directory.mkdir() + directory.mkdir(exist_ok=True) produce_scanned_images_and_ocr_texts(directory / 'scanned-images', directory / 'ocr-texts') produce_ocr_and_layout_analysis_annotations(directory / 'annotations-ocr') -- GitLab