From 7f8afe0446979c39f6170c21e6d993274d22b09a Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Mon, 6 Dec 2021 19:23:18 +0100
Subject: [PATCH] Subdivide LINDAT dataset into smaller archives

---
 Makefile                          |   5 +-
 dataset/LICENSE                   | 121 ++++++++++++++++++++++++++++++
 dataset/README.md                 |  65 ++++++++++++++++
 scripts/produce_lindat_dataset.py |   2 +-
 4 files changed, 189 insertions(+), 4 deletions(-)
 create mode 100644 dataset/LICENSE
 create mode 100644 dataset/README.md

diff --git a/Makefile b/Makefile
index 79121640..2a0d7d9d 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,6 @@ OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_TESSERACT = $(OUTPUT_OCR_WAIFU2X_HIGH_NOISE
 OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_GOOGLE = $(OUTPUT_OCR_GOOGLE_LOWRES_DIRNAME)
 OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_COMBINED = $(OUTPUT_OCR_WAIFU2X_HIGH_NOISE_TESSERACT4_GOOGLE_LOWRES_DIRNAME)
 OUTPUT_LINDAT_DATASET_ROOT = $(SCRIPT_DIRNAME)/dataset
-OUTPUT_LINDAT_DATASET = $(SCRIPT_DIRNAME)/dataset.zip
 
 LANGUAGE_DETECTION_CODE = $(SCRIPT_DIRNAME)/language_detection
 LANGUAGE_DETECTION_ANNOTATIONS = $(SCRIPT_DIRNAME)/language_detection_annotations
@@ -250,10 +249,10 @@ remount-nymfe:
 
 %.zip: %
 	rm -f $@
-	(cd $< && zip -r $@ .)
+	(cd $< && zip -q -r $@ .)
 
 $(OUTPUT_LINDAT_DATASET_ROOT): $(INPUT_UPSCALED_HIGH_CONFIDENCE_FILENAMES) $(OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_TESSERACT) $(OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_GOOGLE) $(OUTPUT_LINDAT_DATASET_INPUT_OCR_ROOT_COMBINED) $(INPUT_HUMAN_JUDGEMENTS_DIRNAME) $(INPUT_HUMAN_JUDGEMENTS_UPSCALED_HIGH_CONFIDENCE_FILENAMES_WITH_COLUMNS) $(INPUT_HUMAN_JUDGEMENTS_UPSCALED_HIGH_CONFIDENCE_FILENAMES_WITHOUT_COLUMNS) $(LANGUAGE_DETECTION_ANNOTATIONS)
-	rm -rf $@
+	rm -rf $@/*/
 	$(PYTHON_RUN) scripts.produce_lindat_dataset $(INPUT_DIRNAME) $^ $(OUTPUT_LINDAT_INPUT_RIGHTS) $@
 
 $(INPUT_JSON_DUMP):
diff --git a/dataset/LICENSE b/dataset/LICENSE
new file mode 100644
index 00000000..0e259d42
--- /dev/null
+++ b/dataset/LICENSE
@@ -0,0 +1,121 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
diff --git a/dataset/README.md b/dataset/README.md
new file mode 100644
index 00000000..be2ee02c
--- /dev/null
+++ b/dataset/README.md
@@ -0,0 +1,65 @@
+# A Human-Annotated Dataset of Scanned Images and OCR Texts from Medieval Documents
+
+This is an open dataset of scanned images and OCR texts from 19th and
+20th century letterpress reprints of documents from the Hussite era. The
+dataset contains human annotations for layout analysis, OCR evaluation,
+and language identification.
+
+## Contents
+
+The dataset is structured as follows:
+
+- The archive `scanned-images.zip` contains 51,351 high-resolution scanned images.
+- The archive `ocr-texts.zip` contains 51,351 OCR texts in three formats:
+
+    1. HOCR documents from the Tesseract 4 OCR engine.
+    2. JSON documents from the Google Vision AI OCR engine.
+    3. TXT documents that combine Tesseract and Google outputs
+       to achieve maximum accuracy on different types of layout.
+
+- The archive `annotations-ocr.zip` contains 120 annotations for the evaluation
+  of OCR. The archive is divided into two subdirectories for the evaluation
+  of layout analysis:
+
+    1. The subdirectory `with-columns` contains annotations for 17 multi-column pages.
+    2. The subdirectory `without-columns` contains annotations for 103 single-column pages.
+
+- The archive `annotations-language-identification.zip` contains 122 annotations
+  for the evaluation of language identification.
+
+## Citing
+
+If you use our dataset in your work, please cite the following article:
+
+> Novotný, V., Seidlová, K., Vrabcová, T., Horák, A.: When Tesseract Brings
+> Friends: Layout Analysis, Language Identification, and Super-Resolution in
+> the Optical Character Recognition of Medieval Texts. In: Horák, A., Rychlý,
+> P., Rambousek, A. (eds.) Proceedings of Recent Advances in Slavonic Natural
+> Language Processing, RASLAN 2021. pp. 91–100. ISSN 2336-4289.
+> ISBN 978-80-263-1600-8. Tribun EU (2021).
+> Available also from WWW: <https://nlp.fi.muni.cz/raslan/2021/paper10.pdf>.
+
+If you use LaTeX, you can use the following BibTeX entry:
+
+``` bibtex
+@inproceedings{novotny2020when,
+  title = {When Tesseract Brings Friends: Layout Analysis, Language
+           Identification, and Super-Resolution in the Optical Character
+           Recognition of Medieval Texts},
+  author = {Vít Novotný and Kristýna Seidlová and Tereza Vrabcová and
+            Aleš Horák},
+  editor = {Aleš Horák and Pavel Rychlý and Adam Rambousek},
+  booktitle = {Proceedings of Recent Advances in Slavonic Natural Language
+               Processing, {RASLAN} 2021},
+  publisher = {Tribun {EU}},
+  pages = {91--100},
+  year = {2021},
+  issn = {2336-4289},
+  isbn = {978-80-263-1600-8},
+  url = {https://nlp.fi.muni.cz/raslan/2021/paper10.pdf},
+}
+```
+
+## Acknowledgements
+
+This work was funded by TAČR Éta, project number TL03000365.
diff --git a/scripts/produce_lindat_dataset.py b/scripts/produce_lindat_dataset.py
index 564b80d2..988a5673 100644
--- a/scripts/produce_lindat_dataset.py
+++ b/scripts/produce_lindat_dataset.py
@@ -203,7 +203,7 @@ def produce_scanned_images_and_ocr_texts(images_directory, ocr_texts_directory):
 
 
 def main(directory=OUTPUT_ROOT):
-    directory.mkdir()
+    directory.mkdir(exist_ok=True)
 
     produce_scanned_images_and_ocr_texts(directory / 'scanned-images', directory / 'ocr-texts')
     produce_ocr_and_layout_analysis_annotations(directory / 'annotations-ocr')
-- 
GitLab