From 023997e40f8716a7d1d86c7dc5dd3f9f5b7906d6 Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Sat, 27 Jun 2020 22:20:25 +0200
Subject: [PATCH] Add Python scripts for reading SQL, CSV, and TXT databases

---
 requirements.txt                |   2 +
 scripts/__init__.py             |   0
 scripts/common.py               | 108 ++++++++++++++++++++++++++++++++
 scripts/configuration.py        |  21 +++++++
 scripts/produce_ground_truth.py |  93 +++++++++++++++++++++++++++
 5 files changed, 224 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 scripts/__init__.py
 create mode 100644 scripts/common.py
 create mode 100644 scripts/configuration.py
 create mode 100644 scripts/produce_ground_truth.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..bd2ee283
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+gensim~=3.8.3
+mysqlclient~=1.4.6
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/common.py b/scripts/common.py
new file mode 100644
index 00000000..294ee36c
--- /dev/null
+++ b/scripts/common.py
@@ -0,0 +1,108 @@
+# -*- coding:utf-8 -*-
+
+import csv
+import logging
+import re
+
+from MySQLdb import connect
+
+from .configuration import CSV_PARAMETERS, SQL_PARAMETERS, CSV_DATABASE_FILENAME, IMAGE_FILENAMES_LIST_FILENAME
+
+# Matches a book URL; the host dots are literal and bookid must contain at least one digit.
+URL_REGEX = re.compile(r'https://sources\.cms\.flu\.cas\.cz/.*&bookid=(?P<book_id>[0-9]+).*')
+
+
+def read_csv(filename=CSV_DATABASE_FILENAME):
+    """Yield ``(book_id, {'signature': ..., 'directory': ...})`` for each CSV record.
+
+    The first row is treated as a header and skipped.  The signature is read from
+    field index 4, the directory from field index 5, and the book ID from the
+    ``bookid`` parameter of the URL in the last field.  Rows with an empty URL
+    are skipped with a warning.
+
+    Raises:
+        ValueError: If a row contains a non-empty URL that does not match URL_REGEX.
+    """
+    logger = logging.getLogger('read_csv')
+    successfully_read = 0
+    skipped = 0
+    # newline='' lets the csv module handle line endings inside quoted fields itself.
+    with open(filename, 'rt', newline='') as f:
+        reader = csv.reader(f, **CSV_PARAMETERS)
+        # A default keeps an empty file from raising StopIteration inside the
+        # generator, which Python 3.7+ would turn into a RuntimeError (PEP 479).
+        next(reader, None)
+        for line_number, row in enumerate(reader, start=2):
+            signature = row[4]
+            directory = row[5]
+            url = row[-1]
+            url_match = URL_REGEX.fullmatch(url)
+            if url_match is None:
+                if url:
+                    error = 'Line {} in CSV file {} contains a malformed URL "{}"'
+                    raise ValueError(error.format(line_number, filename, url))
+                warning = 'Line {} in CSV file {} contains an empty URL, skipping'
+                logger.warning(warning.format(line_number, filename))
+                skipped += 1
+                continue
+            book_id = int(url_match.group('book_id'))
+            successfully_read += 1
+            yield (book_id, {'signature': signature, 'directory': directory})
+    info = 'Successfully read {} records from CSV file {}, skipped {}'
+    logger.info(info.format(successfully_read, filename, skipped))
+
+
+def read_sql():
+    """Yield ``(book_id, {'number_of_pages': n})`` for every book in table OCR_stranka.
+
+    The cursor and connection are closed once all rows are fetched, before yielding.
+    """
+    logger = logging.getLogger('read_sql')
+    query = '''
+        SELECT kniha_id, COUNT(stranka_cislo) AS pocet_stran
+        FROM OCR_stranka
+        GROUP BY kniha_id
+        ORDER BY kniha_id
+    '''
+    database = connect(**SQL_PARAMETERS)
+    try:
+        cursor = database.cursor()
+        try:
+            cursor.execute(query)
+            rows = cursor.fetchall()
+        finally:
+            cursor.close()
+    finally:
+        database.close()
+    for row in rows:
+        book_id, number_of_pages = int(row[0]), int(row[1])
+        if number_of_pages < 1:
+            logger.warning('Book with ID {} has {} pages'.format(book_id, number_of_pages))
+        yield (book_id, {'number_of_pages': number_of_pages})
+    logger.info('Successfully read {} records from SQL database'.format(len(rows)))
+
+
+def read_filenames(filename=IMAGE_FILENAMES_LIST_FILENAME):
+    """Yield each line of the text file with trailing CR/LF characters removed.
+
+    Logs the number of lines read once the file has been exhausted.
+    """
+    logger = logging.getLogger('read_filenames')
+    total = 0
+    with open(filename, 'rt') as lines:
+        # Counting from 1 leaves ``total`` equal to the number of lines yielded.
+        for total, raw_line in enumerate(lines, start=1):
+            yield raw_line.rstrip('\r\n')
+    message = 'Successfully read {} records from text file {}'
+    logger.info(message.format(total, filename))
+
+
+def preview(iterable, preview_size=5, separator=', '):
+    """Return the first ``preview_size`` sorted elements joined by ``separator``.
+
+    An ellipsis is appended only when some elements have been left out.
+    """
+    elements = [str(element) for element in sorted(iterable)]
+    shown = elements[:preview_size]
+    omitted = ['…'] if len(elements) > len(shown) else []
+    return separator.join(shown + omitted)
diff --git a/scripts/configuration.py b/scripts/configuration.py
new file mode 100644
index 00000000..c636f229
--- /dev/null
+++ b/scripts/configuration.py
@@ -0,0 +1,21 @@
+# -*- coding:utf-8 -*-
+
+import csv
+import ctypes
+
+# Dialect options passed to csv.reader() when parsing the CSV database dump.
+CSV_PARAMETERS = dict(delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
+
+# Raise the csv field size limit to half the largest value of a C unsigned long.
+csv.field_size_limit(ctypes.c_ulong(-1).value // 2)
+
+# Default text file with one image filename per line.
+IMAGE_FILENAMES_LIST_FILENAME = 'input_filenames_filtered'
+
+# Default CSV database dump.
+CSV_DATABASE_FILENAME = 'dump.csv'
+
+# Keyword arguments passed to MySQLdb.connect().
+SQL_PARAMETERS = dict(
+    user='user', password='password', db='CMS_archiv',
+)
diff --git a/scripts/produce_ground_truth.py b/scripts/produce_ground_truth.py
new file mode 100644
index 00000000..409e78b0
--- /dev/null
+++ b/scripts/produce_ground_truth.py
@@ -0,0 +1,93 @@
+# -*- coding:utf-8 -*-
+
+import logging
+
+
+LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+# Runs before the databases below are read, presumably so their INFO records are emitted.
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)
+
+
+from gensim.utils import deaccent
+
+from .common import preview, read_csv, read_filenames, read_sql
+
+# NOTE: the three inputs are read eagerly, every time this module is imported.
+CSV_DATABASE = dict(read_csv())
+SQL_DATABASE = dict(read_sql())
+IMAGE_FILENAMES = list(read_filenames())
+
+
+def check_sanity():
+    """Cross-check the CSV database, the SQL database, and the image filenames.
+
+    Logs a warning for book IDs present in only one of the two databases, and
+    for every book in both databases whose directory and signature match no
+    image filename, or fewer image filenames than the book's number of pages.
+    Filenames, directories, and signatures are compared after deaccent().
+    """
+    logger = logging.getLogger('check_sanity')
+
+    # Build each set of book IDs once; every comparison below reuses them.
+    csv_ids = set(CSV_DATABASE)
+    sql_ids = set(SQL_DATABASE)
+
+    only_in_sql = sql_ids - csv_ids
+    if only_in_sql:
+        warning = 'The following book ids ({}) exist only in SQL: {}'.format(
+            len(only_in_sql),
+            preview(only_in_sql),
+        )
+        logger.warning(warning)
+
+    only_in_csv = csv_ids - sql_ids
+    if only_in_csv:
+        warning = 'The following book ids ({}) exist only in CSV: {}'.format(
+            len(only_in_csv),
+            preview(only_in_csv),
+        )
+        logger.warning(warning)
+
+    # On a key clash the CSV fields win, because they are unpacked last.
+    database = {
+        book_id: {**SQL_DATABASE[book_id], **CSV_DATABASE[book_id]}
+        for book_id in csv_ids & sql_ids
+    }
+    info = 'Intersection of CSV and SQL databases contains {} records'.format(len(database))
+    logger.info(info)
+
+    deaccented_image_filenames = [deaccent(name) for name in IMAGE_FILENAMES]
+    for book_id, book_information in database.items():
+        directory = book_information['directory']
+        signature = book_information['signature']
+        number_of_pages = book_information['number_of_pages']
+        book_description = 'book with ID {} in directory {} with signature {}, supposed to have {} pages'.format(
+            book_id,
+            directory,
+            signature,
+            number_of_pages,
+        )
+        # Build the two path fragments once per book rather than once per filename.
+        directory_part = '/{}/'.format(deaccent(directory))
+        signature_part = '/{}/'.format(deaccent(signature))
+        matching_image_filenames = [
+            image_filename
+            for image_filename in deaccented_image_filenames
+            if directory_part in image_filename and signature_part in image_filename
+        ]
+        if not matching_image_filenames:
+            warning = 'No matching image filenames for {}'.format(book_description)
+            logger.warning(warning)
+        elif len(matching_image_filenames) < number_of_pages:
+            warning = '{} matching image filenames for {}:\n - {}'.format(
+                len(matching_image_filenames),
+                book_description,
+                preview(matching_image_filenames, separator='\n - '),
+            )
+            logger.warning(warning)
+
+# Run as ``python -m scripts.produce_ground_truth``; the relative import needs the package.
+if __name__ == '__main__':
+    check_sanity()
-- 
GitLab