From 023997e40f8716a7d1d86c7dc5dd3f9f5b7906d6 Mon Sep 17 00:00:00 2001
From: Vit Novotny <witiko@mail.muni.cz>
Date: Sat, 27 Jun 2020 22:20:25 +0200
Subject: [PATCH] Add Python scripts for reading SQL, CSV, and TXT databases

---
 requirements.txt                |   2 +
 scripts/__init__.py             |   0
 scripts/common.py               | 122 ++++++++++++++++++++++++++++++++
 scripts/configuration.py        |  21 +++++++
 scripts/produce_ground_truth.py |  94 +++++++++++++++++++++++++++
 5 files changed, 239 insertions(+)
 create mode 100644 requirements.txt
 create mode 100644 scripts/__init__.py
 create mode 100644 scripts/common.py
 create mode 100644 scripts/configuration.py
 create mode 100644 scripts/produce_ground_truth.py

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..bd2ee283
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+gensim~=3.8.3
+mysqlclient~=1.4.6
diff --git a/scripts/__init__.py b/scripts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/common.py b/scripts/common.py
new file mode 100644
index 00000000..294ee36c
--- /dev/null
+++ b/scripts/common.py
@@ -0,0 +1,122 @@
+# -*- coding:utf-8 -*-
+
+import csv
+import logging
+import re
+
+from MySQLdb import connect
+
+from .configuration import CSV_PARAMETERS, SQL_PARAMETERS, CSV_DATABASE_FILENAME, IMAGE_FILENAMES_LIST_FILENAME
+
+
+URL_REGEX = re.compile(r'https://sources.cms.flu.cas.cz/.*&bookid=(?P<book_id>[0-9]*).*')
+
+
+def read_csv(filename=CSV_DATABASE_FILENAME):
+    """Yield ``(book_id, record)`` pairs read from a CSV database dump.
+
+    The first row is skipped (presumably a header). Rows with an empty URL
+    in the last column are skipped with a warning; any other URL that does
+    not match ``URL_REGEX`` raises ``ValueError``.
+    """
+    logger = logging.getLogger('read_csv')
+    successfully_read = 0
+    skipped = 0
+    with open(filename, 'rt') as f:
+        reader = csv.reader(f, **CSV_PARAMETERS)
+        next(reader)
+        for line_number, row in enumerate(reader):
+            line_number += 2
+            signature = row[4]
+            directory = row[5]
+            url = row[-1]
+            url_match = re.fullmatch(URL_REGEX, url)
+            if url_match is None:
+                if not url:
+                    warning = 'Line {} in CSV file {} contains an empty URL, skipping'.format(
+                        line_number,
+                        filename,
+                    )
+                    logger.warning(warning)
+                    skipped += 1
+                    continue
+                else:
+                    error = 'Line {} in CSV file {} contains a malformed URL "{}"'.format(
+                        line_number,
+                        filename,
+                        url,
+                    )
+                    raise ValueError(error)
+            book_id = int(url_match.group('book_id'))
+            successfully_read += 1
+            yield (book_id, {'signature': signature, 'directory': directory})
+    info = 'Successfully read {} records from CSV file {}, skipped {}'.format(
+        successfully_read,
+        filename,
+        skipped,
+    )
+    logger.info(info)
+
+
+def read_sql():
+    """Yield ``(book_id, record)`` pairs with per-book page counts from SQL."""
+    logger = logging.getLogger('read_sql')
+    successfully_read = 0
+    query = '''
+        SELECT kniha_id, COUNT(stranka_cislo) AS pocet_stran
+        FROM OCR_stranka
+        GROUP BY kniha_id
+        ORDER BY kniha_id
+    '''
+    database = connect(**SQL_PARAMETERS)
+    try:
+        cursor = database.cursor()
+        cursor.execute(query)
+        row_numbers = range(cursor.rowcount)
+        for row_number in row_numbers:
+            row = cursor.fetchone()
+            book_id = int(row[0])
+            number_of_pages = int(row[1])
+            if number_of_pages < 1:
+                warning = 'Book with ID {} has {} pages'.format(
+                    book_id,
+                    number_of_pages,
+                )
+                logger.warning(warning)
+            successfully_read += 1
+            yield (book_id, {'number_of_pages': number_of_pages})
+    finally:
+        # Release the connection even if the consumer stops iterating early.
+        database.close()
+    info = 'Successfully read {} records from SQL database'.format(
+        successfully_read,
+    )
+    logger.info(info)
+
+
+def read_filenames(filename=IMAGE_FILENAMES_LIST_FILENAME):
+    """Yield the lines of a text file with trailing CR/LF characters removed."""
+    logger = logging.getLogger('read_filenames')
+    successfully_read = 0
+    with open(filename, 'rt') as f:
+        for line in f:
+            line = line.rstrip('\r\n')
+            successfully_read += 1
+            yield line
+    info = 'Successfully read {} records from text file {}'.format(
+        successfully_read,
+        filename
+    )
+    logger.info(info)
+
+
+def preview(iterable, preview_size=5, separator=', '):
+    """Join the first `preview_size` sorted elements, followed by an ellipsis."""
+    preview = separator.join(
+        [
+            str(element)
+            for element
+            in sorted(iterable)
+        ][:preview_size] + ['…']
+    )
+    return preview
diff --git a/scripts/configuration.py b/scripts/configuration.py
new file mode 100644
index 00000000..c636f229
--- /dev/null
+++ b/scripts/configuration.py
@@ -0,0 +1,21 @@
+# -*- coding:utf-8 -*-
+
+import csv
+import ctypes
+
+CSV_PARAMETERS = {
+    'delimiter': ',',
+    'quotechar': '"',
+    'quoting': csv.QUOTE_MINIMAL,
+}
+csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
+
+IMAGE_FILENAMES_LIST_FILENAME = 'input_filenames_filtered'
+
+CSV_DATABASE_FILENAME = 'dump.csv'
+
+SQL_PARAMETERS = {
+    'user': 'user',
+    'password': 'password',
+    'db': 'CMS_archiv',
+}
diff --git a/scripts/produce_ground_truth.py b/scripts/produce_ground_truth.py
new file mode 100644
index 00000000..409e78b0
--- /dev/null
+++ b/scripts/produce_ground_truth.py
@@ -0,0 +1,94 @@
+# -*- coding:utf-8 -*-
+
+import logging
+
+
+LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)
+
+
+from gensim.utils import deaccent
+
+from .common import preview, read_csv, read_filenames, read_sql
+
+
+CSV_DATABASE = dict(read_csv())
+SQL_DATABASE = dict(read_sql())
+IMAGE_FILENAMES = list(read_filenames())
+
+
+def check_sanity():
+    """Log warnings about inconsistencies between the CSV, SQL, and image filename data."""
+    logger = logging.getLogger('check_sanity')
+
+    only_in_sql = set(SQL_DATABASE) - set(CSV_DATABASE)
+    if only_in_sql:
+        warning = 'The following book ids ({}) exist only in SQL: {}'.format(
+            len(only_in_sql),
+            preview(only_in_sql),
+        )
+        logger.warning(warning)
+
+    only_in_csv = set(CSV_DATABASE) - set(SQL_DATABASE)
+    if only_in_csv:
+        warning = 'The following book ids ({}) exist only in CSV: {}'.format(
+            len(only_in_csv),
+            preview(only_in_csv),
+        )
+        logger.warning(warning)
+
+    book_ids = set(CSV_DATABASE) & set(SQL_DATABASE)
+    database = {
+        book_id: {
+            **SQL_DATABASE[book_id],
+            **CSV_DATABASE[book_id],
+        }
+        for book_id
+        in book_ids
+    }
+    info = 'Intersection of CSV and SQL databases contains {} records'.format(
+        len(database),
+    )
+    logger.info(info)
+
+    deaccented_image_filenames = list(map(deaccent, IMAGE_FILENAMES))
+    for book_id, book_information in database.items():
+        directory = book_information['directory']
+        signature = book_information['signature']
+        number_of_pages = book_information['number_of_pages']
+        book_description = 'book with ID {} in directory {} with signature {}, supposed to have {} pages'.format(
+            book_id,
+            directory,
+            signature,
+            number_of_pages,
+        )
+        deaccented_directory = deaccent(directory)
+        deaccented_signature = deaccent(signature)
+        matching_image_filenames = [
+            image_filename
+            for image_filename
+            in deaccented_image_filenames
+            if (
+                '/{}/'.format(deaccented_directory) in image_filename and
+                '/{}/'.format(deaccented_signature) in image_filename
+            )
+        ]
+        if not matching_image_filenames:
+            warning = 'No matching image filenames for {}'.format(
+                book_description,
+            )
+            logger.warning(warning)
+        elif len(matching_image_filenames) < number_of_pages:
+            warning = '{} matching image filenames for {}:\n - {}'.format(
+                len(matching_image_filenames),
+                book_description,
+                preview(matching_image_filenames, separator='\n - '),
+            )
+            logger.warning(warning)
+
+
+if __name__ == '__main__':
+    check_sanity()
-- 
GitLab