Skip to content
Snippets Groups Projects
Commit 023997e4 authored by Vít Starý Novotný's avatar Vít Starý Novotný
Browse files

Add Python scripts for reading SQL, CSV, and TXT databases

parent a69d6a48
No related branches found
No related tags found
No related merge requests found
gensim~=3.8.3
mysqlclient~=1.4.6
# -*- coding:utf-8 -*-
import csv
import logging
import re
from MySQLdb import connect
from .configuration import CSV_PARAMETERS, SQL_PARAMETERS, CSV_DATABASE_FILENAME, IMAGE_FILENAMES_LIST_FILENAME
URL_REGEX = re.compile(r'https://sources.cms.flu.cas.cz/.*&bookid=(?P<book_id>[0-9]*).*')
def read_csv(filename=CSV_DATABASE_FILENAME):
    """Read book records from the CSV database dump.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read. Defaults to ``CSV_DATABASE_FILENAME``.

    Yields
    ------
    tuple
        ``(book_id, {'signature': ..., 'directory': ...})`` for every row
        whose URL matches ``URL_REGEX``. Rows with an empty URL are skipped
        with a warning.

    Raises
    ------
    ValueError
        If a row contains a non-empty URL that does not match ``URL_REGEX``.
    """
    logger = logging.getLogger('read_csv')
    successfully_read = 0
    skipped = 0
    # newline='' is required by the csv module so that embedded newlines
    # inside quoted fields are parsed correctly.
    with open(filename, 'rt', newline='') as f:
        reader = csv.reader(f, **CSV_PARAMETERS)
        next(reader)  # skip the header row
        # start=2: line 1 is the header, so the first data row is file line 2.
        for line_number, row in enumerate(reader, start=2):
            signature = row[4]
            directory = row[5]
            url = row[-1]
            url_match = URL_REGEX.fullmatch(url)
            if url_match is None:
                if not url:
                    warning = 'Line {} in CSV file {} contains an empty URL, skipping'.format(
                        line_number,
                        filename,
                    )
                    logger.warning(warning)
                    skipped += 1
                    continue
                else:
                    error = 'Line {} in CSV file {} contains a malformed URL "{}"'.format(
                        line_number,
                        filename,
                        url,
                    )
                    raise ValueError(error)
            book_id = int(url_match.group('book_id'))
            successfully_read += 1
            yield (book_id, {'signature': signature, 'directory': directory})
    info = 'Successfully read {} records from CSV file {}, skipped {}'.format(
        successfully_read,
        filename,
        skipped,
    )
    logger.info(info)
def read_sql():
    """Read per-book page counts from the SQL database.

    Executes a grouped count over the ``OCR_stranka`` table and yields one
    record per book, ordered by book id.

    Yields
    ------
    tuple
        ``(book_id, {'number_of_pages': ...})`` for every book. Books with
        fewer than one page are still yielded, but logged with a warning.
    """
    logger = logging.getLogger('read_sql')
    successfully_read = 0
    database = connect(**SQL_PARAMETERS)
    try:
        cursor = database.cursor()
        try:
            cursor.execute('''
SELECT kniha_id, COUNT(stranka_cislo) AS pocet_stran
FROM OCR_stranka
GROUP BY kniha_id
ORDER BY kniha_id
''')
            for row_number in range(cursor.rowcount):
                row = cursor.fetchone()
                book_id = int(row[0])
                number_of_pages = int(row[1])
                if number_of_pages < 1:
                    warning = 'Book with ID {} has {} pages'.format(
                        book_id,
                        number_of_pages,
                    )
                    # logger.warning, not the deprecated logger.warn alias.
                    logger.warning(warning)
                successfully_read += 1
                yield (book_id, {'number_of_pages': number_of_pages})
        finally:
            # Close resources even when the caller abandons the generator.
            cursor.close()
    finally:
        database.close()
    info = 'Successfully read {} records from SQL database'.format(
        successfully_read,
    )
    logger.info(info)
def read_filenames(filename=IMAGE_FILENAMES_LIST_FILENAME):
    """Yield image filenames, one per line of the given text file.

    Parameters
    ----------
    filename : str
        Path of the text file to read. Defaults to
        ``IMAGE_FILENAMES_LIST_FILENAME``.

    Yields
    ------
    str
        Each line of the file with trailing CR/LF characters removed.
    """
    logger = logging.getLogger('read_filenames')
    record_count = 0
    with open(filename, 'rt') as text_file:
        for raw_line in text_file:
            record_count += 1
            yield raw_line.rstrip('\r\n')
    logger.info(
        'Successfully read {} records from text file {}'.format(
            record_count,
            filename,
        )
    )
def preview(iterable, preview_size=5, separator=', '):
    """Return a short textual preview of the elements of *iterable*.

    The elements are sorted, at most *preview_size* of them are stringified,
    and the result is joined by *separator*. A non-empty preview always ends
    with a trailing *separator*; an empty iterable yields ''.
    """
    shown = [str(item) for item in sorted(iterable)[:preview_size]]
    # The trailing empty string makes join() append one final separator.
    shown.append('')
    return separator.join(shown)
# -*- coding:utf-8 -*-
import csv
import ctypes
# Dialect parameters passed to csv.reader() when parsing the database dump.
CSV_PARAMETERS = {
    'delimiter': ',',
    'quotechar': '"',
    'quoting': csv.QUOTE_MINIMAL,
}
# Raise the csv module's per-field size limit as high as the platform
# allows: ctypes.c_ulong(-1).value is ULONG_MAX, so // 2 gives LONG_MAX,
# the largest value csv.field_size_limit() accepts. This prevents errors
# on rows with very large fields.
csv.field_size_limit(int(ctypes.c_ulong(-1).value // 2))
# Text file listing one image filename per line (see read_filenames).
IMAGE_FILENAMES_LIST_FILENAME = 'input_filenames_filtered'
# CSV dump of the book database (see read_csv).
CSV_DATABASE_FILENAME = 'dump.csv'
# Keyword arguments for MySQLdb.connect() (see read_sql).
# NOTE(review): 'user'/'password' look like placeholder credentials —
# confirm they are substituted at deployment time.
SQL_PARAMETERS = {
    'user': 'user',
    'password': 'password',
    'db': 'CMS_archiv',
}
# -*- coding:utf-8 -*-
import logging
# Log line layout: timestamp, logger name, level, message.
LOGGING_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
# Configure logging only when executed as a script, so importing this
# module does not clobber a host application's logging setup.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format=LOGGING_FORMAT)
from gensim.utils import deaccent
from .common import preview, read_csv, read_filenames, read_sql
# NOTE(review): these three reads perform file and database I/O at import
# time as a module side effect — consider deferring them into check_sanity.
CSV_DATABASE = dict(read_csv())  # book_id -> {'signature': ..., 'directory': ...}
SQL_DATABASE = dict(read_sql())  # book_id -> {'number_of_pages': ...}
IMAGE_FILENAMES = list(read_filenames())  # raw image filenames, one per line
def check_sanity():
    """Cross-check the CSV and SQL databases against the image filename list.

    Logs warnings for book ids present in only one of the two databases,
    for books with no matching image filenames, and for books with fewer
    matching image filenames than their recorded number of pages.
    """
    logger = logging.getLogger('check_sanity')
    csv_ids = set(CSV_DATABASE)
    sql_ids = set(SQL_DATABASE)

    only_in_sql = sql_ids - csv_ids
    if only_in_sql:
        logger.warning(
            'The following book ids ({}) exist only in SQL: {}'.format(
                len(only_in_sql),
                preview(only_in_sql),
            )
        )

    only_in_csv = csv_ids - sql_ids
    if only_in_csv:
        logger.warning(
            'The following book ids ({}) exist only in CSV: {}'.format(
                len(only_in_csv),
                preview(only_in_csv),
            )
        )

    # Merge the per-book records; CSV values win on duplicate keys, exactly
    # as with {**sql_record, **csv_record}.
    database = {}
    for book_id in csv_ids & sql_ids:
        merged_record = dict(SQL_DATABASE[book_id])
        merged_record.update(CSV_DATABASE[book_id])
        database[book_id] = merged_record
    logger.info(
        'Intersection of CSV and SQL databases contains {} records'.format(
            len(database),
        )
    )

    # Compare de-accented forms so that accented characters in directory and
    # signature names do not break substring matching.
    deaccented_image_filenames = [deaccent(name) for name in IMAGE_FILENAMES]
    for book_id, book_information in database.items():
        directory = book_information['directory']
        signature = book_information['signature']
        number_of_pages = book_information['number_of_pages']
        book_description = 'book with ID {} in directory {} with signature {}, supposed to have {} pages'.format(
            book_id,
            directory,
            signature,
            number_of_pages,
        )
        directory_needle = '/{}/'.format(deaccent(directory))
        signature_needle = '/{}/'.format(deaccent(signature))
        matching_image_filenames = [
            candidate
            for candidate in deaccented_image_filenames
            if directory_needle in candidate and signature_needle in candidate
        ]
        if not matching_image_filenames:
            logger.warning(
                'No matching image filenames for {}'.format(book_description)
            )
        elif len(matching_image_filenames) < number_of_pages:
            logger.warning(
                '{} matching image filenames for {}:\n - {}'.format(
                    len(matching_image_filenames),
                    book_description,
                    preview(matching_image_filenames, separator='\n - '),
                )
            )
# Script entry point: run the sanity check only when executed directly.
if __name__ == '__main__':
    check_sanity()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment