Commit 8616ccc1 authored by Marek Medved's avatar Marek Medved
Browse files

utils

parent 3a197f17
Loading
Loading
Loading
Loading
+174 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# coding: utf-8
# Author: Marek Medved, marek.medved@sketchengine.eu, Lexical Computing CZ
from sqad_db import SqadDb
import sys
import persistent.list
from BTrees.OOBTree import BTree
from types import ModuleType, FunctionType
from gc import get_referents
# from numbers import Number
# from collections import deque
# from collections.abc import Set, Mapping

# ZERO_DEPTH_BASES = (str, int, float, bytes, Number, range, bytearray)

def bytesto(bytes, to='k', bsize=1024):
    """Scale a byte count down to a larger unit.

    Args:
        bytes: The amount to convert; anything accepted by ``float()``.
        to: Target unit, one of ``'k'``, ``'m'``, ``'g'``, ``'t'``, ``'p'``
            or ``'e'``.
        bsize: Size of one unit step (1024 by default).

    Returns:
        The value as a float, divided by ``bsize`` once per unit step.

    Raises:
        KeyError: If ``to`` is not one of the known unit letters.
    """
    unit_steps = {'k': 1, 'm': 2, 'g': 3, 't': 4, 'p': 5, 'e': 6}
    result = float(bytes)
    remaining = unit_steps[to]
    # Divide step by step (instead of by bsize ** n) to keep the exact
    # floating point behaviour of successive divisions.
    while remaining > 0:
        result /= bsize
        remaining -= 1
    return result


# def getsize(obj_0):
#     """Recursively iterate to sum size of object & members."""
#     _seen_ids = set()
#
#     def inner(obj):
#         obj_id = id(obj)
#         if obj_id in _seen_ids:
#             print('seen IDS')
#             return 0
#         _seen_ids.add(obj_id)
#         size = sys.getsizeof(obj)
#         if isinstance(obj, ZERO_DEPTH_BASES):
#             pass  # bypass remaining control flow and return
#         elif isinstance(obj, (tuple, list, Set, deque)):
#             print('here')
#             size += sum(inner(i) for i in obj)
#         elif isinstance(obj, Mapping) or hasattr(obj, 'items'):
#             size += sum(inner(k) + inner(v) for k, v in getattr(obj, 'items')())
#         # Check for custom object instances - may subclass above too
#         if hasattr(obj, '__dict__'):
#             size += inner(vars(obj))
#         if hasattr(obj, '__slots__'):  # can have __slots__ with __dict__
#             size += sum(inner(getattr(obj, s)) for s in obj.__slots__ if hasattr(obj, s))
#         return size
#
#     return inner(obj_0)

# Kinds of objects that are never measured or followed:
#  - classes: every custom object references its class,
#  - functions: they reference far too much, including modules,
#  - modules themselves.
BLACKLIST = (type, ModuleType, FunctionType)


def getsize(obj):
    """Return the total ``sys.getsizeof`` of ``obj`` and everything it refers to.

    The object graph is walked level by level through ``gc.get_referents``.
    Each object is counted once (tracked by ``id``), and objects whose type
    is listed in ``BLACKLIST`` are neither counted nor followed.

    Raises:
        TypeError: If ``obj`` itself is an instance of a blacklisted type.
    """
    if isinstance(obj, BLACKLIST):
        raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))

    visited = set()
    total = 0
    frontier = [obj]
    while frontier:
        fresh = []
        for candidate in frontier:
            if isinstance(candidate, BLACKLIST) or id(candidate) in visited:
                continue
            visited.add(id(candidate))
            total += sys.getsizeof(candidate)
            fresh.append(candidate)
        # Next level: everything directly referenced by the newly counted objects.
        frontier = get_referents(*fresh)
    return total

def sentence_size(data, part):
    """Sum the in-memory size of every field of the sentences in ``data``.

    Args:
        data: Iterable of sentence mappings (objects providing ``items()``).
        part: Label of the record part being measured; used as the prefix
            of every key in the result.

    Returns:
        dict mapping ``'{part}/sent/{key}'`` to the accumulated size in
        bytes reported by ``getsize()``.  The ``'ctx'`` field is broken
        down further into ``'{part}/sent/ctx/{c_type}'`` entries, one per
        context type.
    """
    size_dict = {}

    def add(label, amount):
        # Accumulate under `label`, starting from zero on first use.
        size_dict[label] = size_dict.get(label, 0) + amount

    for sentence in data:
        for key, value in sentence.items():
            # Turn ZODB containers into plain builtins before measuring
            # (presumably so the content itself gets counted -- TODO confirm).
            if isinstance(value, BTree):
                value = dict(value)
            elif isinstance(value, persistent.list.PersistentList):
                value = list(value)

            if key != 'ctx':
                add(f'{part}/sent/{key}', getsize(value))
                continue

            # Context: c_type -> sentences -> sentence parts -> {name: items}
            for c_type, content in value.items():
                # Key by the context type; the builtin `type` used here
                # before collapsed all context types into a single entry.
                label = f'{part}/sent/{key}/{c_type}'
                for prev_s in content:
                    for sent_part in prev_s:
                        for r in sent_part.values():
                            add(label, getsize(list(r)))

    return size_dict


def get_record_content(db_name, per_record=False):
    """Measure how much memory each part of every record in a database takes.

    Opens the database read-only, walks all records and accumulates the
    size of each record part, converted with ``bytesto()`` using its
    default unit.  The totals over all records are printed at the end.

    Args:
        db_name: File name of the SQAD database to open.
        per_record: When true, also print the sizes of each single record.
    """
    db = SqadDb(file_name=db_name, read_only=True)
    _vocabulary, _qa_type_dict, kb = db.get_dicts()

    # Parts that hold sentences and are broken down by sentence_size().
    sentence_parts = ('question', 'a_sel', 'a_ext', 'text', 'text_title')
    total_sizes = {}

    def account(label, size_in_bytes):
        # Add one measurement to the totals and optionally report it.
        converted = bytesto(size_in_bytes)
        total_sizes[label] = total_sizes.get(label, 0) + converted
        if per_record:
            print(f'{label}: {converted:.2f}')

    print(f'Database: {db_name}')

    for record_id in db.get_all_records_id():
        record = db.get_record(record_id)
        document = kb.url2doc.get(record.text)

        data_in_db = {
            'rec_id': record.rec_id,
            'q_type': record.q_type,
            'a_type': record.a_type,
            'question': list(record.question),
            'a_sel': list(record.answer_selection),
            'a_sel_pos': record.text_answer_position,
            'a_ext': list(record.answer_extraction),
            'similar_answers': dict(record.similar_answers),
            'text_title': list(document['title']),
            'text': document['text'],
        }

        if per_record:
            print(f'================= {record_id} DB =================')

        for key, value in data_in_db.items():
            if key in sentence_parts:
                for sent_part, part_size in sentence_size(value, key).items():
                    account(sent_part, part_size)
            else:
                account(key, getsize(value))
        # NOTE: a leftover sys.exit() used to stop here after the first
        # record, so the totals below were never reached.

    # Iterate over items(); iterating the dict itself yields keys only.
    for key, value in total_sizes.items():
        print(f'{key}: {value:.2f}')


def main():
    """Read the command-line options and report the sizes of the database."""
    from argparse import ArgumentParser

    cli = ArgumentParser(description='Get DB sizes')
    cli.add_argument(
        '--per_record',
        action='store_true',
        required=False,
        default=False,
        help='Sizes per record',
    )
    cli.add_argument(
        '-d', '--database',
        type=str,
        required=True,
        help='Database name',
    )

    options = cli.parse_args()
    get_record_content(options.database, options.per_record)


# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()

utils/get_vector.py

0 → 120000
+1 −0
Original line number Diff line number Diff line
../get_vector.py
 No newline at end of file
+62 −0
Original line number Diff line number Diff line
#!/usr/bin/env python3
# coding: utf-8
# Author: Marek Medved, marek.medved@sketchengine.eu, Lexical Computing CZ
import sys
from query_database import get_content
from sqad_db import SqadDb


def print_record(db, record_id, old, max_number):
    """Write one record and its best-scored similar sentences to stdout.

    Prints the record id, the question (``q:``), the answer selection
    (``a:``), the answer position (``a_sel_pos:``) and the answer
    extraction (``e:``).  For the ``'sents_similar'`` entry of the record's
    similar answers, the first ``max_number`` (sentence index, score) pairs
    are printed together with the matching sentence of the record's text.

    Args:
        db: Open database providing ``get_record()`` and ``get_dicts()``.
        record_id: Id of the record to print.
        old: Passed through unchanged to ``get_content()``.
        max_number: Maximum number of similar sentences to print.
    """
    record = db.get_record(record_id)
    vocabulary, _qa_type_dict, kb = db.get_dicts()
    out = sys.stdout.write

    def words(item):
        # Join the word forms of one decoded sentence.
        return " ".join(x["word"] for x in item["sent"])

    def write_part(label, sentences):
        # Write the label followed by one line per decoded sentence.
        out(label)
        for item in get_content(sentences, vocabulary, old, part='w'):
            out(f'{words(item)}\n')

    out(f'{record.rec_id}\n')
    write_part('q: ', record.question)
    write_part('a: ', record.answer_selection)
    out(f'a_sel_pos: {record.text_answer_position}\n')
    write_part('e: ', record.answer_extraction)

    # The document text is decoded lazily and at most once; it used to be
    # decoded again for every single similar sentence.
    text_sentences = None
    for name, value in record.similar_answers.items():
        if name != 'sents_similar':
            continue
        out(f'{name}:\n')
        for s_idx, score in value[:max_number]:
            if text_sentences is None:
                text_sentences = list(get_content(
                    kb.url2doc.get(record.text)['text'], vocabulary, old,
                    part='w'))
            for idx, sent_and_phrs in enumerate(text_sentences):
                if idx == s_idx:
                    out(f'{idx:5.0f} ({score:.3f}): {words(sent_and_phrs)}\n')


def main():
    """Print the similar sentences for each record id given on the command line.

    Record ids are passed as one string separated by ``;``.  Surrounding
    whitespace is ignored and empty ids (e.g. from a trailing ``;``) are
    skipped.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Print similar sentences for record')
    parser.add_argument('-d', '--database_file', type=str,
                        required=False, default='',
                        help='Database file name')

    parser.add_argument('-r', '--record_ids', type=str,
                        required=True,
                        help='Record id')
    parser.add_argument('-n', '--max_number', type=int,
                        required=False, default=20,
                        help='Number of best scored similar sentences')
    args = parser.parse_args()

    # Open the database once instead of once per record id.
    db = SqadDb(file_name=args.database_file, read_only=True)
    for raw_id in args.record_ids.split(';'):
        rid = raw_id.strip()
        if not rid:
            continue
        print_record(db, rid, False, args.max_number)
        print('================================================================')


# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
+1 −0
Original line number Diff line number Diff line
../query_database.py
 No newline at end of file

utils/sqad_db.py

0 → 120000
+1 −0
Original line number Diff line number Diff line
../sqad_db.py
 No newline at end of file
Loading