Commit 3fab93ca authored by Marek Medved
Browse files

query specific parts of word

parent 3154c4a6
Loading
Loading
Loading
Loading
+17 −13
Original line number Diff line number Diff line
@@ -13,7 +13,7 @@ def get_ctx(data, vocabulary, part=''):
            p_content = []
            for w_id_cx in p:
                if part:
                    p_content.append(id2word(vocabulary, w_id_cx)[part])
                    p_content.append(id2word(vocabulary, w_id_cx, part))
                else:
                    p_content.append(id2word(vocabulary, w_id_cx))
            if sentence_phrases.get(ctx_type):
@@ -28,7 +28,7 @@ def get_senence(data, vocabulary, part=''):
    sent = []
    for w_id in data:
        if part:
            sent.append(id2word(vocabulary, w_id)[part])
            sent.append(id2word(vocabulary, w_id, part))
        else:
            sent.append(id2word(vocabulary, w_id))

@@ -50,7 +50,7 @@ def get_content_ctx(url, kb, vocabulary, part=''):
    return result


def get_record(db, record_id):
def get_record(db, record_id, word_parts=''):
    record = db.get_record(record_id)
    vocabulary, qa_type_dict, kb = db.get_dicts()
    """
@@ -70,13 +70,13 @@ def get_record(db, record_id):
    data['rec_id'] = record.rec_id
    data['q_type'] = id2qt(qa_type_dict, record.q_type)
    data['a_type'] = id2qt(qa_type_dict, record.a_type)
    data['question'] = get_content(record.question, vocabulary)
    data['a_sel'] = get_content(record.answer_selection, vocabulary)
    data['question'] = get_content(record.question, vocabulary, word_parts)
    data['a_sel'] = get_content(record.answer_selection, vocabulary, word_parts)
    data['a_sel_pos'] = record.text_answer_position
    data['a_ext'] = get_content(record.answer_extraction, vocabulary)
    data['a_ext'] = get_content(record.answer_extraction, vocabulary, word_parts)
    data['similar_answers'] = record.similar_answers
    data['text_title'] = kb.url2doc.get(record.text)['title']
    data['text'] = get_content_ctx(record.text, kb, vocabulary)
    data['text'] = get_content_ctx(record.text, kb, vocabulary, word_parts)

    return data

@@ -90,17 +90,17 @@ def print_record(db, record_id):
    print(f'a_type: {id2qt(qa_type_dict, record.a_type)}')

    print('question:')
    for i in get_content(record.question, vocabulary, part='word'):
    for i in get_content(record.question, vocabulary, part='w'):
        print(f'\ts: {" ".join(i)}')

    print('a_sel:')
    for i in get_content(record.answer_selection, vocabulary, part='word'):
    for i in get_content(record.answer_selection, vocabulary, part='w'):
        print(f'\ts: {" ".join(i)}')

    print(f'a_sel_pos: {record.text_answer_position}')

    print('a_ext:')
    for i in get_content(record.answer_extraction, vocabulary, part='word'):
    for i in get_content(record.answer_extraction, vocabulary, part='w'):
        print(f'\ts: {" ".join(i)}')

    print('similar_answers:')
@@ -111,11 +111,11 @@ def print_record(db, record_id):
        #         print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}')

    print(f'text_title:')
    for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="word"):
    for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="w"):
        print(f'\ts: {" ".join(i)}')

    print('text:')
    for sent_and_phrs in get_content_ctx(record.text, kb, vocabulary, part='word'):
    for sent_and_phrs in get_content_ctx(record.text, kb, vocabulary, part='w'):
        print(f'\ts: {" ".join(sent_and_phrs["sent"])}')
        for key, phrs in sent_and_phrs['ctx'].items():
            print(f'\t\tctx_type: {key}')
@@ -135,12 +135,16 @@ def main():
    parser.add_argument('--simple', action='store_true',
                        required=False, default=False,
                        help='Simple output')
    parser.add_argument('--word_parts', type=str,
                        required=False, default='',
                        help='Which word parts will be provided. Semicolon separated. For example "w;l;t;v100" '
                             'will return word, lemma, tag and 100 dim. vector')
    args = parser.parse_args()
    db = SqadDb(args.database_file, read_only=True)
    if args.simple:
        print_record(db, args.record_id)
    else:
        pprint(get_record(db, args.record_id))
        pprint(get_record(db, args.record_id, args.word_parts))
    db.close()


+17 −8
Original line number Diff line number Diff line
@@ -24,14 +24,23 @@ def word2id(vocabulary, word, lemma, tag, w2v):
        return key


def id2word(vocabulary, key):
def id2word(vocabulary, key, parts=''):
    """Return the requested parts of the vocabulary entry stored under *key*.

    ``parts`` is a semicolon-separated list of part codes, e.g.
    ``"w;l;t;v100"``.  Recognised codes and the result keys they produce:

    * ``w``    -> ``'word'``  from ``vocabulary.id2wlt[key]['word']``
    * ``l``    -> ``'lemma'`` from ``vocabulary.id2wlt[key]['lemma']``
    * ``t``    -> ``'tag'``   from ``vocabulary.id2wlt[key]['tag']``
    * ``v100`` -> ``'v100'``  from ``vocabulary.vectors[key][0]``
    * ``v300`` -> ``'v300'``  from ``vocabulary.vectors[key][1]``
    * ``v500`` -> ``'v500'``  from ``vocabulary.vectors[key][2]``
    * ``id``   -> ``'id'``    the *key* itself

    (The ``v*`` names presumably denote the vector dimensionality -- confirm
    against the code that fills ``vocabulary.vectors``.)

    Whitespace around individual codes is ignored, so ``"w; l"`` selects the
    same parts as ``"w;l"``.  A spec that contains no code at all (empty
    string, only whitespace, only separators) selects every part.  Codes
    that are not recognised are ignored.

    :param vocabulary: object exposing the ``id2wlt`` and ``vectors`` mappings
    :param key: identifier of the word in ``vocabulary``
    :param parts: semicolon-separated part codes; blank means "all parts"
    :return: dict holding only the selected parts
    :raises KeyError: propagated from the lookups when *key* is unknown and a
        part that needs the lookup was requested (for plain-dict mappings)
    """
    # Normalise every code on its own; stripping only the whole spec would
    # leave " l" in "w; l" unmatched and silently drop that part.
    requested = {code.strip() for code in parts.split(';')}
    requested.discard('')
    select_all = not requested

    result = {}

    # Lookups stay inside the conditions on purpose: a part that was not asked
    # for must not touch ``vocabulary`` (e.g. ``parts='id'`` needs no lookup).
    for code, field in (('w', 'word'), ('l', 'lemma'), ('t', 'tag')):
        if select_all or code in requested:
            result[field] = vocabulary.id2wlt[key][field]

    for slot, code in enumerate(('v100', 'v300', 'v500')):
        if select_all or code in requested:
            result[code] = vocabulary.vectors[key][slot]

    if select_all or 'id' in requested:
        result['id'] = key

    return result