Loading query_database.py +17 −13 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ def get_ctx(data, vocabulary, part=''): p_content = [] for w_id_cx in p: if part: p_content.append(id2word(vocabulary, w_id_cx)[part]) p_content.append(id2word(vocabulary, w_id_cx, part)) else: p_content.append(id2word(vocabulary, w_id_cx)) if sentence_phrases.get(ctx_type): Loading @@ -28,7 +28,7 @@ def get_senence(data, vocabulary, part=''): sent = [] for w_id in data: if part: sent.append(id2word(vocabulary, w_id)[part]) sent.append(id2word(vocabulary, w_id, part)) else: sent.append(id2word(vocabulary, w_id)) Loading @@ -50,7 +50,7 @@ def get_content_ctx(url, kb, vocabulary, part=''): return result def get_record(db, record_id): def get_record(db, record_id, word_parts=''): record = db.get_record(record_id) vocabulary, qa_type_dict, kb = db.get_dicts() """ Loading @@ -70,13 +70,13 @@ def get_record(db, record_id): data['rec_id'] = record.rec_id data['q_type'] = id2qt(qa_type_dict, record.q_type) data['a_type'] = id2qt(qa_type_dict, record.a_type) data['question'] = get_content(record.question, vocabulary) data['a_sel'] = get_content(record.answer_selection, vocabulary) data['question'] = get_content(record.question, vocabulary, word_parts) data['a_sel'] = get_content(record.answer_selection, vocabulary, word_parts) data['a_sel_pos'] = record.text_answer_position data['a_ext'] = get_content(record.answer_extraction, vocabulary) data['a_ext'] = get_content(record.answer_extraction, vocabulary, word_parts) data['similar_answers'] = record.similar_answers data['text_title'] = kb.url2doc.get(record.text)['title'] data['text'] = get_content_ctx(record.text, kb, vocabulary) data['text'] = get_content_ctx(record.text, kb, vocabulary, word_parts) return data Loading @@ -90,17 +90,17 @@ def print_record(db, record_id): print(f'a_type: {id2qt(qa_type_dict, record.a_type)}') print('question:') for i in get_content(record.question, vocabulary, part='word'): for i in 
get_content(record.question, vocabulary, part='w'): print(f'\ts: {" ".join(i)}') print('a_sel:') for i in get_content(record.answer_selection, vocabulary, part='word'): for i in get_content(record.answer_selection, vocabulary, part='w'): print(f'\ts: {" ".join(i)}') print(f'a_sel_pos: {record.text_answer_position}') print('a_ext:') for i in get_content(record.answer_extraction, vocabulary, part='word'): for i in get_content(record.answer_extraction, vocabulary, part='w'): print(f'\ts: {" ".join(i)}') print('similar_answers:') Loading @@ -111,11 +111,11 @@ def print_record(db, record_id): # print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}') print(f'text_title:') for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="word"): for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="w"): print(f'\ts: {" ".join(i)}') print('text:') for sent_and_phrs in get_content_ctx(record.text, kb, vocabulary, part='word'): for sent_and_phrs in get_content_ctx(record.text, kb, vocabulary, part='w'): print(f'\ts: {" ".join(sent_and_phrs["sent"])}') for key, phrs in sent_and_phrs['ctx'].items(): print(f'\t\tctx_type: {key}') Loading @@ -135,12 +135,16 @@ def main(): parser.add_argument('--simple', action='store_true', required=False, default=False, help='Simple output') parser.add_argument('--word_parts', type=str, required=False, default='', help='Which word parts will be provided. Semicolon separated. For example "w;l;t;v100" ' 'will return word, lemma, tag and 100 dim. 
vector') args = parser.parse_args() db = SqadDb(args.database_file, read_only=True) if args.simple: print_record(db, args.record_id) else: pprint(get_record(db, args.record_id)) pprint(get_record(db, args.record_id, args.word_parts)) db.close() Loading sqad_db.py +17 −8 Original line number Diff line number Diff line Loading @@ -24,14 +24,23 @@ def word2id(vocabulary, word, lemma, tag, w2v): return key def id2word(vocabulary, key): def id2word(vocabulary, key, parts=''): result = {} word_parts = parts.strip().split(';') if 'w' in word_parts or not parts: result['word'] = vocabulary.id2wlt[key]['word'] if 'l' in word_parts or not parts: result['lemma'] = vocabulary.id2wlt[key]['lemma'] if 't' in word_parts or not parts: result['tag'] = vocabulary.id2wlt[key]['tag'] if 'v100' in word_parts or not parts: result['v100'] = vocabulary.vectors[key][0] if 'v300' in word_parts or not parts: result['v300'] = vocabulary.vectors[key][1] if 'v500' in word_parts or not parts: result['v500'] = vocabulary.vectors[key][2] if 'id' in word_parts or not parts: result['id'] = key return result Loading Loading
query_database.py +17 −13 Original line number Diff line number Diff line Loading @@ -13,7 +13,7 @@ def get_ctx(data, vocabulary, part=''): p_content = [] for w_id_cx in p: if part: p_content.append(id2word(vocabulary, w_id_cx)[part]) p_content.append(id2word(vocabulary, w_id_cx, part)) else: p_content.append(id2word(vocabulary, w_id_cx)) if sentence_phrases.get(ctx_type): Loading @@ -28,7 +28,7 @@ def get_senence(data, vocabulary, part=''): sent = [] for w_id in data: if part: sent.append(id2word(vocabulary, w_id)[part]) sent.append(id2word(vocabulary, w_id, part)) else: sent.append(id2word(vocabulary, w_id)) Loading @@ -50,7 +50,7 @@ def get_content_ctx(url, kb, vocabulary, part=''): return result def get_record(db, record_id): def get_record(db, record_id, word_parts=''): record = db.get_record(record_id) vocabulary, qa_type_dict, kb = db.get_dicts() """ Loading @@ -70,13 +70,13 @@ def get_record(db, record_id): data['rec_id'] = record.rec_id data['q_type'] = id2qt(qa_type_dict, record.q_type) data['a_type'] = id2qt(qa_type_dict, record.a_type) data['question'] = get_content(record.question, vocabulary) data['a_sel'] = get_content(record.answer_selection, vocabulary) data['question'] = get_content(record.question, vocabulary, word_parts) data['a_sel'] = get_content(record.answer_selection, vocabulary, word_parts) data['a_sel_pos'] = record.text_answer_position data['a_ext'] = get_content(record.answer_extraction, vocabulary) data['a_ext'] = get_content(record.answer_extraction, vocabulary, word_parts) data['similar_answers'] = record.similar_answers data['text_title'] = kb.url2doc.get(record.text)['title'] data['text'] = get_content_ctx(record.text, kb, vocabulary) data['text'] = get_content_ctx(record.text, kb, vocabulary, word_parts) return data Loading @@ -90,17 +90,17 @@ def print_record(db, record_id): print(f'a_type: {id2qt(qa_type_dict, record.a_type)}') print('question:') for i in get_content(record.question, vocabulary, part='word'): for i in 
get_content(record.question, vocabulary, part='w'): print(f'\ts: {" ".join(i)}') print('a_sel:') for i in get_content(record.answer_selection, vocabulary, part='word'): for i in get_content(record.answer_selection, vocabulary, part='w'): print(f'\ts: {" ".join(i)}') print(f'a_sel_pos: {record.text_answer_position}') print('a_ext:') for i in get_content(record.answer_extraction, vocabulary, part='word'): for i in get_content(record.answer_extraction, vocabulary, part='w'): print(f'\ts: {" ".join(i)}') print('similar_answers:') Loading @@ -111,11 +111,11 @@ def print_record(db, record_id): # print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}') print(f'text_title:') for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="word"): for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="w"): print(f'\ts: {" ".join(i)}') print('text:') for sent_and_phrs in get_content_ctx(record.text, kb, vocabulary, part='word'): for sent_and_phrs in get_content_ctx(record.text, kb, vocabulary, part='w'): print(f'\ts: {" ".join(sent_and_phrs["sent"])}') for key, phrs in sent_and_phrs['ctx'].items(): print(f'\t\tctx_type: {key}') Loading @@ -135,12 +135,16 @@ def main(): parser.add_argument('--simple', action='store_true', required=False, default=False, help='Simple output') parser.add_argument('--word_parts', type=str, required=False, default='', help='Which word parts will be provided. Semicolon separated. For example "w;l;t;v100" ' 'will return word, lemma, tag and 100 dim. vector') args = parser.parse_args() db = SqadDb(args.database_file, read_only=True) if args.simple: print_record(db, args.record_id) else: pprint(get_record(db, args.record_id)) pprint(get_record(db, args.record_id, args.word_parts)) db.close() Loading
# sqad_db.py: id2word as it reads after the change shown in this diff hunk.
def id2word(vocabulary, key, parts=''):
    """Return the requested parts of the vocabulary entry stored under *key*.

    Args:
        vocabulary: Object exposing ``id2wlt`` (``key`` -> mapping with
            ``'word'``, ``'lemma'`` and ``'tag'`` items) and ``vectors``
            (``key`` -> indexable whose items 0, 1 and 2 are returned as
            ``'v100'``, ``'v300'`` and ``'v500'``).
        key: Identifier of the word inside ``vocabulary``.
        parts: Semicolon-separated selectors out of ``w``, ``l``, ``t``,
            ``v100``, ``v300``, ``v500`` and ``id`` (for example
            ``"w;l;t;v100"``).  An empty string selects every part.
            Whitespace around individual selectors is ignored, and
            unrecognised selectors are skipped silently.

    Returns:
        dict: The selected parts keyed by ``'word'``, ``'lemma'``, ``'tag'``,
        ``'v100'``, ``'v300'``, ``'v500'`` and ``'id'``, in that order.

    Raises:
        KeyError: If a selected part needs ``vocabulary.id2wlt[key]`` or
            ``vocabulary.vectors[key]`` and ``key`` is missing there (given
            dict-like containers).
    """
    # (selector, result key, getter) in output order.  The getters are lazy so
    # that only the containers backing the selected parts are touched, e.g.
    # asking for just "id" never reads id2wlt or vectors.
    extractors = (
        ('w', 'word', lambda: vocabulary.id2wlt[key]['word']),
        ('l', 'lemma', lambda: vocabulary.id2wlt[key]['lemma']),
        ('t', 'tag', lambda: vocabulary.id2wlt[key]['tag']),
        ('v100', 'v100', lambda: vocabulary.vectors[key][0]),
        ('v300', 'v300', lambda: vocabulary.vectors[key][1]),
        ('v500', 'v500', lambda: vocabulary.vectors[key][2]),
        ('id', 'id', lambda: key),
    )

    select_all = not parts
    # Strip every token separately so that "w; l" selects both parts.
    requested = {token.strip() for token in parts.split(';')}

    return {
        name: getter()
        for selector, name, getter in extractors
        if select_all or selector in requested
    }