Loading add_contains_answer_sentences.py +2 −2 Original line number Diff line number Diff line Loading @@ -23,9 +23,9 @@ def find_sentences_containing_answer(db): record = db.get_record(rid) containing_answer = persistent.list.PersistentList() for sent in get_content(record.answer_extraction, vocabulary): for sent in get_content(record.answer_extraction, vocabulary, old=False): ans_ext_lemma = ' '.join(replace_number_lemma(sent['sent'])) for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)): for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old=False)): doc_sent_content = " ".join(replace_number_lemma(sent_and_phrs["sent"])) if ans_ext_lemma in doc_sent_content: containing_answer.append(idx) Loading add_similar_senteces.py +4 −4 Original line number Diff line number Diff line Loading @@ -87,7 +87,7 @@ def find_similar_senteces(db, tf_idf): record = db.get_record(rid) # sys.stderr.write(f'{rid}\n') for answer_selection_sent in get_content(record.answer_selection, vocabulary): for answer_selection_sent in get_content(record.answer_selection, vocabulary, old=False): # Answer selection vector enhanced by TF-IDF as_vec = [] Loading @@ -102,7 +102,7 @@ def find_similar_senteces(db, tf_idf): v_as = np.mean(as_vec, axis=0) # Computing similar sentences within document for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)): for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old=False)): if idx != record.text_answer_position: vec_tf_idf = [] for x in sent_and_phrs['sent']: Loading Loading @@ -160,10 +160,10 @@ def main(): sorted_sim_sentences = sorted(similar_sentences, key=lambda x: x[1], reverse=True) record = db.get_record(rid) if args.verbose: print(' '.join(get_content(record.answer_selection, vocabulary, part='word')[0])) print(' '.join(get_content(record.answer_selection, vocabulary, old=False, part='word')[0])) for idx, score in sorted_sim_sentences[:10]: print('{}: {}'.format(score, ' '.join(get_content(kb.url2doc.get(record.text)['text'], vocabulary, part='word')[idx]['sent']))) vocabulary, old=False, part='word')[idx]['sent']))) if args.number == 0: record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences) Loading query_database.py +11 −9 Original line number Diff line number Diff line Loading @@ -193,7 +193,7 @@ def get_content(data, vocabulary, old, part='', context_type='', preloaded=False return result def get_record(db, record_id, old, word_parts='', context_type='', vocabulary=None, qa_type_dict=None, def get_record(db, record_id, old, word_parts='', context_type='all', vocabulary=None, qa_type_dict=None, kb=None, preloaded=False): """ :param db: ZODB object, link to database Loading Loading @@ -268,11 +268,16 @@ def print_record(db, record_id, old, context_type=''): print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}') print('similar_answers:') for key, value in record.similar_answers.items(): print(f'\t{key}: {value}') # for idx, sent_and_phrs in enumerate(get_content_ctx(record.text, kb, vocabulary, part='word')): # if idx in value: # print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}') for name, value in record.similar_answers.items(): if name == 'sents_similar': print(f'\t{name}:') for s_idx, score in value: for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part='w')): if idx == s_idx: print(f'\t\ts_{idx} ({score}): {" ".join([x["word"] for x in sent_and_phrs["sent"]])}') else: print(f'\t{name}: {value}') print(f'text_title:') for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, old, part="w"): Loading @@ -282,11 +287,8 @@ def print_record(db, record_id, old, context_type=''): for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part='w', context_type=context_type)): text_sents_total += 1 # print(sent_and_phrs['ctx'].keys()) print(f'\ts_{idx}: {" ".join([x["word"] for x in sent_and_phrs["sent"]])}') for key, phrs in sent_and_phrs['ctx'].items(): # print(phrs) try: print(f'\t\tctx_type: {key}') print_ctx(phrs) Loading sqad_db.py +5 −3 Original line number Diff line number Diff line Loading @@ -65,7 +65,8 @@ def id2word(vocabulary, key, parts='', preloaded=False): try: result['v_bert'] = vocabulary['vectors'][key]['v_bert'] except KeyError: sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n') pass # sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key else: Loading @@ -77,7 +78,7 @@ def id2word(vocabulary, key, parts='', preloaded=False): result['tag'] = vocabulary.id2wlt[key]['tag'] # Backwards compatibility if isinstance(vocabulary.vectors[key], dict): # New if isinstance(vocabulary.vectors[key], BTree): # New if 'v100' in word_parts or not parts: result['v100'] = vocabulary.vectors[key]['v100'] if 'v300' in word_parts or not parts: Loading @@ -96,7 +97,8 @@ def id2word(vocabulary, key, parts='', preloaded=False): try: result['v_bert'] = vocabulary.vectors[key]['v_bert'] except (KeyError, TypeError): sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n') pass # sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key return result Loading Loading
add_contains_answer_sentences.py +2 −2 Original line number Diff line number Diff line Loading @@ -23,9 +23,9 @@ def find_sentences_containing_answer(db): record = db.get_record(rid) containing_answer = persistent.list.PersistentList() for sent in get_content(record.answer_extraction, vocabulary): for sent in get_content(record.answer_extraction, vocabulary, old=False): ans_ext_lemma = ' '.join(replace_number_lemma(sent['sent'])) for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)): for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old=False)): doc_sent_content = " ".join(replace_number_lemma(sent_and_phrs["sent"])) if ans_ext_lemma in doc_sent_content: containing_answer.append(idx) Loading
add_similar_senteces.py +4 −4 Original line number Diff line number Diff line Loading @@ -87,7 +87,7 @@ def find_similar_senteces(db, tf_idf): record = db.get_record(rid) # sys.stderr.write(f'{rid}\n') for answer_selection_sent in get_content(record.answer_selection, vocabulary): for answer_selection_sent in get_content(record.answer_selection, vocabulary, old=False): # Answer selection vector enhanced by TF-IDF as_vec = [] Loading @@ -102,7 +102,7 @@ def find_similar_senteces(db, tf_idf): v_as = np.mean(as_vec, axis=0) # Computing similar sentences within document for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)): for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old=False)): if idx != record.text_answer_position: vec_tf_idf = [] for x in sent_and_phrs['sent']: Loading Loading @@ -160,10 +160,10 @@ def main(): sorted_sim_sentences = sorted(similar_sentences, key=lambda x: x[1], reverse=True) record = db.get_record(rid) if args.verbose: print(' '.join(get_content(record.answer_selection, vocabulary, part='word')[0])) print(' '.join(get_content(record.answer_selection, vocabulary, old=False, part='word')[0])) for idx, score in sorted_sim_sentences[:10]: print('{}: {}'.format(score, ' '.join(get_content(kb.url2doc.get(record.text)['text'], vocabulary, part='word')[idx]['sent']))) vocabulary, old=False, part='word')[idx]['sent']))) if args.number == 0: record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences) Loading
query_database.py +11 −9 Original line number Diff line number Diff line Loading @@ -193,7 +193,7 @@ def get_content(data, vocabulary, old, part='', context_type='', preloaded=False return result def get_record(db, record_id, old, word_parts='', context_type='', vocabulary=None, qa_type_dict=None, def get_record(db, record_id, old, word_parts='', context_type='all', vocabulary=None, qa_type_dict=None, kb=None, preloaded=False): """ :param db: ZODB object, link to database Loading Loading @@ -268,11 +268,16 @@ def print_record(db, record_id, old, context_type=''): print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}') print('similar_answers:') for key, value in record.similar_answers.items(): print(f'\t{key}: {value}') # for idx, sent_and_phrs in enumerate(get_content_ctx(record.text, kb, vocabulary, part='word')): # if idx in value: # print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}') for name, value in record.similar_answers.items(): if name == 'sents_similar': print(f'\t{name}:') for s_idx, score in value: for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part='w')): if idx == s_idx: print(f'\t\ts_{idx} ({score}): {" ".join([x["word"] for x in sent_and_phrs["sent"]])}') else: print(f'\t{name}: {value}') print(f'text_title:') for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, old, part="w"): Loading @@ -282,11 +287,8 @@ def print_record(db, record_id, old, context_type=''): for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part='w', context_type=context_type)): text_sents_total += 1 # print(sent_and_phrs['ctx'].keys()) print(f'\ts_{idx}: {" ".join([x["word"] for x in sent_and_phrs["sent"]])}') for key, phrs in sent_and_phrs['ctx'].items(): # print(phrs) try: print(f'\t\tctx_type: {key}') print_ctx(phrs) Loading
sqad_db.py +5 −3 Original line number Diff line number Diff line Loading @@ -65,7 +65,8 @@ def id2word(vocabulary, key, parts='', preloaded=False): try: result['v_bert'] = vocabulary['vectors'][key]['v_bert'] except KeyError: sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n') pass # sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key else: Loading @@ -77,7 +78,7 @@ def id2word(vocabulary, key, parts='', preloaded=False): result['tag'] = vocabulary.id2wlt[key]['tag'] # Backwards compatibility if isinstance(vocabulary.vectors[key], dict): # New if isinstance(vocabulary.vectors[key], BTree): # New if 'v100' in word_parts or not parts: result['v100'] = vocabulary.vectors[key]['v100'] if 'v300' in word_parts or not parts: Loading @@ -96,7 +97,8 @@ def id2word(vocabulary, key, parts='', preloaded=False): try: result['v_bert'] = vocabulary.vectors[key]['v_bert'] except (KeyError, TypeError): sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n') pass # sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key return result Loading