From 09f7d0db31c00998ff13643a37966673f45a9942 Mon Sep 17 00:00:00 2001
From: Marek Medved <marek.medved3@gmail.com>
Date: Thu, 29 Apr 2021 13:57:48 +0200
Subject: [PATCH] backwards compatibility

---
 add_contains_answer_sentences.py |  4 ++--
 add_similar_senteces.py          |  8 ++++----
 query_database.py                | 20 +++++++++++---------
 sqad_db.py                       |  8 +++++---
 4 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/add_contains_answer_sentences.py b/add_contains_answer_sentences.py
index fa4bc71..d3be7da 100755
--- a/add_contains_answer_sentences.py
+++ b/add_contains_answer_sentences.py
@@ -23,9 +23,9 @@ def find_sentences_containing_answer(db):
         record = db.get_record(rid)

         containing_answer = persistent.list.PersistentList()
-        for sent in get_content(record.answer_extraction, vocabulary):
+        for sent in get_content(record.answer_extraction, vocabulary, old=False):
             ans_ext_lemma = ' '.join(replace_number_lemma(sent['sent']))
-            for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)):
+            for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old=False)):
                 doc_sent_content = " ".join(replace_number_lemma(sent_and_phrs["sent"]))
                 if ans_ext_lemma in doc_sent_content:
                     containing_answer.append(idx)
diff --git a/add_similar_senteces.py b/add_similar_senteces.py
index ae113bf..20c09fc 100755
--- a/add_similar_senteces.py
+++ b/add_similar_senteces.py
@@ -87,7 +87,7 @@ def find_similar_senteces(db, tf_idf):
         record = db.get_record(rid)
         # sys.stderr.write(f'{rid}\n')

-        for answer_selection_sent in get_content(record.answer_selection, vocabulary):
+        for answer_selection_sent in get_content(record.answer_selection, vocabulary, old=False):

             # Answer selection vector enhanced by TF-IDF
             as_vec = []
@@ -102,7 +102,7 @@ def find_similar_senteces(db, tf_idf):
             v_as = np.mean(as_vec, axis=0)

             # Computing similar sentences within document
-            for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)):
+            for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old=False)):
                 if idx != record.text_answer_position:
                     vec_tf_idf = []
                     for x in sent_and_phrs['sent']:
@@ -160,10 +160,10 @@ def main():
         sorted_sim_sentences = sorted(similar_sentences, key=lambda x: x[1], reverse=True)
         record = db.get_record(rid)
         if args.verbose:
-            print(' '.join(get_content(record.answer_selection, vocabulary, part='word')[0]))
+            print(' '.join(get_content(record.answer_selection, vocabulary, old=False, part='word')[0]))
             for idx, score in sorted_sim_sentences[:10]:
                 print('{}: {}'.format(score, ' '.join(get_content(kb.url2doc.get(record.text)['text'],
-                                                                  vocabulary, part='word')[idx]['sent'])))
+                                                                  vocabulary, old=False, part='word')[idx]['sent'])))

         if args.number == 0:
             record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences)
diff --git a/query_database.py b/query_database.py
index 9e8edda..9c6f805 100755
--- a/query_database.py
+++ b/query_database.py
@@ -193,7 +193,7 @@ def get_content(data, vocabulary, old, part='', context_type='', preloaded=False
     return result


-def get_record(db, record_id, old, word_parts='', context_type='', vocabulary=None, qa_type_dict=None,
+def get_record(db, record_id, old, word_parts='', context_type='all', vocabulary=None, qa_type_dict=None,
               kb=None, preloaded=False):
     """
     :param db: ZODB object, link to database
@@ -268,11 +268,16 @@ def print_record(db, record_id, old, context_type=''):
             print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}')

     print('similar_answers:')
-    for key, value in record.similar_answers.items():
-        print(f'\t{key}: {value}')
-        # for idx, sent_and_phrs in enumerate(get_content_ctx(record.text, kb, vocabulary, part='word')):
-        #     if idx in value:
-        #         print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}')
+    for name, value in record.similar_answers.items():
+        if name == 'sents_similar':
+            print(f'\t{name}:')
+            for s_idx, score in value:
+                for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old,
+                                                                part='w')):
+                    if idx == s_idx:
+                        print(f'\t\ts_{idx} ({score}): {" ".join([x["word"] for x in sent_and_phrs["sent"]])}')
+        else:
+            print(f'\t{name}: {value}')

     print(f'text_title:')
     for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, old, part="w"):
@@ -282,11 +287,8 @@ def print_record(db, record_id, old, context_type=''):
     for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part='w',
                                                     context_type=context_type)):
         text_sents_total += 1
-        # print(sent_and_phrs['ctx'].keys())
         print(f'\ts_{idx}: {" ".join([x["word"] for x in sent_and_phrs["sent"]])}')
         for key, phrs in sent_and_phrs['ctx'].items():
-            # print(phrs)
-
             try:
                 print(f'\t\tctx_type: {key}')
                 print_ctx(phrs)
diff --git a/sqad_db.py b/sqad_db.py
index ad037fd..848c0ca 100755
--- a/sqad_db.py
+++ b/sqad_db.py
@@ -65,7 +65,8 @@ def id2word(vocabulary, key, parts='', preloaded=False):
             try:
                 result['v_bert'] = vocabulary['vectors'][key]['v_bert']
             except KeyError:
-                sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n')
+                pass
+                # sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n')
         if 'id' in word_parts or not parts:
             result['id'] = key
     else:
@@ -77,7 +78,7 @@ def id2word(vocabulary, key, parts='', preloaded=False):
             result['tag'] = vocabulary.id2wlt[key]['tag']

         # Backwards compatibility
-        if isinstance(vocabulary.vectors[key], dict):  # New
+        if isinstance(vocabulary.vectors[key], BTree):  # New
             if 'v100' in word_parts or not parts:
                 result['v100'] = vocabulary.vectors[key]['v100']
             if 'v300' in word_parts or not parts:
@@ -96,7 +97,8 @@ def id2word(vocabulary, key, parts='', preloaded=False):
             try:
                 result['v_bert'] = vocabulary.vectors[key]['v_bert']
             except (KeyError, TypeError):
-                sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n')
+                pass
+                # sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n')
         if 'id' in word_parts or not parts:
             result['id'] = key
     return result
-- 
GitLab