diff --git a/Makefile b/Makefile
index fc4605424627fdef5ceb396ef6744329c4f96b95..a27a069e591a92537f6ad49582b33fae195e68bd 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
 DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base
+DB_NAME_TEST=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_test
 VERSION=$(shell cat ./sqad_db/version)
 NEW_VERSION=$$(($(VERSION)+1))
 #UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
@@ -15,51 +16,49 @@ create:
 	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
 	echo "$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"
 
+create_test:
+	printf "SQAD to DB\n=======================\n" >> $(DB_NAME_TEST).log
+	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database_devel/test -n $(DB_NAME_TEST) -v 1 2>> $(DB_NAME_TEST).log)
+
 updates:
 	@echo "creating updates $(DB)"
-	# Word Bert embeddings
-	./make_copy.sh $(DB) $(DB)_Vbert
-	printf "add bert embeddings\n=======================\n" >> $(DB)_Vbert.log
-	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(DB)_Vbert 2>> $(DB)_Vbert.log)
 	# Contains answer sentece
-	./make_copy.sh $(DB)_Vbert $(DB)_Vbert_addAS
-	printf "Contains answer\n======================\n" >> $(DB)_Vbert_addAS.log
-	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(DB)_Vbert_addAS 2>> $(DB)_Vbert_addAS.log)
+	./make_copy.sh $(DB) $(DB)_addAS
+	printf "Contains answer\n======================\n" >> $(DB)_addAS.log
+	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(DB)_addAS 2>> $(DB)_addAS.log)
 	# Similar sentences
-	./make_copy.sh $(DB)_Vbert_addAS $(DB)_Vbert_addAS_simS
-	printf "Similar answers\n======================\n" >> $(DB)_Vbert_addAS_simS.log
-	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -n 0 -d $(DB)_Vbert_addAS_simS 2>> $(DB)_Vbert_addAS_simS.log)
+	./make_copy.sh $(DB)_addAS $(DB)_addAS_simS
+	printf "Similar answers\n======================\n" >> $(DB)_addAS_simS.log
+	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -n 0 -d $(DB)_addAS_simS 2>> $(DB)_addAS_simS.log)
 	# Context NP
-	./make_copy.sh $(DB)_Vbert_addAS_simS $(DB)_Vbert_addAS_simS_cNP
-	printf "Contex NP phrases context_window 3\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP.log
-	($(CONDA_ACTIVATE) base; ./context_np.py --context_window 3 --phr_per_sent "longest" -d $(DB)_Vbert_addAS_simS_cNP 2>> $(DB)_Vbert_addAS_simS_cNP.log)
-	printf "Contex NP phrases context_window 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP.log
-	($(CONDA_ACTIVATE) base; ./context_np.py --context_window 2 --phr_per_sent "longest" -d $(DB)_Vbert_addAS_simS_cNP 2>> $(DB)_Vbert_addAS_simS_cNP.log)
+	./make_copy.sh $(DB)_addAS_simS $(DB)_addAS_simS_cNP
+	printf "Contex NP phrases context_window 3\n======================\n" >> $(DB)_addAS_simS_cNP.log
+	($(CONDA_ACTIVATE) base; ./context_np.py --context_window 3 --phr_per_sent "longest" -d $(DB)_addAS_simS_cNP 2>> $(DB)_addAS_simS_cNP.log)
 	# Context Previous sentences
-	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP $(DB)_Vbert_addAS_simS_cNP_cPS
-	printf "Context previous sentece 1\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS.log
-	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 1 -d $(DB)_Vbert_addAS_simS_cNP_cPS 2>> $(DB)_Vbert_addAS_simS_cNP_cPS.log)
-	printf "Context previous sentece 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS.log
-	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 2 -d $(DB)_Vbert_addAS_simS_cNP_cPS 2>> $(DB)_Vbert_addAS_simS_cNP_cPS.log)
+	./make_copy.sh $(DB)_addAS_simS_cNP $(DB)_addAS_simS_cNP_cPS
+	printf "Context previous sentece 2\n======================\n" >> $(DB)_addAS_simS_cNP_cPS.log
+	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 2 -d $(DB)_addAS_simS_cNP_cPS 2>> $(DB)_addAS_simS_cNP_cPS.log)
 	# Context NER
-	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS $(DB)_Vbert_addAS_simS_cNP_cPS_cNER
-	printf "Context wiki entity context_window 5\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log
-	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log)
-	printf "Context wiki entity context_window 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log
-	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log)
+	./make_copy.sh $(DB)_addAS_simS_cNP_cPS $(DB)_addAS_simS_cNP_cPS_cNER
+	printf "Context wiki entity context_window 5\n======================\n" >> $(DB)_addAS_simS_cNP_cPS_cNER.log
+	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_addAS_simS_cNP_cPS_cNER 2>> $(DB)_addAS_simS_cNP_cPS_cNER.log)
 	# Sentece Bert
-	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS_cNER $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert
-	printf "Sentece to sentece bert embedding\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log
-	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log)
+	./make_copy.sh $(DB)_addAS_simS_cNP_cPS_cNER $(DB)_addAS_simS_cNP_cPS_cNER_sBert
+	printf "Sentece to sentece bert embedding\n======================\n" >> $(DB)_addAS_simS_cNP_cPS_cNER_sBert.log
+	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(DB)_addAS_simS_cNP_cPS_cNER_sBert 2>> $(DB)_addAS_simS_cNP_cPS_cNER_sBert.log)
 	# CLS Bert
-	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert
-	printf "Sentece to cls bert embedding\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log
-	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log)
+	./make_copy.sh $(DB)_addAS_simS_cNP_cPS_cNER_sBert $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert
+	printf "Sentece to cls bert embedding\n======================\n" >> $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log
+	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert 2>> $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log)
+	# Word Bert embeddings
+	./make_copy.sh $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert_Vbert
+	printf "add bert embeddings\n=======================\n" >> $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert_Vbert.log
+	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert_Vbert 2>> $(DB)_addAS_simS_cNP_cPS_cNER_sBert_clsBert_Vbert.log)
 	echo "$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz"
 
 run_ZODB_server:
 	exec "/usr/bin/python3.6" -m
"ZEO.runzeo" -C /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/zeo_server.conf - #cd "$(HOME)/.local/lib/python3.6/site-packages/"; exec "/usr/bin/python3.6" -m "ZEO.runzeo" -a "0.0.0.0:9001" -f "/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/sqad_db/stable" + #cd "$(HOME)/.local/lib/python3.6/site-packages/"; exec "/usr/bin/python3.6" -m "ZEO.runzeo" -a "0.0.0.0:9001" -f "/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database_devel/sqad_db/stable" demo_query: ./query_database.py -d sqad_db/stable -r 000180 --simple | head -n 38 diff --git a/add_bert_emberdings.py b/add_bert_emberdings.py index 43dce44d8a5d3eeff3f83474b0416df3a0824b92..05cfc2884064faee544afbeb4f47aacc2af3f1e5 100755 --- a/add_bert_emberdings.py +++ b/add_bert_emberdings.py @@ -1,42 +1,85 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz +# according https://github.com/huggingface/transformers/issues/1950 +import os import sys from sqad_db import SqadDb import persistent.list import transaction from transformers import BertTokenizer, BertConfig, BertModel +dir_path = os.path.dirname(os.path.realpath(__file__)) -# created according http://docs.deeppavlov.ai/en/master/features/models/bert.html -# -class Bert_Embeddings: + +class BertEmbeddings: def __init__(self): + """ + Init model paths and load it + """ config = BertConfig.from_json_file( - '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json') + f'{dir_path}/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json') self.model = BertModel.from_pretrained( - '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin', + f'{dir_path}/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin', config=config, local_files_only=True) self.tokenizer = BertTokenizer.from_pretrained( - '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/') + f'{dir_path}/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/') - def word2embedding(self, word): - input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True) + def word2embedding_cls(self, word): + """ + CLS token form bert output + :param word: str, input word + :return: vector + """ + input_ids = self.tokenizer.encode(word, return_tensors="pt", add_special_tokens=True) output = self.model(input_ids) return output[0][0][0].detach().numpy() + def word2embedding_average(self, word, verbose=False): + """ + Average of sub-word embeddings + :param word: str, input word + :param verbose: debugging mode + :return: vector + """ + input_ids = self.tokenizer.encode(word, return_tensors="pt", add_special_tokens=False) + try: + output = self.model(input_ids) + + if verbose: + input_ids_2 = self.tokenizer.encode(word, add_special_tokens=False) + for idx, i in enumerate(input_ids_2): + print(f'{self.tokenizer.decode(i)} -> {i}') + + average_tensor = [] + for i in output[0][0]: + average_tensor.append(i.detach().numpy()) + average_tensor = [sum(items)/len(average_tensor) for items in zip(*average_tensor)] + + except RuntimeError: + sys.stderr.write(f'Cant create embedding for word: {word}\n') + average_tensor = [0] * 768 + + return average_tensor + -def add_bert_word_embeddings_word(vocabulary, model, db): +def add_bert_word_embeddings_word(vocabulary, 
model, db, v_type, verbose): # vocab_size = len(vocabulary.id2wlt.keys()) progress = 0 for w_id, value in vocabulary.id2wlt.items(): progress += 1 word = value['word'] - bert_embedding = model.word2embedding(word) - # sys.stderr.write(f'{progress}/{vocab_size}\n') - vocabulary.vectors[w_id]['v_bert'] = persistent.list.PersistentList(bert_embedding) - db._p_changed = True - transaction.commit() + if not vocabulary.vectors[w_id].get('v_bert'): + + if v_type == 'average': + bert_embedding = model.word2embedding_average(word, verbose=verbose) + elif v_type == 'cls': + bert_embedding = model.word2embedding_cls(word) + + vocabulary.vectors[w_id]['v_bert'] = persistent.list.PersistentList(bert_embedding) + db._p_changed = True + transaction.commit() def main(): @@ -54,6 +97,9 @@ def main(): parser.add_argument('-v', '--verbose', action='store_true', required=False, default=False, help='Verbose mode') + parser.add_argument('-t', '--vector_type', type=str, + required=False, default='average', + help='Word vector setting. Available: cls, average. If word is created from sub-words') args = parser.parse_args() if (args.url and args.port) or args.db_path: @@ -65,10 +111,10 @@ def main(): sys.stderr.write('Please specify --db_path or (--port and --url)') sys.exit() - model = Bert_Embeddings() + model = BertEmbeddings() vocabulary, _, kb = db.get_dicts() try: - add_bert_word_embeddings_word(vocabulary, model, db) + add_bert_word_embeddings_word(vocabulary, model, db, args.vector_type, args.verbose) db.update() db._p_changed = True transaction.commit() @@ -77,5 +123,6 @@ def main(): db.close() sys.exit() + if __name__ == "__main__": main() diff --git a/add_contains_answer_sentences.py b/add_contains_answer_sentences.py index f64af3964285ec8ba4f7acdf745790f57c35b34d..fa4bc71d5137b9b5a249bf732ca82e29e763564c 100755 --- a/add_contains_answer_sentences.py +++ b/add_contains_answer_sentences.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz from sqad_db import SqadDb import persistent.list from query_database import get_content -from query_database import get_content_ctx import transaction import sys -from BTrees.OOBTree import BTree def replace_number_lemma(sent): return [x['lemma'] if not x['lemma'] == '[number]' else x['word'] for x in sent] + def find_sentences_containing_answer(db): """ Searching for sentences containing the exact answer @@ -19,13 +19,13 @@ def find_sentences_containing_answer(db): :return: list of indexes with sentences within the text containing exact answer """ vocabulary, _, kb = db.get_dicts() - for rid in db.get_all_records(): + for rid in db.get_all_records_id(): record = db.get_record(rid) containing_answer = persistent.list.PersistentList() for sent in get_content(record.answer_extraction, vocabulary): - ans_ext_lemma = ' '.join(replace_number_lemma(sent)) - for idx, sent_and_phrs in enumerate(get_content_ctx(record.text, kb, vocabulary)): + ans_ext_lemma = ' '.join(replace_number_lemma(sent['sent'])) + for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)): doc_sent_content = " ".join(replace_number_lemma(sent_and_phrs["sent"])) if ans_ext_lemma in doc_sent_content: containing_answer.append(idx) @@ -50,7 +50,7 @@ def main(): help='Verbose mode') args = parser.parse_args() - if (args.url and args.port) or args.db_path: + if (args.url and args.port) or args.db_path: if args.url and args.port: db = SqadDb(url=args.url, port=args.port) elif args.db_path: @@ -74,5 +74,6 @@ def 
main(): db.close() sys.exit() + if __name__ == "__main__": main() diff --git a/add_similar_senteces.py b/add_similar_senteces.py index 712efba92ed550fc323d26d836bd25b91c1b6d3b..ae113bffcedf41a888e2c4e43cfe77a659ca360d 100755 --- a/add_similar_senteces.py +++ b/add_similar_senteces.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz from sqad_db import SqadDb from sqad_db import id2word import persistent.list from query_database import get_content -from query_database import get_content_ctx import transaction import sys import numpy as np @@ -66,7 +66,8 @@ def compute_tfidf(kb, vocabulary, verbose=False): print(f'{url}') for w, c in value.items(): word = id2word(vocabulary, w) - print(f'{word["word"]}\t{word["lemma"]}\t{word["tag"]}: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}') + print(f'{word["word"]}\t{word["lemma"]}\t' + f'{word["tag"]}: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}') sys.exit() return tf_idf @@ -76,11 +77,12 @@ def find_similar_senteces(db, tf_idf): """ Searching for sentences containing the exact answer :param db: ZODB database + :param tf_idf: :return: list of indexes with sentences within the text containing exact answer """ vocabulary, _, kb = db.get_dicts() - for rid in db.get_all_records(): + for rid in db.get_all_records_id(): similar_senteces = [] record = db.get_record(rid) # sys.stderr.write(f'{rid}\n') @@ -89,7 +91,7 @@ def find_similar_senteces(db, tf_idf): # Answer selection vector enhanced by TF-IDF as_vec = [] - for x in answer_selection_sent: + for x in answer_selection_sent['sent']: try: asw_tf_idf = tf_idf[record.text][x['lemma']] tf_idf_vec = [y * asw_tf_idf for y in x['v300']] @@ -100,7 +102,7 @@ def find_similar_senteces(db, tf_idf): v_as = np.mean(as_vec, axis=0) # Computing similar sentences within document - for idx, sent_and_phrs in enumerate(get_content_ctx(record.text, kb, vocabulary)): + for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary)): if idx != record.text_answer_position: vec_tf_idf = [] for x in sent_and_phrs['sent']: @@ -113,7 +115,7 @@ def find_similar_senteces(db, tf_idf): cos_sim = 1 - spatial.distance.cosine(v_as, v_sent) # Filter exact answers - if not idx in record.similar_answers['sents_containing_ans_ext']: + if idx not in record.similar_answers['sents_containing_ans_ext']: similar_senteces.append((idx, cos_sim)) yield rid, similar_senteces @@ -155,12 +157,13 @@ def main(): tf_idf = compute_tfidf(kb, vocabulary) for rid, similar_sentences in find_similar_senteces(db, tf_idf): - sorted_sim_sentences = sorted(similar_sentences, key = lambda x: x[1], reverse=True) + sorted_sim_sentences = sorted(similar_sentences, key=lambda x: x[1], reverse=True) record = db.get_record(rid) if args.verbose: print(' '.join(get_content(record.answer_selection, vocabulary, part='word')[0])) for idx, score in sorted_sim_sentences[:10]: - print('{}: {}'.format(score, ' '.join(get_content_ctx(record.text, kb, vocabulary, part='word')[idx]['sent']))) + print('{}: {}'.format(score, ' '.join(get_content(kb.url2doc.get(record.text)['text'], + vocabulary, part='word')[idx]['sent']))) if args.number == 0: record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences) @@ -177,5 +180,6 @@ def main(): db.close() sys.exit() + if __name__ == "__main__": main() diff --git a/context_ner.py b/context_ner.py index d8d2cc97dc8d4ccc6e6189dbb1de637d41543dd1..1351c63808e3c04e13d9ec1d00e7e511f1259182 100755 --- a/context_ner.py 
+++ b/context_ner.py @@ -1,15 +1,23 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz import re from sqad_db import SqadDb import persistent.list from sqad_db import id2word import transaction +from BTrees.OOBTree import BTree import sys struct_re = re.compile('^<[^>]+>$') def normalize_out(ner_out, sent): + """ + Match result from NER recognizer with original sentece + :param ner_out: dict, output form NER + :param sent: list, original sentence + :return: list, corrected output + """ result = [] for idx, token in enumerate(sent): if token['word'] != ner_out[idx]['word']: @@ -30,8 +38,14 @@ def normalize_out(ner_out, sent): return result - def ner_phrases(text, context_window, model): + """ + Compute all entities in sentece + :param text: list, text content + :param context_window: int, context window + :param model: NER model + :return: list + """ text_context = persistent.list.PersistentList() ner_per_sentence = [] for sent in text: @@ -75,12 +89,13 @@ def ner_phrases(text, context_window, model): return text_context -def get_ner(text, context_window, model): - nps = ner_phrases(text, context_window, model) - return nps - - def get_content(text_by_url, vocabulary): + """ + Sentence content + :param text_by_url: dict, text + :param vocabulary: dict + :return: list + """ text_content = [] for sentence in text_by_url: sent = [] @@ -93,15 +108,23 @@ def get_content(text_by_url, vocabulary): def add_ner(db, context_window, model, verbose=False): + """ + Harvest entities and put them as context + :param db: ZODB object + :param context_window: int + :param model: kNER model + :param verbose: bool + :return: None + """ vocabulary, qa_type_dict, kb = db.get_dicts() for url, text in kb.url2doc.items(): - # if verbose: - print(f'Processing: {url}') + if verbose: + print(f'Processing: {url}') # text_title_vert = get_content(text['title'], vocabulary) text_vert = get_content(text['text'], vocabulary) - phrases = get_ner(text_vert, context_window, model) + phrases = ner_phrases(text_vert, context_window, model) for sent_num, sent in enumerate(text['text']): if verbose: @@ -110,33 +133,16 @@ def add_ner(db, context_window, model, verbose=False): for phr in phrs: print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}') if not sent['ctx'].get(f'ctx_ner_w{context_window}'): - sent['ctx'][f'ctx_ner_w{context_window}'] = phrases[sent_num] + sent['ctx'][f'ctx_ner_w{context_window}'] = persistent.list.PersistentList() + for ner_from_prev_sentce in phrases[sent_num]: + content_ner_from_prev_sentce = persistent.list.PersistentList() + for p in ner_from_prev_sentce: + content_ner_from_prev_sentce.append(BTree({'sent': p})) + sent['ctx'][f'ctx_ner_w{context_window}'].append(content_ner_from_prev_sentce) db._p_changed = True transaction.commit() -def get_ctx(phrs, vocabulary, part='', preloaded=False): - sentence_phrases = [] - for sent_phr in phrs: - phr_per_sent = [] - for p in sent_phr: - p_content = [] - for w_id_cx in p: - if part: - p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded)) - else: - p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded)) - phr_per_sent.append(p_content) - sentence_phrases.append(phr_per_sent) - return sentence_phrases - - -def print_ctx(phrs): - for idx, sent_phr in enumerate(phrs): - for p in sent_phr: - print(f'\t\t\tc(-{idx+1}): {" ".join([x["word"] for x in p])}') - - def main(): import argparse parser = argparse.ArgumentParser(description='Add named entity as a context to 
sentence - Local option') diff --git a/context_np.py b/context_np.py index ce6e0dec6107cb7fa8417345f0575378b7a9afa2..2dfdb6585b5e4b11bd979ac721d087458ddc0e7e 100755 --- a/context_np.py +++ b/context_np.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz import os import re import sys @@ -10,6 +11,7 @@ import persistent.list from sqad_db import id2word from sqad_db import word2id import transaction +from BTrees.OOBTree import BTree struct_re = re.compile('^<[^>]+>$') # ================================================== @@ -26,16 +28,25 @@ locale.setlocale(locale.LC_ALL, '') class SetInterface: def __init__(self, grammar_path): + """ + Init set parser + :param grammar_path: path to grammar file + """ self.grammar_path = grammar_path self.grammar = os.path.join(self.grammar_path) def parse_input(self, lines): + """ + Pars input sentece and get all NP phrases + :param lines: str + :return: list, list of phrases + """ g = Grammar(self.grammar) p = Parser(g) s = Segment(lines) p.parse(s) return s.get_marx_phrases_vert(filter_phr=True) -# ================================================== + def filter_longest(phrases): """ @@ -57,8 +68,18 @@ def filter_longest(phrases): return result -def name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v): - set_parser = SetInterface("/nlp/projekty/set/set/grammar.set") +def name_phrases(text, title, vocabulary, context_window, phr_type, w2v): + """ + Harvest all NP phrases, reduce number according context window and type of phrases per sentence + :param text: str + :param title: str + :param vocabulary: dict + :param context_window: int + :param phr_type: str + :param w2v: word2vec model + :return: + """ + set_parser = SetInterface(f"{dir_path}/set/grammar.set") text_context = persistent.list.PersistentList() # Read file and create phrases for all sentences phrases_per_sentence = [] @@ -81,7 +102,7 @@ def name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v): wid = word2id(vocabulary, word, lemma, tag, w2v) phr.append(wid) phrases.append(phr) - if phr_per_sent == 'longest': + if phr_type == 'longest': phrases_per_sentence.append(filter_longest(phrases)) else: phrases_per_sentence.append(phrases) @@ -109,12 +130,13 @@ def name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v): return text_context -def get_context(text, title, vocabulary, context_window, phr_per_sent, w2v): - nps = name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v) - return nps - - def get_text_vert(text_by_url, vocabulary): + """ + Text to vert + :param text_by_url: str + :param vocabulary: dict + :return: str + """ vert = [] for sentence in text_by_url: vert.append('<s>') @@ -126,63 +148,45 @@ def get_text_vert(text_by_url, vocabulary): return vert -def get_title(text_by_url, vocabulary): - vert = [] - for sentence in text_by_url: - vert.append('<s>') - for token_id in sentence: - t = id2word(vocabulary, token_id) - vert.append(f'{t["word"]}\t{t["lemma"]}\t{t["tag"]}') - vert.append('</s>') - - return vert - - -def add_np_phrases(db, context_window, phr_per_sent, w2v, verbose=False): +def add_np_phrases(db, context_window, phr_type, w2v, verbose=False): + """ + Assign phrases as context to sentences + :param db: ZODB obj + :param context_window: int + :param phr_type: str + :param w2v: word2vec model + :param verbose: bool + :return: None + """ vocabulary, qa_type_dict, kb = db.get_dicts() for url, text in kb.url2doc.items(): if verbose: print(f'Processing: 
{url}') - text_title_vert = get_title(text['title'], vocabulary) + text_title_vert = get_text_vert(text['title'], vocabulary) text_vert = get_text_vert(text['text'], vocabulary) - phrases = get_context(text_vert, text_title_vert, vocabulary, - context_window, phr_per_sent, w2v) + phrases = name_phrases(text_vert, text_title_vert, vocabulary, + context_window, phr_type, w2v) + # print(phrases) for sent_num, sent in enumerate(text['text']): if verbose: print(f"s:{' '.join([id2word(vocabulary, x)['word'] for x in sent['sent']])}") for phrs in phrases[sent_num]: for phr in phrs: print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}') - if not sent['ctx'].get(f'name_phrs_w{context_window}_t{phr_per_sent}'): - sent['ctx'][f'name_phrs_w{context_window}_t{phr_per_sent}'] = phrases[sent_num] + if not sent['ctx'].get(f'name_phrs_w{context_window}_t{phr_type}'): + sent['ctx'][f'name_phrs_w{context_window}_t{phr_type}'] = persistent.list.PersistentList() + for phr_from_prev_sentce in phrases[sent_num]: + content_phr_from_prev_sentce = persistent.list.PersistentList() + for p in phr_from_prev_sentce: + content_phr_from_prev_sentce.append(BTree({'sent': p})) + sent['ctx'][f'name_phrs_w{context_window}_t{phr_type}'].append(content_phr_from_prev_sentce) + # print(dict(sent['ctx'][f'name_phrs_w{context_window}_t{phr_type}'][0][0])) db._p_changed = True transaction.commit() -def get_ctx(phrs, vocabulary, part='', preloaded=False): - sentence_phrases = [] - for sent_phr in phrs: - phr_per_sent = [] - for p in sent_phr: - p_content = [] - for w_id_cx in p: - if part: - p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded)) - else: - p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded)) - phr_per_sent.append(p_content) - sentence_phrases.append(phr_per_sent) - return sentence_phrases - - -def print_ctx(phrs): - for idx, sent_phr in enumerate(phrs): - for p in sent_phr: - print(f'\t\t\tc(-{idx+1}): {" ".join([x["word"] for x in p])}') - - def main(): import argparse parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences') diff --git a/context_previous_senteces.py b/context_previous_senteces.py index a676232128f3b2142a6c9111034b9a3e2edc3e19..a8f3319aa30127ecad8a013bf1e8ae466a1289b5 100755 --- a/context_previous_senteces.py +++ b/context_previous_senteces.py @@ -1,49 +1,46 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz import sys from sqad_db import SqadDb from sqad_db import id2word import persistent.list import transaction +from BTrees.OOBTree import BTree def add_ctx(db, number, verbose=False): + """ + Put previous sentece as context + :param db: ZODB object + :param number: int + :param verbose: bool + :return: None + """ vocabulary, _, kb = db.get_dicts() for url, text in kb.url2doc.items(): - print(f'Processing: {url}') + # print(f'Processing: {url}') for sent_num, sent in enumerate(text['text']): if verbose: print(f"s:{' '.join([id2word(vocabulary, x)['word'] for x in sent['sent']])}") if not sent['ctx'].get(f'prev_sent_w{number}'): - if sent_num == 0: - if verbose: - print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['title'][0]])}") - sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList([text['title'][0]]) - else: - if verbose: - print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['text'][sent_num - 1]['sent']])}") - sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList([text['text'][sent_num - 1]['sent']]) - 
db._p_changed = True - transaction.commit() - -def get_ctx(phrs, vocabulary, part='', preloaded=False): - content = [] - for p in phrs: - p_content = [] - for w_id_cx in p: - if part: - p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded)) - else: - p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded)) - content.append(p_content) - - return content - - -def print_ctx(phrs): - for idx, p in enumerate(phrs): - print(f'\t\t\tc(-{idx+1}): {" ".join([x["word"] for x in p])}') + sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList() + for i in range(1, number + 1): + if sent_num == 0: + if verbose: + print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['title']['sent']])}") + prev_sentece = persistent.list.PersistentList() + prev_sentece.append(BTree({'sent': persistent.list.PersistentList(text['title'][0]['sent'])})) + sent['ctx'][f'prev_sent_w{number}'].append(prev_sentece) + else: + if verbose: + print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['text'][sent_num - i]['sent']])}") + prev_sentece = persistent.list.PersistentList() + prev_sentece.append(BTree({'sent': persistent.list.PersistentList(text['text'][sent_num - i]['sent'])})) + sent['ctx'][f'prev_sent_w{number}'].append(prev_sentece) + db._p_changed = True + transaction.commit() def main(): diff --git a/get_vector.py b/get_vector.py index 27795f91796190b1a042c1ccd73b3a1bb23dacdf..8d48d17ff08b98bcdb6e105a82f702b576ae4873 100644 --- a/get_vector.py +++ b/get_vector.py @@ -1,19 +1,28 @@ - #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz import fasttext import os import sys dir_path = os.path.dirname(os.path.realpath(__file__)) -class Word2vec(): +class Word2vec: def __init__(self): + """ + Load pretrained models + """ self.model_100 = fasttext.load_model(f'{dir_path}/../../fasttext/models/cstenten_17_100_5_5_0.05_skip') self.model_300 = fasttext.load_model(f'{dir_path}/../../fasttext/models/cstenten_17_300_5_5_0.05_skip') self.model_500 = fasttext.load_model(f'{dir_path}/../../fasttext/models/cstenten_17_500_5_5_0.05_skip') def get_vector(self, word, dim): + """ + Return vector according dimension + :param word: str + :param dim: int + :return: list + """ if dim == 100: return self.model_100[word] elif dim == 300: diff --git a/make_copy.sh b/make_copy.sh new file mode 100755 index 0000000000000000000000000000000000000000..256a2787f07357286fd8a1bd6343b10193a16734 --- /dev/null +++ b/make_copy.sh @@ -0,0 +1,3 @@ +#! 
/usr/bin/env bash +rsync -avu $1 $2 +rsync -avu $1.index $2.index diff --git a/query_database.py b/query_database.py index dfed3d94ab783bc649833964d28930210038ad5a..9e8edda07b9fb003b046fdba7ecbb4f3fd707c6b 100755 --- a/query_database.py +++ b/query_database.py @@ -1,16 +1,117 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz from sqad_db import SqadDb from sqad_db import id2word from sqad_db import id2qt from pprint import pprint import sys -import context_np -import context_previous_senteces -import context_ner +import persistent.list +# import context_np +# import context_previous_senteces +# import context_ner +import BTrees.OOBTree -def get_ctx(data, vocabulary, part='', context_type='', preloaded=False): +def print_ctx(phrs): + """ + Print content of phrases + :param phrs: dict + :return: None + """ + for idx, sent_phr in enumerate(phrs): + for p in sent_phr: + print(f'\t\t\tc(-{idx+1}): {" ".join([x["word"] for x in p["sent"]])}') + + +def get_ctx_phrs(phrs, vocabulary, ctx_type, part='', preloaded=False): + """ + Transform each phrase into actual sentence wit all addons + :param phrs: dict + :param vocabulary: dict + :param part: str, specify word parts + :param preloaded: preloaded data in memeory + :return: list, lsit of phrases + """ + sentence_phrases = [] + for prev_sent in phrs: + phr_per_sent = [] + for p in prev_sent: + result_phr = {} + p_content = [] + for w_id_cx in p['sent']: + if part: + p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded)) + else: + p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded)) + result_phr['sent'] = p_content + if p.get('cls_bert'): + result_phr['cls_bert'] = p['cls_bert'] + else: + result_phr['cls_bert'] = None + if p.get('sbert'): + result_phr['sbert'] = p['sbert'] + else: + result_phr['sbert'] = None + phr_per_sent.append(result_phr) + sentence_phrases.append(phr_per_sent) + return sentence_phrases + + +def get_ctx_phrs_old(phrs, vocabulary, ctx_type, part='', preloaded=False): + """ + Transform each phrase into actual sentence wit all addons + :param phrs: dict + :param vocabulary: dict + :param part: str, specify word parts + :param preloaded: preloaded data in memeory + :return: list, lsit of phrases + """ + sentence_phrases = [] + if ctx_type.startswith('prev_sent'): + for prev_sent in phrs: + phr_per_sent = {} + p_content = [] + for w_id_cx in prev_sent: + if part: + p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded)) + else: + p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded)) + phr_per_sent['sent'] = p_content + phr_per_sent['cls_bert'] = None + phr_per_sent['sbert'] = None + + sentence_phrases.append([phr_per_sent]) + else: + for prev_sent in phrs: + phr_per_sent = [] + for p in prev_sent: + result_phr = {} + p_content = [] + phr_content = p + for w_id_cx in phr_content: + if part: + p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded)) + else: + p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded)) + result_phr['sent'] = p_content + result_phr['cls_bert'] = None + result_phr['sbert'] = None + phr_per_sent.append(result_phr) + sentence_phrases.append(phr_per_sent) + return sentence_phrases + + +def get_ctx(data, vocabulary, old, part='', context_type='', preloaded=False): + """ + Get sentence context ofr KB[text] + :param data: dict + :param vocabulary: dict + :param part: str, specify word parts + :param context_type: str, specify context type + :param preloaded: 
preloaded data in memeory + :return: dict + """ sentence_phrases = {} if context_type: required_ctx = context_type.strip().split(';') @@ -19,17 +120,32 @@ def get_ctx(data, vocabulary, part='', context_type='', preloaded=False): for ctx_type, phrs in data.items(): if ctx_type in required_ctx or 'all' in required_ctx: if ctx_type.startswith('name_phrs'): - sentence_phrases[ctx_type] = context_np.get_ctx(phrs, vocabulary, part=part, preloaded=preloaded) + if old: + sentence_phrases[ctx_type] = get_ctx_phrs_old(phrs, vocabulary, ctx_type, part=part, preloaded=preloaded) + else: + sentence_phrases[ctx_type] = get_ctx_phrs(phrs, vocabulary, ctx_type, part=part, preloaded=preloaded) elif ctx_type.startswith('ctx_ner'): - sentence_phrases[ctx_type] = context_ner.get_ctx(phrs, vocabulary, part=part, preloaded=preloaded) - else: - sentence_phrases[ctx_type] = context_previous_senteces.get_ctx(phrs, vocabulary, part=part, - preloaded=preloaded) - + if old: + sentence_phrases[ctx_type] = get_ctx_phrs_old(phrs, vocabulary, ctx_type, part=part, preloaded=preloaded) + else: + sentence_phrases[ctx_type] = get_ctx_phrs(phrs, vocabulary, ctx_type, part=part, preloaded=preloaded) + elif ctx_type.startswith('prev_sent'): + if old: + sentence_phrases[ctx_type] = get_ctx_phrs_old(phrs, vocabulary, ctx_type, part=part, preloaded=preloaded) + else: + sentence_phrases[ctx_type] = get_ctx_phrs(phrs, vocabulary, ctx_type, part=part, preloaded=preloaded) return sentence_phrases def get_senence(data, vocabulary, part='', preloaded=False): + """ + Transform word ids to actual words + :param data: list of word ids + :param vocabulary: dict + :param part: str, specify word parts + :param preloaded: preloaded data in memeory + :return: + """ sent = [] for w_id in data: if part: @@ -40,65 +156,95 @@ def get_senence(data, vocabulary, part='', preloaded=False): return sent -def get_content(data, vocabulary, part='', preloaded=False): - result = [] - for sentence in data: - result.append(get_senence(sentence, vocabulary, part=part, preloaded=preloaded)) - return result - +def get_content(data, vocabulary, old, part='', context_type='', preloaded=False): + """ + Harvest content from data + :param data: list or dict + :param vocabulary: dict + :param part: str, specify word parts + :param context_type: str, specify context type + :param preloaded: preloaded data in memeory + :return: list, content of data + """ -def get_content_ctx(url, kb, vocabulary, part='', context_type='', preloaded=False): result = [] - for sentence in kb.url2doc.get(url)['text']: - result.append({'sent': get_senence(sentence['sent'], vocabulary, part=part, preloaded=preloaded), - 'ctx': get_ctx(sentence['ctx'], vocabulary, part=part, context_type=context_type, - preloaded=preloaded), - 'sbert': sentence['sbert'], - 'cls_bert': sentence['cls_bert']}) + for sentence in data: + s_data = {} + try: + s_data['sent'] = get_senence(sentence['sent'], vocabulary, part=part, preloaded=preloaded) + except (KeyError, TypeError): + s_data['sent'] = get_senence(sentence, vocabulary, part=part, preloaded=preloaded) + + if context_type: + s_data['ctx'] = get_ctx(sentence['ctx'], vocabulary, old, part=part, context_type=context_type, + preloaded=preloaded) + + try: + s_data['sbert'] = sentence['sbert'] + except (KeyError, TypeError): + s_data['sbert'] = None + + try: + s_data['cls_bert'] = sentence['cls_bert'] + except (KeyError, TypeError): + s_data['cls_bert'] = None + + result.append(s_data) return result -def get_record(db, record_id, word_parts='', context_type='', 
vocabulary=None, qa_type_dict=None, kb=None, preloaded=False): +def get_record(db, record_id, old, word_parts='', context_type='', vocabulary=None, qa_type_dict=None, + kb=None, preloaded=False): + """ + :param db: ZODB object, link to database + :param record_id: str, record id + :param word_parts: str, specify word parts + :param context_type: str, specify context type + :param vocabulary: dict + :param qa_type_dict: dict + :param kb: dict, Knowledge base + :param preloaded: preloaded data in memeory + :return: + """ record = db.get_record(record_id) if not vocabulary and not qa_type_dict and not kb: print('Not preloaded data') vocabulary, qa_type_dict, kb = db.get_dicts() - """ - result data structure - {rec_id: str, - q_type: str, - a_type: str, - question: list of sentences where sentence['sentence'] is list of tokens (word,lemma,tag,vector) and - sentence['phrs'] is list of all phrases belonging to this sentence and each phrase is list - of tokens (word,lemma,tag,vector) - a_sel: same as question structure - a_ext: same as question structure - text: same as question structure - } - """ data = {} data['rec_id'] = record.rec_id data['q_type'] = id2qt(qa_type_dict, record.q_type) data['a_type'] = id2qt(qa_type_dict, record.a_type) - data['question'] = get_content(record.question, vocabulary, part=word_parts, preloaded=preloaded) - data['a_sel'] = get_content(record.answer_selection, vocabulary, part=word_parts, preloaded=preloaded) + data['question'] = get_content(record.question, vocabulary, old, part=word_parts, preloaded=preloaded) + data['a_sel'] = get_content(record.answer_selection, vocabulary, old, part=word_parts, preloaded=preloaded) data['a_sel_pos'] = record.text_answer_position - data['a_ext'] = get_content(record.answer_extraction, vocabulary, part=word_parts, preloaded=preloaded) - data['similar_answers'] = record.similar_answers - data['text_title'] = get_content(kb.url2doc.get(record.text)["title"], vocabulary, part=word_parts, + data['a_ext'] = get_content(record.answer_extraction, vocabulary, old, part=word_parts, preloaded=preloaded) + data['similar_answers'] = dict(record.similar_answers) + data['text_title'] = get_content(kb.url2doc.get(record.text)["title"], vocabulary, old, part=word_parts, preloaded=preloaded) - data['text'] = get_content_ctx(record.text, kb, vocabulary, part=word_parts, context_type=context_type, - preloaded=preloaded) + data['text'] = get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part=word_parts, + context_type=context_type, preloaded=preloaded) try: data['contain_answer'] = len(record.similar_answers["sents_containing_ans_ext"]) data['not_contain_answer'] = len(data['text'])-len(record.similar_answers["sents_containing_ans_ext"]) except KeyError: sys.stderr.write('No sents_containing_ans_ext\n') + """ + result data structure + {rec_id: str, + q_type: str, + a_type: str, + question: list of sentences where sentence is list of dictionaries, each dictionary contain 'sent' part where + word ids are stored. Can contain other parts like 'ctx' , 'cls_bert', 's_bert' .... 
+ a_sel: same as question structure + a_ext: same as question structure + text: same as question structure + } + """ return data -def print_record(db, record_id, context_type=''): +def print_record(db, record_id, old, context_type=''): text_sents_total = 0 record = db.get_record(record_id) vocabulary, qa_type_dict, kb = db.get_dicts() @@ -108,18 +254,18 @@ def print_record(db, record_id, context_type=''): print(f'a_type: {id2qt(qa_type_dict, record.a_type)}') print('question:') - for i in get_content(record.question, vocabulary, part='w'): - print(f'\ts: {" ".join([x["word"] for x in i])}') + for i in get_content(record.question, vocabulary, old, part='w'): + print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}') print('a_sel:') - for i in get_content(record.answer_selection, vocabulary, part='w'): - print(f'\ts: {" ".join([x["word"] for x in i])}') + for i in get_content(record.answer_selection, vocabulary, old, part='w'): + print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}') print(f'a_sel_pos: {record.text_answer_position}') print('a_ext:') - for i in get_content(record.answer_extraction, vocabulary, part='w'): - print(f'\ts: {" ".join([x["word"] for x in i])}') + for i in get_content(record.answer_extraction, vocabulary, old, part='w'): + print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}') print('similar_answers:') for key, value in record.similar_answers.items(): @@ -129,22 +275,23 @@ def print_record(db, record_id, context_type=''): # print(f'\t\ts: {" ".join(sent_and_phrs["sent"])}') print(f'text_title:') - for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, part="w"): - print(f'\ts: {" ".join([x["word"] for x in i])}') + for i in get_content(kb.url2doc.get(record.text)["title"], vocabulary, old, part="w"): + print(f'\ts: {" ".join([x["word"] for x in i["sent"]])}') print('text:') - for idx, sent_and_phrs in enumerate(get_content_ctx(record.text, kb, vocabulary, - part='w', context_type=context_type)): + for idx, sent_and_phrs in enumerate(get_content(kb.url2doc.get(record.text)['text'], vocabulary, old, part='w', + context_type=context_type)): text_sents_total += 1 + # print(sent_and_phrs['ctx'].keys()) print(f'\ts_{idx}: {" ".join([x["word"] for x in sent_and_phrs["sent"]])}') for key, phrs in sent_and_phrs['ctx'].items(): - print(f'\t\tctx_type: {key}') - if key.startswith('name_phrs'): - context_np.print_ctx(phrs) - elif key.startswith('ctx_ner'): - context_ner.print_ctx(phrs) - else: - context_previous_senteces.print_ctx(phrs) + # print(phrs) + + try: + print(f'\t\tctx_type: {key}') + print_ctx(phrs) + except KeyError: + pass print('No. text sentences that contain answer') try: @@ -182,6 +329,9 @@ def main(): parser.add_argument('--list_ctx_types', action='store_true', required=False, default=False, help='List context types') + parser.add_argument('--old', action='store_true', + required=False, default=False, + help='Old database format') # ================================================================ # Optional parameters # ================================================================ @@ -193,8 +343,8 @@ def main(): help='Which word parts will be provided. Semicolon separated. For example "w;l;t;v100" ' 'will return word, lemma, tag and 100 dim. vector') parser.add_argument('--context_type', type=str, - required=False, default='', - help='List of context types separated by semicolon. Example "name_phrs_w5_n5;prev_sent_n1"') + required=False, default='all', + help='List of context types separated by semicolon. 
Example "name_phrs;prev_sent;ctx_ner"') args = parser.parse_args() if args.database_file == '' and args.url == '': @@ -222,9 +372,9 @@ def main(): db = SqadDb(read_only=True, url=args.url, port=args.port) if args.simple: - print_record(db, args.record_id, args.context_type) + print_record(db, args.record_id, args.old, args.context_type) else: - pprint(get_record(db, args.record_id, args.word_parts, args.context_type)) + pprint(get_record(db, args.record_id, args.old, args.word_parts, args.context_type)) db.close() else: sys.stderr.write('Please specify one of attributes: record_id, list_ctx_types') diff --git a/sentece2cls_bert.py b/sentece2cls_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..919dd4e5af1a784645d1ab9f5cee2e19764b8639 --- /dev/null +++ b/sentece2cls_bert.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +# coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz +# according https://github.com/huggingface/transformers/issues/1950 +from sqad_db import SqadDb +from sqad_db import id2word +import sys +import persistent.list +import transaction +from sentece_vector import add_vector_record +from sentece_vector import add_vector_text +from transformers import BertTokenizer, BertConfig, BertModel +import os +dir_path = os.path.dirname(os.path.realpath(__file__)) + + +class S2ClsBert: + def __init__(self): + """ + Load BERT models + """ + config = BertConfig.from_json_file( + f'{dir_path}/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json') + self.model = BertModel.from_pretrained( + f'{dir_path}/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin', + config=config, local_files_only=True) + + self.tokenizer = BertTokenizer.from_pretrained( + f'{dir_path}/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/') + + def get_sent_embedding(self, sentece, verbose=False): + """ + Get BERT sentence embedding from CLS token + :param sentece: + :param verbose: + :return: + """ + input_ids = self.tokenizer.encode(sentece, return_tensors="pt", add_special_tokens=True) + if verbose: + input_ids_2 = self.tokenizer.encode(sentece, add_special_tokens=True) + for i in input_ids_2: + print(f'{self.tokenizer.decode(i)} -> {i}') + outputs = self.model(input_ids) + cls_emb = outputs[0][0][0].detach().numpy() + return cls_emb + + +def compute_vector(data, model, vocabulary, ctx=False, verbose=False): + """ + Add BERT vector to sentece + :param data: dict + :param model: S2ClsBert object + :param vocabulary: dict + :param ctx: bool + :param verbose: bool + :return: None + """ + for sentece in data: + s_content = [] + for w_id in sentece['sent']: + s_content.append(id2word(vocabulary, w_id, parts='w')['word']) + sent_v = model.get_sent_embedding(s_content, verbose=verbose) + sentece['cls_bert'] = persistent.list.PersistentList(sent_v) + + if verbose: + print(f"{' '.join(s_content)}\t{sent_v}") + + if ctx: + for ctx_type, ctx_sents in sentece['ctx'].items(): + for phr in ctx_sents: + for sentece in phr: + phr_content = [] + for w_id in sentece['sent']: + phr_content.append(id2word(vocabulary, w_id, parts='w')['word']) + if phr_content: + phr_sent_v = model.get_sent_embedding(phr_content, verbose=verbose) + sentece['cls_bert'] = persistent.list.PersistentList(phr_sent_v) + else: + sentece['cls_bert'] = persistent.list.PersistentList([0]*768) + + + +def main(): + import argparse + parser = argparse.ArgumentParser(description='Compute CLS-BERT sentence embeddings') + parser.add_argument('-u', '--url', type=str, + required=False, 
default='', + help='Database URL') + parser.add_argument('-p', '--port', type=int, + required=False, default=None, + help='Server port') + parser.add_argument('-d', '--db_path', type=str, + required=False, default='', + help='Database path') + parser.add_argument('-v', '--verbose', action='store_true', + required=False, + help='Verbose mode') + + args = parser.parse_args() + + if (args.url and args.port) or args.db_path: + if args.url and args.port: + db = SqadDb(url=args.url, port=args.port) + elif args.db_path: + db = SqadDb(file_name=args.db_path) + else: + sys.stderr.write('Please specify --db_path or (--port and --url)') + sys.exit() + + model = S2ClsBert() + try: + add_vector_text(db, model, compute_vector, verbose=args.verbose) + add_vector_record(db, model, compute_vector, verbose=args.verbose) + db.update() + db._p_changed = True + transaction.commit() + db.close() + except KeyboardInterrupt: + db.close() + sys.exit() + +if __name__ == "__main__": + main() diff --git a/sentece2s_bert.py b/sentece2s_bert.py index 38a2d42601c1f9f4849597cfb561e076fb16d66d..6ffb901d4b87337fe96074dbd5f21f4a638593a7 100755 --- a/sentece2s_bert.py +++ b/sentece2s_bert.py @@ -1,30 +1,48 @@ #!/usr/bin/env python3 # coding: utf-8 - +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz # created according: https://github.com/UKPLab/sentence-transformers # models https://www.sbert.net/docs/pretrained_models.html + import sys from sentence_transformers import SentenceTransformer import persistent.list import transaction from sqad_db import SqadDb from sqad_db import id2word +from sentece_vector import add_vector_record +from sentece_vector import add_vector_text + +def compute_vector(data, model, vocabulary, ctx=False, verbose=False): + """ + Compute sentence BERT vector + :param data: dict + :param model: SentenceTransformer model + :param vocabulary: dict + :param ctx: bool + :param verbose: bool + :return: None + """ + for sentece in data: + s_content = [] + for w_id in sentece['sent']: + s_content.append(id2word(vocabulary, w_id, parts='w')['word']) + sent_v = model.encode([' '.join(s_content)]) + sentece['sbert'] = persistent.list.PersistentList(sent_v) -def add_vector(db, model, verbose=False): - vocabulary, _, kb = db.get_dicts() + if verbose: + print(f"{' '.join(s_content)}\t{sent_v}") - for url, text in kb.url2doc.items(): - for sentece in text['text']: - sent = [] - for w_id in sentece['sent']: - sent.append(id2word(vocabulary, w_id, parts='w')['word']) - sent_v = model.encode([' '.join(sent)]) - sentece['sbert'] = persistent.list.PersistentList(sent_v) - if verbose: - print(f"{' '.join(sent)}\t{sent_v}") - db._p_changed = True - transaction.commit() + if ctx: + for ctx_type, ctx_sents in sentece['ctx'].items(): + for phr in ctx_sents: + for sentece in phr: + phr_content = [] + for w_id in sentece['sent']: + phr_content.append(id2word(vocabulary, w_id, parts='w')['word']) + phr_sent_v = model.encode([' '.join(phr_content)]) + sentece['sbert'] = persistent.list.PersistentList(phr_sent_v) def main(): @@ -56,8 +74,8 @@ def main(): model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1') try: - add_vector(db, model, verbose=args.verbose) - + add_vector_text(db, model, compute_vector, verbose=args.verbose) + add_vector_record(db, model, compute_vector, verbose=args.verbose) db.update() db._p_changed = True transaction.commit() @@ -66,5 +84,6 @@ def main(): db.close() sys.exit() + if __name__ == "__main__": main() diff --git a/sqad2database.py b/sqad2database.py index 
5832a56c3823126c6e598885c7c827d126815a1e..853919d97f48f36dfec289a829e5c4106b45eefd 100755 --- a/sqad2database.py +++ b/sqad2database.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz import re import os -import datetime from sqad_db import SqadDb from sqad_db import Vocabulary from sqad_db import W2V @@ -25,6 +25,12 @@ hash_re = re.compile('^#$') def get_struct(data, struct_id): + """ + Get structure from vertical + :param data: str, vertical + :param struct_id: str + :return: str, senteces + """ struct = [] struct_start = False struct_end = False @@ -45,26 +51,52 @@ def get_struct(data, struct_id): def fill(data, rec_part, vocabulary, w2v): - for s in get_struct(data, 's'): - sent = persistent.list.PersistentList() - for token in s: - if not struct_re.match(token): - if hash_re.match(token.strip()): - word, lemma, tag = ('#', '#', '#') - else: - word, lemma, tag = token.strip().split('\t')[:3] - wid = word2id(vocabulary, word, lemma, tag, w2v) - sent.append(wid) - rec_part.append(sent) + """ + Fill content to database + :param data: str + :param rec_part: Record object + :param vocabulary: dict + :param w2v: word2vec model + :return: None + """ + for s in get_struct(data, 's'): + sent = persistent.list.PersistentList() + for token in s: + if not struct_re.match(token): + if hash_re.match(token.strip()): + word, lemma, tag = ('#', '#', '#') + else: + word, lemma, tag = token.strip().split('\t')[:3] + wid = word2id(vocabulary, word, lemma, tag, w2v) + sent.append(wid) + rec_part.append(BTree({'sent': sent})) def compare_sent(s1, s2): + """ + Compare two senteces + :param s1: str + :param s2: str + :return: bool + """ s1_words = [x.strip().split('\t')[0] for x in s1] s2_words = [x.strip().split('\t')[0] for x in s2] return s1_words == s2_words def add_text(knowledge_base, url, text_content, title, answer_selection_content, vocabulary, w2v, rec_id): + """ + Add text to Knowledge base + :param knowledge_base: kb object + :param url: str + :param text_content: str + :param title: str + :param answer_selection_content: str + :param vocabulary: dict + :param w2v: word2vec model + :param rec_id: str + :return: None + """ answer_sel_pos = -1 answer_selection = [] @@ -103,6 +135,12 @@ def add_text(knowledge_base, url, text_content, title, answer_selection_content, def fill_qa_type(data, qa_type): + """ + Fill question answer types + :param data: + :param qa_type: + :return: + """ q_type = -1 a_type = -1 for line in data: @@ -130,7 +168,6 @@ def main(): help='Version of database') args = parser.parse_args() - # db_name = 'sqad_db/{0}_{1:%d_%m_%Y-%H:%M:%S}'.format(args.name, datetime.datetime.now()) db_name = args.name db = SqadDb(file_name=db_name) rec_id_re = re.compile('(\d+)') diff --git a/sqad_db.py b/sqad_db.py index aa394656db185c478cc5210276b85eae84ad713c..ad037fd75ac1fa3c15e8776f461986439ac39284 100755 --- a/sqad_db.py +++ b/sqad_db.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 # coding: utf-8 +# Created by Marek MedveÄŹ - xmedved1@fi.muni.cz import ZODB import ZODB.FileStorage from ZEO import ClientStorage @@ -9,12 +10,22 @@ from BTrees.OOBTree import BTree from persistent import Persistent from get_vector import Word2vec import sys +import persistent.list # ============================================= # Vocabulary # ============================================= def word2id(vocabulary, word, lemma, tag, w2v): + """ + Transform word to specific ID + :param vocabulary: dict + :param word: str + :param lemma: str + :param tag: str + :param 
w2v: word2vec model + :return: int + """ if vocabulary.wlt2id.get(f'{word}|{lemma}|{tag}', None): return vocabulary.wlt2id[f'{word}|{lemma}|{tag}'] else: @@ -26,6 +37,14 @@ def word2id(vocabulary, word, lemma, tag, w2v): def id2word(vocabulary, key, parts='', preloaded=False): + """ + Transform ID to word, lemma, tag + :param vocabulary: dict + :param key: int + :param parts: str, word parts to be returned + :param preloaded: list + :return: all word inforamtion + """ result = {} word_parts = parts.strip().split(';') @@ -56,16 +75,27 @@ def id2word(vocabulary, key, parts='', preloaded=False): result['lemma'] = vocabulary.id2wlt[key]['lemma'] if 't' in word_parts or not parts: result['tag'] = vocabulary.id2wlt[key]['tag'] - if 'v100' in word_parts or not parts: - result['v100'] = vocabulary.vectors[key]['v100'] - if 'v300' in word_parts or not parts: - result['v300'] = vocabulary.vectors[key]['v300'] - if 'v500' in word_parts or not parts: - result['v500'] = vocabulary.vectors[key]['v500'] + + # Backwards compatibility + if isinstance(vocabulary.vectors[key], dict): # New + if 'v100' in word_parts or not parts: + result['v100'] = vocabulary.vectors[key]['v100'] + if 'v300' in word_parts or not parts: + result['v300'] = vocabulary.vectors[key]['v300'] + if 'v500' in word_parts or not parts: + result['v500'] = vocabulary.vectors[key]['v500'] + elif isinstance(vocabulary.vectors[key], persistent.list.PersistentList): # Old + if 'v100' in word_parts or not parts: + result['v100'] = vocabulary.vectors[key][0] + if 'v300' in word_parts or not parts: + result['v300'] = vocabulary.vectors[key][1] + if 'v500' in word_parts or not parts: + result['v500'] = vocabulary.vectors[key][2] + if 'v_bert' in word_parts or not parts: try: result['v_bert'] = vocabulary.vectors[key]['v_bert'] - except KeyError: + except (KeyError, TypeError): sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key @@ -73,12 +103,20 @@ def id2word(vocabulary, key, parts='', preloaded=False): class W2V: + """ + Add word vectors to each word inside vocabulary + """ def __init__(self, test=False): self.test = test if not self.test: self.w2v = Word2vec() def get_vect(self, word): + """ + Get word vector + :param word: str + :return: list + """ result = BTree() if self.test: result['v100'] = None @@ -91,10 +129,20 @@ class W2V: return result def add_vector(self, vocabulary, key, word): + """ + Add vector to word form vocabulary + :param vocabulary: dict + :param key: id + :param word: str + :return: None + """ vocabulary.vectors[key] = self.get_vect(word) class Vocabulary(Persistent): + """ + Vocabulary of all words inside Knowledge base wit all features: word, lemma, tag, vectors + """ def __init__(self): self.id2wlt = BTree() # key: id, value: word, lemma, tag self.wlt2id = BTree() # key: word, value: id @@ -102,6 +150,10 @@ class Vocabulary(Persistent): self.vectors = BTree() # key: word_id, value: v100, v300, v500 def new_id(self): + """ + Create specific id for each word + :return: int + """ self.key += 1 return self.key @@ -110,6 +162,12 @@ class Vocabulary(Persistent): # QA type # ============================================= def qt2id(qa_type, qt_type): + """ + Question type to ID + :param qa_type: dict + :param qt_type: str + :return: + """ if qa_type.t2id.get(qt_type, None): return qa_type.t2id[qt_type] else: @@ -120,10 +178,19 @@ def qt2id(qa_type, qt_type): def id2qt(qa_type, key): + """ + ID to question type + :param qa_type: dict + :param 
key: int, the ID + :return: + """ return qa_type.id2t.get(key, -1) class QAType(Persistent): + """ + Question type object to store question ans answer types + """ def __init__(self): self.id2t = BTree() # key: id, value: type self.t2id = BTree() # key: type, value: id @@ -138,14 +205,30 @@ class QAType(Persistent): # Knowledge base # ============================================= def present_in_kb(kb, url): + """ + Check if documentis in knowledge base + :param kb: dict, kb object + :param url: str + :return: bool + """ return kb.url2doc.get(url) def add2kb(kb, url, text): + """ + Add processed text to knowledge base + :param kb: dict, kb object + :param url: str + :param text: dict, text object + :return: None + """ kb.url2doc[url] = text class KnowledgeBase(Persistent): + """ + QuestKnowledgeBase object to store all knowledge extracted form raw text + """ def __init__(self): self.url2doc = BTree() @@ -154,6 +237,9 @@ class KnowledgeBase(Persistent): # Record # ============================================= class Record(Persistent): + """ + Record object to store all information about each record + """ def __init__(self, rec_id): self.rec_id = rec_id self.question = persistent.list.PersistentList() # List of sentences where each item contains list of words @@ -170,7 +256,17 @@ class Record(Persistent): # Sqad database # ============================================= class SqadDb: + """ + SQAD database in ZODB form + """ def __init__(self, file_name=None, read_only=False, url=None, port=None): + """ + Init ZODB database or connect database if already exists, via file or ZEO server + :param file_name: str + :param read_only: bool + :param url: str + :param port: str + """ if url and port and not file_name: addr = url, port @@ -192,39 +288,86 @@ class SqadDb: self.root = self.connection.root() def add(self, rec_id, record_object): + """ + Add new record to database + :param rec_id: int + :param record_object: dict + :return: None + """ self.root[rec_id] = record_object transaction.commit() def add_vocab(self, vocab): + """ + Add vocabulary object to database + :param vocab: dict + :return: None + """ self.root['__vocabulary__'] = vocab transaction.commit() def add_qa_type(self, qa_type): + """ + Add question/answer type object to database + :param qa_type: dict + :return: None + """ self.root['__qa_type__'] = qa_type transaction.commit() def add_kb(self, knowledge_base): + """ + Add knowledge base object to database + :param knowledge_base: dict + :return: None + """ self.root['__knowledge_base__'] = knowledge_base transaction.commit() def get_ctx_types(self): + """ + Add knowledge base object to database + :param knowledge_base: dict + :return: None + """ return self.root['__ctx_types__'] def get_dicts(self): + """ + Get vocabulary, question type and knowledge base objects + :return: tuple + """ return self.root['__vocabulary__'], self.root['__qa_type__'], self.root['__knowledge_base__'] def init_ctx_types(self): + """ + Init context types object + :return: None + """ self.root['__ctx_types__'] = persistent.list.PersistentList() transaction.commit() def version(self, version): + """ + Assagin database a new version + :param version: int + :return: None + """ self.root['__version__'] = version transaction.commit() def get_version(self): + """ + Get database version + :return: int + """ return self.root['__version__'] def update(self): + """ + Assign database a new update + :return: None + """ if self.root.get('__update__', None): self.root['__update__'] += 1 else: @@ -232,17 +375,38 @@ class 
SqadDb:
         transaction.commit()
 
     def get_update(self):
+        """
+        Get the database update counter
+        :return: int
+        """
         return self.root['__update__']
 
     def get_record(self, rec_id):
-        return self.root[rec_id]
+        """
+        Get a record by its ID
+        :param rec_id: int
+        :return: Record object
+        """
+        try:
+            return self.root[rec_id]
+        except KeyError:
+            sys.stderr.write(f'ERROR: Record {rec_id} not present in DB!')
+            sys.exit()
 
     def close(self):
+        """
+        Close the database connection
+        :return: None
+        """
         self.connection.close()
         self.db.close()
         self.storage.close()
 
-    def get_all_records(self):
+    def get_all_records_id(self):
+        """
+        Yield the IDs of all records present in the DB
+        :return: generator of record IDs
+        """
         for i in self.root:
             if i not in ['__vocabulary__', '__knowledge_base__', '__ctx_types__', '__qa_type__', '__update__', '__version__']: