diff --git a/Makefile b/Makefile
index 4106f794a9eb40741f841d0ffd9781a8eba8f769..acc66b06101dfbb14670f69d104d7cce07a45ec8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,7 @@
 DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base
 VERSION=$(shell cat ./sqad_db/version)
 NEW_VERSION=$$(($(VERSION)+1))
+UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
 
 # Need to specify bash in order for conda activate to work.
 SHELL=/bin/bash
@@ -11,32 +12,46 @@ create:
 	printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log
 	echo $(NEW_VERSION) > ./sqad_db/version
 	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
-	echo "$(hostname)" | mail -s "Done sqad_db created" "marek.medved3@gmail.com"
+	echo "$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"
 
 updates:
-	UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
+	@echo "creating update $(DB) -> $(UPDB)"
 	cp $(DB) $(UPDB)
 	cp $(DB).index $(UPDB).index
 	cp $(DB).lock $(UPDB).lock
-	cp $(DB).log $(UPDB).log
 	cp $(DB).tmp $(UPDB).tmp
+	cat ./Makefile >> $(UPDB).log
+	# Word Bert embeddings
 	printf "add bert embeddings\n=======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) deeppavlov; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
+	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
+	# Contains answer sentence
 	printf "Contains answer\n======================\n" >> $(UPDB).log
 	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log)
+	# Similar sentences
 	printf "Similar answers\n======================\n" >> $(UPDB).log
 	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log)
-	printf "Contex NP frases\n======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) 2>> $(UPDB).log)
-	printf "Context previous sentece\n======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) 2>> $(UPDB).log)
-	printf "Context wiki entity\n======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
+	# Context NP
+	printf "Context NP phrases context_window 3\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 3 --phr_per_sent "longest" 2>> $(UPDB).log)
+	printf "Context NP phrases context_window 2\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 2 --phr_per_sent "longest" 2>> $(UPDB).log)
+	# Context Previous sentences
+	printf "Context previous sentence 1\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 1 2>> $(UPDB).log)
+	printf "Context previous sentence 2\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 2 2>> $(UPDB).log)
+	# Context NER
+	printf "Context wiki entity context_window 5\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
+	printf "Context wiki entity context_window 2\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
+	# Sentence Bert
 	printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log
 	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log)
-#	printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
-#	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
-	echo "$(hostname)" | mail -s "Done AQA job" "marek.medved3@gmail.com"
+	# CLS Bert
+	printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
+	echo "$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz"
 
 run_ZODB_server:
 	exec "/usr/bin/python3.6" -m "ZEO.runzeo" -C /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/zeo_server.conf
diff --git a/add_bert_emberdings.py b/add_bert_emberdings.py
index 9c75d8a1da55f9797398ff62ac8efee65a064c97..1c977be4dde127ec7a031013cab4a5ffc6feaa97 100755
--- a/add_bert_emberdings.py
+++ b/add_bert_emberdings.py
@@ -4,29 +4,25 @@ import sys
 from sqad_db import SqadDb
 import persistent.list
 import transaction
-from deeppavlov.core.common.file import read_json
-from deeppavlov import build_model, configs
+from transformers import BertTokenizer, BertConfig, BertModel
 
 
 # created according http://docs.deeppavlov.ai/en/master/features/models/bert.html #
 class Bert_Embeddings:
     def __init__(self):
-        bert_config = read_json(configs.embedder.bert_embedder)
-        bert_config['metadata']['variables'][
-            'BERT_PATH'] = '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt'
+        config = BertConfig.from_json_file(
+            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json')
+        self.model = BertModel.from_pretrained(
+            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin',
+            config=config, local_files_only=True)
 
-        # tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = self.model
-        self.model = build_model(bert_config)
+        self.tokenizer = BertTokenizer.from_pretrained(
+            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/')
 
     def word2embedding(self, word):
-        tokens, token_embs, _, _, _, _, _ = self.model(word)
-        return token_embs[0][0]
-
-    # def sent2embedding(self, sent):
-    #     sent = ''
-    #     tokens, _, _, _, _, sent_mean_embs, _ = self.model(sent)
-    #     pass
-    #
+        input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True)
+        output = self.model(input_ids)
+        return output[0][0][0]
 
 
 def add_bert_word_embeddings_word(vocabulary, model, db):
@@ -72,7 +68,6 @@ def main():
     vocabulary, _, kb = db.get_dicts()
     try:
         add_bert_word_embeddings_word(vocabulary, model, db)
-        # add_bert_word_embeddings_sent(vocabulary, kb, model)
         db.update()
         db._p_changed = True
         transaction.commit()
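For reference, the hunk above replaces DeepPavlov's bert_embedder with a Hugging Face transformers model loaded from the local checkpoint. Below is a minimal sketch of the same word-embedding lookup, assuming a recent transformers release and loading from the checkpoint directory rather than the bare pytorch_model.bin file; the shortened CKPT path is illustrative, not the project path.

# Sketch only: mirrors Bert_Embeddings.word2embedding above, not the project's exact code.
import torch
from transformers import BertConfig, BertModel, BertTokenizer

CKPT = 'bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt'  # illustrative local path

config = BertConfig.from_json_file(f'{CKPT}/bert_config.json')
model = BertModel.from_pretrained(CKPT, config=config, local_files_only=True)
tokenizer = BertTokenizer.from_pretrained(CKPT)
model.eval()

def word2embedding(word):
    # Encode a single word; the tokenizer adds [CLS]/[SEP] itself.
    input_ids = tokenizer.encode(word, return_tensors='pt', add_special_tokens=True)
    with torch.no_grad():
        output = model(input_ids)
    # First token of the last hidden layer, as in the hunk's output[0][0][0].
    return output.last_hidden_state[0, 0]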
diff --git a/add_contains_answer_sentences.py b/add_contains_answer_sentences.py
index edf52af8c91d58019b8cc7213364398a535a2477..f64af3964285ec8ba4f7acdf745790f57c35b34d 100755
--- a/add_contains_answer_sentences.py
+++ b/add_contains_answer_sentences.py
@@ -35,7 +35,7 @@ def find_sentences_containing_answer(db):
 
 def main():
     import argparse
-    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+    parser = argparse.ArgumentParser(description='Sentences containing exact answer.')
     parser.add_argument('-u', '--url', type=str,
                         required=False, default='',
                         help='Database URL')
diff --git a/add_similar_senteces.py b/add_similar_senteces.py
index ffc3a4180b4fdfc6f60fee9ae97af88d75647fbc..712efba92ed550fc323d26d836bd25b91c1b6d3b 100755
--- a/add_similar_senteces.py
+++ b/add_similar_senteces.py
@@ -121,10 +121,11 @@ def find_similar_senteces(db, tf_idf):
 
 def main():
     import argparse
-    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+    parser = argparse.ArgumentParser(description='Similar sentences using TF-IDF')
     parser.add_argument('-n', '--number', type=int,
                         required=False, default=100,
-                        help='Number of previous sentences as a context. For all similar sentences put "0".')
+                        help='Number of the most similar sentences to keep as context.'
+                             ' For all similar sentences put "0".')
     parser.add_argument('-u', '--url', type=str,
                         required=False, default='',
                         help='Database URL')
@@ -164,7 +165,7 @@ def main():
         if args.number == 0:
             record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences)
         else:
-            record.similar_answers[f'sents_similar_{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
+            record.similar_answers[f'sents_similar_w{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
 
         db._p_changed = True
         transaction.commit()
diff --git a/context_ner.py b/context_ner.py
index 8d7b615aa0ddccb7e096485df6535619fe3c12e8..d8d2cc97dc8d4ccc6e6189dbb1de637d41543dd1 100755
--- a/context_ner.py
+++ b/context_ner.py
@@ -39,7 +39,7 @@ def ner_phrases(text, context_window, model):
             sent_w = ' '.join([x["word"] for x in sent])
 
             w_idx = 0
-            print(f'sent_w: {sent_w}\n')
+            # print(f'sent_w: {sent_w}\n')
             ner_out = model.predict(sent_w)
             ner_out = normalize_out(ner_out, sent)
 
@@ -109,8 +109,8 @@ def add_ner(db, context_window, model, verbose=False):
                 for phrs in phrases[sent_num]:
                     for phr in phrs:
                         print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
-            if not sent['ctx'].get(f'ctx_ner_{context_window}'):
-                sent['ctx'][f'ctx_ner_{context_window}'] = phrases[sent_num]
+            if not sent['ctx'].get(f'ctx_ner_w{context_window}'):
+                sent['ctx'][f'ctx_ner_w{context_window}'] = phrases[sent_num]
 
         db._p_changed = True
         transaction.commit()
@@ -175,7 +175,7 @@ def main():
         add_ner(db, args.context_window, model, args.verbose)
 
-        db.root['__ctx_types__'].append(f'ctx_ner_{args.context_window}')
+        db.root['__ctx_types__'].append(f'ctx_ner_w{args.context_window}')
         db.update()
         db._p_changed = True
         transaction.commit()
diff --git a/context_np.py b/context_np.py
index 1555463a1891a8d83933265e328bbc0776e9cea9..ce6e0dec6107cb7fa8417345f0575378b7a9afa2 100755
--- a/context_np.py
+++ b/context_np.py
@@ -37,8 +37,27 @@ class SetInterface:
         return s.get_marx_phrases_vert(filter_phr=True)
 # ==================================================
 
-
-def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v):
+def filter_longest(phrases):
+    """
+    Filter only the longest phrases. No intersection between results.
+    :param phrases: list of phrases
+    :return:
+    """
+    to_remove_idx = []
+    for idx_i, i in enumerate(phrases):
+        for idx_j, j in enumerate(phrases):
+            if idx_i != idx_j:
+                if i in j:
+                    to_remove_idx.append(idx_i)
+    result = []
+    for idx, x in enumerate(phrases):
+        if idx not in to_remove_idx:
+            result.append(x)
+
+    return result
+
+
+def name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v):
     set_parser = SetInterface("/nlp/projekty/set/set/grammar.set")
     text_context = persistent.list.PersistentList()
     # Read file and create phrases for all sentences
@@ -62,15 +81,17 @@ def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
                 wid = word2id(vocabulary, word, lemma, tag, w2v)
                 phr.append(wid)
             phrases.append(phr)
-
-        phrases_per_sentence.append(phrases)
+        if phr_per_sent == 'longest':
+            phrases_per_sentence.append(filter_longest(phrases))
+        else:
+            phrases_per_sentence.append(phrases)
 
     # Crating context according to args.context_length and args.number
     for curr_sent_pos in range(len(phrases_per_sentence)):
         context_phrases = persistent.list.PersistentList()
         context_position = curr_sent_pos - 1
         while (context_position >= 0) and (curr_sent_pos - context_position <= context_window):
-            context_phrases.append(phrases_per_sentence[context_position][:num_phr_per_sent])
+            context_phrases.append(phrases_per_sentence[context_position])
             context_position -= 1
 
         # Title as a context for first sentence in document
@@ -88,8 +109,8 @@ def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
     return text_context
 
 
-def get_context(text, title, vocabulary, context_window, num_phr_per_sent, w2v):
-    nps = name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
+def get_context(text, title, vocabulary, context_window, phr_per_sent, w2v):
+    nps = name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v)
     return nps
 
 
@@ -117,7 +138,7 @@ def get_title(text_by_url, vocabulary):
     return vert
 
 
-def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
+def add_np_phrases(db, context_window, phr_per_sent, w2v, verbose=False):
     vocabulary, qa_type_dict, kb = db.get_dicts()
     for url, text in kb.url2doc.items():
         if verbose:
@@ -126,7 +147,7 @@ def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
         text_vert = get_text_vert(text['text'], vocabulary)
         phrases = get_context(text_vert, text_title_vert, vocabulary,
-                              context_window, num_phr_per_sent, w2v)
+                              context_window, phr_per_sent, w2v)
 
         for sent_num, sent in enumerate(text['text']):
             if verbose:
@@ -134,8 +155,8 @@ def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
                 for phrs in phrases[sent_num]:
                     for phr in phrs:
                         print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
-            if not sent['ctx'].get(f'name_phrs_w{context_window}_n{num_phr_per_sent}'):
-                sent['ctx'][f'name_phrs_w{context_window}_n{num_phr_per_sent}'] = phrases[sent_num]
+            if not sent['ctx'].get(f'name_phrs_w{context_window}_t{phr_per_sent}'):
+                sent['ctx'][f'name_phrs_w{context_window}_t{phr_per_sent}'] = phrases[sent_num]
 
         db._p_changed = True
         transaction.commit()
@@ -168,9 +189,9 @@ def main():
     parser.add_argument('--context_window', type=int,
                         required=False, default=5,
                         help='How many sentences account for context.')
-    parser.add_argument('--num_phr_per_sent', type=int,
-                        required=False, default=5,
-                        help='How many phrases are taken form each sentence within context window.')
+    parser.add_argument('--phr_per_sent', type=str,
+                        required=False, default='longest',
+                        help='Category of phrases to use as context. Available: all, longest')
     parser.add_argument('--test', action='store_true',
                         required=False, default=False,
                         help='Dont load vectors - for testing purposes.')
@@ -199,9 +220,9 @@ def main():
         sys.exit()
 
     try:
-        add_np_phrases(db, args.context_window, args.num_phr_per_sent, w2v, args.verbose)
+        add_np_phrases(db, args.context_window, args.phr_per_sent, w2v, args.verbose)
 
-        db.root['__ctx_types__'].append(f'name_phrs_w{args.context_window}_n{args.num_phr_per_sent}')
+        db.root['__ctx_types__'].append(f'name_phrs_w{args.context_window}_t{args.phr_per_sent}')
         db.update()
         db._p_changed = True
         transaction.commit()
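The new filter_longest helper is meant to keep only maximal phrases, so that --phr_per_sent "longest" stores fewer, non-redundant context phrases per sentence. Since phrases here are lists of word ids, the containment the docstring describes amounts to a contiguous-sublist test; the following is a small hypothetical sketch of that idea (illustrative names, not the project's code):

def is_sublist(short, long):
    # True if `short` occurs as a contiguous run inside `long`.
    if len(short) > len(long):
        return False
    return any(long[i:i + len(short)] == short
               for i in range(len(long) - len(short) + 1))

def keep_longest(phrases):
    # Drop every phrase strictly contained in a longer phrase of the same sentence.
    kept = []
    for i, phr in enumerate(phrases):
        covered = any(j != i and len(other) > len(phr) and is_sublist(phr, other)
                      for j, other in enumerate(phrases))
        if not covered:
            kept.append(phr)
    return kept

# e.g. with word-id phrases:
# keep_longest([[5, 7], [5, 7, 9], [12]]) == [[5, 7, 9], [12]]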
diff --git a/context_previous_senteces.py b/context_previous_senteces.py
index 4ec0b227dce9cd0e91e8643083260bafcd433cf7..a676232128f3b2142a6c9111034b9a3e2edc3e19 100755
--- a/context_previous_senteces.py
+++ b/context_previous_senteces.py
@@ -15,15 +15,15 @@ def add_ctx(db, number, verbose=False):
             if verbose:
                 print(f"s:{' '.join([id2word(vocabulary, x)['word'] for x in sent['sent']])}")
-            if not sent['ctx'].get(f'prev_sent_n{number}'):
+            if not sent['ctx'].get(f'prev_sent_w{number}'):
                 if sent_num == 0:
                     if verbose:
                         print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['title'][0]])}")
-                    sent['ctx'][f'prev_sent_n{number}'] = persistent.list.PersistentList([text['title'][0]])
+                    sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList([text['title'][0]])
                 else:
                     if verbose:
                         print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['text'][sent_num - 1]['sent']])}")
-                    sent['ctx'][f'prev_sent_n{number}'] = persistent.list.PersistentList([text['text'][sent_num - 1]['sent']])
+                    sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList([text['text'][sent_num - 1]['sent']])
 
         db._p_changed = True
         transaction.commit()
@@ -77,7 +77,7 @@ def main():
     try:
         add_ctx(db, args.number, args.verbose)
 
-        db.root['__ctx_types__'].append(f'prev_sent_n{args.number}')
+        db.root['__ctx_types__'].append(f'prev_sent_w{args.number}')
        db.update()
         db._p_changed = True
         transaction.commit()
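Taken together with the Makefile's updates target, the renamed keys mean a full run would register context types along these lines (derived from the invocations above, not captured from a real run):

# Context types appended to db.root['__ctx_types__'] by the `updates` target after this change.
expected_ctx_types = [
    'name_phrs_w3_tlongest',  # context_np.py --context_window 3 --phr_per_sent "longest"
    'name_phrs_w2_tlongest',  # context_np.py --context_window 2 --phr_per_sent "longest"
    'prev_sent_w1',           # context_previous_senteces.py --number 1
    'prev_sent_w2',           # context_previous_senteces.py --number 2
    'ctx_ner_w5',             # context_ner.py --context_window 5
    'ctx_ner_w2',             # context_ner.py --context_window 2
]
# Similar-sentence lists are stored per record instead:
#   record.similar_answers['sents_similar']        when --number is 0 (as in the Makefile)
#   record.similar_answers[f'sents_similar_w{n}']  when --number is n > 0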