Makefile  +28 −13

 DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base
 VERSION=$(shell cat ./sqad_db/version)
 NEW_VERSION=$$(($(VERSION)+1))
+UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
 # Need to specify bash in order for conda activate to work.
 SHELL=/bin/bash

@@ -11,32 +12,46 @@
 create:
 	printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log
 	echo $(NEW_VERSION) > ./sqad_db/version
 	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
-	echo "$(hostname)" | mail -s "Done sqad_db created" "marek.medved3@gmail.com"
+	echo "$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"

 updates:
-	UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
 	@echo "creating update $(DB) -> $(UPDB)"
 	cp $(DB) $(UPDB)
 	cp $(DB).index $(UPDB).index
 	cp $(DB).lock $(UPDB).lock
 	cp $(DB).log $(UPDB).log
 	cp $(DB).tmp $(UPDB).tmp
 	cat ./Makefile >> $(UPDB).log
 	# Word Bert embeddings
 	printf "add bert embeddings\n=======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) deeppavlov; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
+	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
 	# Contains answer sentece
 	printf "Contains answer\n======================\n" >> $(UPDB).log
 	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log)
 	# Similar sentences
 	printf "Similar answers\n======================\n" >> $(UPDB).log
 	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log)
-	printf "Contex NP frases\n======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) 2>> $(UPDB).log)
-	printf "Context previous sentece\n======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) 2>> $(UPDB).log)
-	printf "Context wiki entity\n======================\n" >> $(UPDB).log
-	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
+	# Context NP
+	printf "Contex NP phrases context_window 3\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 3 --phr_per_sent "longest" 2>> $(UPDB).log)
+	printf "Contex NP phrases context_window 2\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 2 --phr_per_sent "longest" 2>> $(UPDB).log)
+	# Context Previous sentences
+	printf "Context previous sentece 1\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 1 2>> $(UPDB).log)
+	printf "Context previous sentece 2\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 2 2>> $(UPDB).log)
+	# Context NER
+	printf "Context wiki entity context_window 5\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
+	printf "Context wiki entity context_window 2\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
 	# Sentece Bert
 	printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log
 	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log)
-	# printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
-	# ($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
-	echo "$(hostname)" | mail -s "Done AQA job" "marek.medved3@gmail.com"
+	# CLS Bert
+	printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
+	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
+	echo "$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz"

 run_ZODB_server:
 	exec "/usr/bin/python3.6" -m "ZEO.runzeo" -C /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/zeo_server.conf
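Every script run by the `updates` target mutates the same ZODB storage, which `run_ZODB_server` serves over ZEO, and each mutation ends with a `transaction.commit()`. A minimal sketch of that client pattern, assuming a ZEO server on localhost port 8100 (the real address lives in `zeo_server.conf`, which is not part of this diff) and using plain ZEO/ZODB calls instead of the project's `SqadDb` wrapper:

    # Minimal ZEO client sketch -- the ('localhost', 8100) address is a
    # placeholder; the scripts above go through the SqadDb wrapper instead.
    import ZEO
    import ZODB
    import transaction
    import persistent.list

    client = ZEO.client(('localhost', 8100))  # storage served by run_ZODB_server
    db = ZODB.DB(client)
    conn = db.open()
    root = conn.root()

    # Containers stored in ZODB must be persistence-aware types, hence the
    # persistent.list.PersistentList used throughout these scripts.
    root['example'] = persistent.list.PersistentList(['a', 'b'])
    transaction.commit()  # nothing is durable until commit
    db.close()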
add_bert_emberdings.py  +11 −16

@@ -4,29 +4,25 @@
 import sys
 from sqad_db import SqadDb
 import persistent.list
 import transaction
-from deeppavlov.core.common.file import read_json
-from deeppavlov import build_model, configs
+from transformers import BertTokenizer, BertConfig, BertModel

-# created according http://docs.deeppavlov.ai/en/master/features/models/bert.html
 class Bert_Embeddings:
     def __init__(self):
-        bert_config = read_json(configs.embedder.bert_embedder)
-        bert_config['metadata']['variables'][
-            'BERT_PATH'] = '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt'
-        # tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = self.model
-        self.model = build_model(bert_config)
+        config = BertConfig.from_json_file(
+            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json')
+        self.model = BertModel.from_pretrained(
+            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin',
+            config=config, local_files_only=True)
+        self.tokenizer = BertTokenizer.from_pretrained(
+            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/')

     def word2embedding(self, word):
-        tokens, token_embs, _, _, _, _, _ = self.model(word)
-        return token_embs[0][0]
-
-    # def sent2embedding(self, sent):
-    #     sent = ''
-    #     tokens, _, _, _, _, sent_mean_embs, _ = self.model(sent)
-    #     pass
+        input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True)
+        output = self.model(input_ids)
+        return output[0][0][0]

@@ -72,7 +68,6 @@ def main():
     vocabulary, _, kb = db.get_dicts()
     try:
         add_bert_word_embeddings_word(vocabulary, model, db)
-        # add_bert_word_embeddings_sent(vocabulary, kb, model)
         db.update()
         db._p_changed = True
         transaction.commit()
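The rewritten `word2embedding` replaces DeepPavlov's `build_model` pipeline with a plain `transformers` forward pass: encode the word, run the model, and return the hidden state of the first token. Note that passing `["[CLS]", word]` to `encode()` while also setting `add_special_tokens=True` likely yields a doubled `[CLS]`; the sketch below lets the tokenizer insert the special tokens itself. It uses the public `bert-base-multilingual-cased` checkpoint as a stand-in for the local `bg_cs_pl_ru_cased_L-12_H-768_A-12_pt` directory loaded by the script:

    # Sketch of the new transformers-based embedding path; the checkpoint name
    # is a public stand-in for the script's local Slavic BERT model.
    import torch
    from transformers import BertTokenizer, BertModel

    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = BertModel.from_pretrained("bert-base-multilingual-cased")
    model.eval()

    def word2embedding(word):
        # encode() adds [CLS]/[SEP]; output[0] is the last hidden state of
        # shape (batch, seq_len, hidden), so [0][0][0] is the [CLS] vector.
        input_ids = tokenizer.encode(word, return_tensors="pt", add_special_tokens=True)
        with torch.no_grad():
            output = model(input_ids)
        return output[0][0][0]

    print(word2embedding("Praha").shape)  # torch.Size([768])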
add_contains_answer_sentences.py  +1 −1

@@ -35,7 +35,7 @@ def find_sentences_containing_answer(db):
 def main():
     import argparse
-    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+    parser = argparse.ArgumentParser(description='Sentences containing exact answer.')
     parser.add_argument('-u', '--url', type=str, required=False,
                         default='',
                         help='Database URL')

add_similar_senteces.py  +4 −3

@@ -121,10 +121,11 @@ def find_similar_senteces(db, tf_idf):
 def main():
     import argparse
-    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+    parser = argparse.ArgumentParser(description='Similar senteces using TF-IDF')
     parser.add_argument('-n', '--number', type=int, required=False, default=100,
-                        help='Number of previous sentences as a context. For all similar sentences put "0".')
+                        help='Number of previous sentences used for seraching similar senteces.'
+                             ' For all similar sentences put "0".')
     parser.add_argument('-u', '--url', type=str, required=False,
                         default='',
                         help='Database URL')

@@ -164,7 +165,7 @@ def main():
         if args.number == 0:
             record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences)
         else:
-            record.similar_answers[f'sents_similar_{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
+            record.similar_answers[f'sents_similar_w{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
     db._p_changed = True
     transaction.commit()

context_ner.py  +4 −4

@@ -39,7 +39,7 @@ def ner_phrases(text, context_window, model):
         sent_w = ' '.join([x["word"] for x in sent])
         w_idx = 0
-        print(f'sent_w: {sent_w}\n')
+        # print(f'sent_w: {sent_w}\n')
         ner_out = model.predict(sent_w)
         ner_out = normalize_out(ner_out, sent)

@@ -109,8 +109,8 @@ def add_ner(db, context_window, model, verbose=False):
                 for phrs in phrases[sent_num]:
                     for phr in phrs:
                         print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
-        if not sent['ctx'].get(f'ctx_ner_{context_window}'):
-            sent['ctx'][f'ctx_ner_{context_window}'] = phrases[sent_num]
+        if not sent['ctx'].get(f'ctx_ner_w{context_window}'):
+            sent['ctx'][f'ctx_ner_w{context_window}'] = phrases[sent_num]
         db._p_changed = True
         transaction.commit()

@@ -175,7 +175,7 @@ def main():
     add_ner(db, args.context_window, model, args.verbose)
-    db.root['__ctx_types__'].append(f'ctx_ner_{args.context_window}')
+    db.root['__ctx_types__'].append(f'ctx_ner_w{args.context_window}')
     db.update()
     db._p_changed = True
     transaction.commit()
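The `add_similar_senteces.py` change above ranks candidate sentences by TF-IDF similarity (its `find_similar_senteces` body is outside this diff) and now stores the top `--number` hits under the window-tagged `sents_similar_w{n}` key, so lists built with different limits no longer collide, matching the `ctx_ner_w{n}` rename in `context_ner.py`. A rough sketch of that ranking step, using scikit-learn purely to illustrate the technique rather than the script's actual implementation over the SqadDb vocabulary:

    # Illustrative TF-IDF similarity ranking -- scikit-learn stand-in only.
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    answer = "Praha je hlavní město České republiky."
    candidates = [
        "Hlavním městem České republiky je Praha.",
        "Brno je druhé největší město.",
        "Praha leží na Vltavě.",
    ]

    # Fit one TF-IDF space over the answer plus all candidates, then score
    # each candidate against the answer row.
    vectors = TfidfVectorizer().fit_transform([answer] + candidates)
    scores = cosine_similarity(vectors[0], vectors[1:])[0]

    number = 2  # mirrors --number; 0 would keep the whole ranking
    ranked = sorted(zip(scores, candidates), reverse=True)
    top = ranked[:number] if number else ranked  # stored as f'sents_similar_w{number}'
    for score, sent in top:
        print(f"{score:.3f}  {sent}")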