diff --git a/Makefile b/Makefile index acc66b06101dfbb14670f69d104d7cce07a45ec8..fc4605424627fdef5ceb396ef6744329c4f96b95 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base VERSION=$(shell cat ./sqad_db/version) NEW_VERSION=$$(($(VERSION)+1)) -UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S") +#UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S") # Need to specify bash in order for conda activate to work. SHELL=/bin/bash @@ -10,47 +10,51 @@ CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activ create: printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log + cat ./Makefile >> $(DB_NAME).log echo $(NEW_VERSION) > ./sqad_db/version ($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log) echo "$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz" updates: - @echo "creating update $(DB) -> $(UPDB)" - cp $(DB) $(UPDB) - cp $(DB).index $(UPDB).index - cp $(DB).lock $(UPDB).lock - cp $(DB).tmp $(UPDB).tmp - cat ./Makefile >> $(UPDB).log + @echo "creating updates $(DB)" # Word Bert embeddings - printf "add bert embeddings\n=======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log) + ./make_copy.sh $(DB) $(DB)_Vbert + printf "add bert embeddings\n=======================\n" >> $(DB)_Vbert.log + ($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(DB)_Vbert 2>> $(DB)_Vbert.log) # Contains answer sentece - printf "Contains answer\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert $(DB)_Vbert_addAS + printf "Contains answer\n======================\n" >> $(DB)_Vbert_addAS.log + ($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(DB)_Vbert_addAS 2>> $(DB)_Vbert_addAS.log) # Similar sentences - printf "Similar answers\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert_addAS $(DB)_Vbert_addAS_simS + printf "Similar answers\n======================\n" >> $(DB)_Vbert_addAS_simS.log + ($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -n 0 -d $(DB)_Vbert_addAS_simS 2>> $(DB)_Vbert_addAS_simS.log) # Context NP - printf "Contex NP phrases context_window 3\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 3 --phr_per_sent "longest" 2>> $(UPDB).log) - printf "Contex NP phrases context_window 2\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 2 --phr_per_sent "longest" 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert_addAS_simS $(DB)_Vbert_addAS_simS_cNP + printf "Contex NP phrases context_window 3\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP.log + ($(CONDA_ACTIVATE) base; ./context_np.py --context_window 3 --phr_per_sent "longest" -d $(DB)_Vbert_addAS_simS_cNP 2>> $(DB)_Vbert_addAS_simS_cNP.log) + printf "Contex NP phrases context_window 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP.log + ($(CONDA_ACTIVATE) base; ./context_np.py --context_window 2 --phr_per_sent "longest" -d $(DB)_Vbert_addAS_simS_cNP 2>> $(DB)_Vbert_addAS_simS_cNP.log) # Context Previous sentences - printf "Context previous sentece 1\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 1 2>> $(UPDB).log) - printf "Context previous sentece 2\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 2 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert_addAS_simS_cNP $(DB)_Vbert_addAS_simS_cNP_cPS + printf "Context previous sentece 1\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS.log + ($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 1 -d $(DB)_Vbert_addAS_simS_cNP_cPS 2>> $(DB)_Vbert_addAS_simS_cNP_cPS.log) + printf "Context previous sentece 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS.log + ($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 2 -d $(DB)_Vbert_addAS_simS_cNP_cPS 2>> $(DB)_Vbert_addAS_simS_cNP_cPS.log) # Context NER - printf "Context wiki entity context_window 5\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log) - printf "Context wiki entity context_window 2\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS $(DB)_Vbert_addAS_simS_cNP_cPS_cNER + printf "Context wiki entity context_window 5\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log + ($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log) + printf "Context wiki entity context_window 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log + ($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log) # Sentece Bert - printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS_cNER $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert + printf "Sentece to sentece bert embedding\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log + ($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log) # CLS Bert - printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log - ($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log) + ./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert + printf "Sentece to cls bert embedding\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log + ($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log) echo "$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz" run_ZODB_server: diff --git a/add_bert_emberdings.py b/add_bert_emberdings.py index 1c977be4dde127ec7a031013cab4a5ffc6feaa97..43dce44d8a5d3eeff3f83474b0416df3a0824b92 100755 --- a/add_bert_emberdings.py +++ b/add_bert_emberdings.py @@ -22,7 +22,8 @@ class Bert_Embeddings: def word2embedding(self, word): input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True) output = self.model(input_ids) - return output[0][0][0] + + return output[0][0][0].detach().numpy() def add_bert_word_embeddings_word(vocabulary, model, db): diff --git a/query_database.py b/query_database.py index 02d1fa7993bf3d1f2ffb49b5ca07e9fbc0e0e706..dfed3d94ab783bc649833964d28930210038ad5a 100755 --- a/query_database.py +++ b/query_database.py @@ -52,7 +52,9 @@ def get_content_ctx(url, kb, vocabulary, part='', context_type='', preloaded=Fal for sentence in kb.url2doc.get(url)['text']: result.append({'sent': get_senence(sentence['sent'], vocabulary, part=part, preloaded=preloaded), 'ctx': get_ctx(sentence['ctx'], vocabulary, part=part, context_type=context_type, - preloaded=preloaded)}) + preloaded=preloaded), + 'sbert': sentence['sbert'], + 'cls_bert': sentence['cls_bert']}) return result @@ -87,8 +89,11 @@ def get_record(db, record_id, word_parts='', context_type='', vocabulary=None, q preloaded=preloaded) data['text'] = get_content_ctx(record.text, kb, vocabulary, part=word_parts, context_type=context_type, preloaded=preloaded) - data['contain_answer'] = len(record.similar_answers["sents_containing_ans_ext"]) - data['not_contain_answer'] = len(data['text'])-len(record.similar_answers["sents_containing_ans_ext"]) + try: + data['contain_answer'] = len(record.similar_answers["sents_containing_ans_ext"]) + data['not_contain_answer'] = len(data['text'])-len(record.similar_answers["sents_containing_ans_ext"]) + except KeyError: + sys.stderr.write('No sents_containing_ans_ext\n') return data @@ -142,9 +147,15 @@ def print_record(db, record_id, context_type=''): context_previous_senteces.print_ctx(phrs) print('No. text sentences that contain answer') - print(f'\t{len(record.similar_answers["sents_containing_ans_ext"])}') + try: + print(f'\t{len(record.similar_answers["sents_containing_ans_ext"])}') + except KeyError: + print('\tNo info') print('No. text sentences that does NOT contain answer') - print(f'\t{text_sents_total - len(record.similar_answers["sents_containing_ans_ext"])}') + try: + print(f'\t{text_sents_total - len(record.similar_answers["sents_containing_ans_ext"])}') + except KeyError: + print('\tNo info') def main(): diff --git a/sqad_db.py b/sqad_db.py index 9d83a167cb532faca9af80aa570ab1e0b4ff969a..aa394656db185c478cc5210276b85eae84ad713c 100755 --- a/sqad_db.py +++ b/sqad_db.py @@ -43,7 +43,10 @@ def id2word(vocabulary, key, parts='', preloaded=False): if 'v500' in word_parts or not parts: result['v500'] = vocabulary['vectors'][key]['v500'] if 'v_bert' in word_parts or not parts: - result['v_bert'] = vocabulary['vectors'][key]['v_bert'] + try: + result['v_bert'] = vocabulary['vectors'][key]['v_bert'] + except KeyError: + sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key else: @@ -60,7 +63,10 @@ def id2word(vocabulary, key, parts='', preloaded=False): if 'v500' in word_parts or not parts: result['v500'] = vocabulary.vectors[key]['v500'] if 'v_bert' in word_parts or not parts: - result['v_bert'] = vocabulary.vectors[key]['v_bert'] + try: + result['v_bert'] = vocabulary.vectors[key]['v_bert'] + except KeyError: + sys.stderr.write(f'ERROR: not "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n') if 'id' in word_parts or not parts: result['id'] = key return result