Commit bcf2beb9 authored by Marek Medved

s_bert and cls_bert for KB

parent 08881ebb
DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base
VERSION=$(shell cat ./sqad_db/version)
NEW_VERSION=$$(($(VERSION)+1))
UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
#UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
# Need to specify bash in order for conda activate to work.
SHELL=/bin/bash
@@ -10,47 +10,51 @@ CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate
create:
	printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log
	cat ./Makefile >> $(DB_NAME).log
	echo $(NEW_VERSION) > ./sqad_db/version
	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
	echo "$$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"
updates:
	@echo "creating update $(DB) -> $(UPDB)"
	cp $(DB) $(UPDB)
	cp $(DB).index $(UPDB).index
	cp $(DB).lock $(UPDB).lock
	cp $(DB).tmp $(UPDB).tmp
	cat ./Makefile >> $(UPDB).log
	@echo "creating updates $(DB)"
# Word Bert embeddings
	printf "add bert embeddings\n=======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
	./make_copy.sh $(DB) $(DB)_Vbert
	printf "add bert embeddings\n=======================\n" >> $(DB)_Vbert.log
	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(DB)_Vbert 2>> $(DB)_Vbert.log)
# Contains answer sentence
	printf "Contains answer\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert $(DB)_Vbert_addAS
	printf "Contains answer\n======================\n" >> $(DB)_Vbert_addAS.log
	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(DB)_Vbert_addAS 2>> $(DB)_Vbert_addAS.log)
# Similar sentences
	printf "Similar answers\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert_addAS $(DB)_Vbert_addAS_simS
	printf "Similar answers\n======================\n" >> $(DB)_Vbert_addAS_simS.log
	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -n 0 -d $(DB)_Vbert_addAS_simS 2>> $(DB)_Vbert_addAS_simS.log)
# Context NP
	printf "Context NP phrases context_window 3\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 3 --phr_per_sent "longest" 2>> $(UPDB).log)
	printf "Context NP phrases context_window 2\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 2 --phr_per_sent "longest" 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert_addAS_simS $(DB)_Vbert_addAS_simS_cNP
	printf "Context NP phrases context_window 3\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP.log
	($(CONDA_ACTIVATE) base; ./context_np.py --context_window 3 --phr_per_sent "longest" -d $(DB)_Vbert_addAS_simS_cNP 2>> $(DB)_Vbert_addAS_simS_cNP.log)
	printf "Context NP phrases context_window 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP.log
	($(CONDA_ACTIVATE) base; ./context_np.py --context_window 2 --phr_per_sent "longest" -d $(DB)_Vbert_addAS_simS_cNP 2>> $(DB)_Vbert_addAS_simS_cNP.log)
# Context Previous sentences
	printf "Context previous sentence 1\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 1 2>> $(UPDB).log)
	printf "Context previous sentence 2\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 2 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP $(DB)_Vbert_addAS_simS_cNP_cPS
	printf "Context previous sentence 1\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS.log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 1 -d $(DB)_Vbert_addAS_simS_cNP_cPS 2>> $(DB)_Vbert_addAS_simS_cNP_cPS.log)
	printf "Context previous sentence 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS.log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py --number 2 -d $(DB)_Vbert_addAS_simS_cNP_cPS 2>> $(DB)_Vbert_addAS_simS_cNP_cPS.log)
# Context NER
	printf "Context wiki entity context_window 5\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
	printf "Context wiki entity context_window 2\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS $(DB)_Vbert_addAS_simS_cNP_cPS_cNER
	printf "Context wiki entity context_window 5\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log)
	printf "Context wiki entity context_window 2\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -m named_entity_recognition/BERT-NER/ner_model_cz/ -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER.log)
# Sentence Bert
	printf "Sentence to sentence bert embedding\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS_cNER $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert
	printf "Sentence to sentence bert embedding\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log
	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert.log)
# CLS Bert
	printf "Sentence to cls bert embedding\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
	./make_copy.sh $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert
	printf "Sentence to cls bert embedding\n======================\n" >> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log
	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert 2>> $(DB)_Vbert_addAS_simS_cNP_cPS_cNER_sBert_clsBert.log)
	echo "$$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz"
run_ZODB_server:
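The two new pipeline steps (sentece2s_bert.py and sentece2cls_bert.py) store per-sentence embeddings in the knowledge base; the scripts themselves are not part of this diff. The sketch below shows one plausible way such vectors are produced, assuming a Sentence-BERT model for the 'sbert' vector and the [CLS] token of a plain BERT encoder for 'cls_bert'; the model names and the embed_* helpers are illustrative, not taken from the repository.

# Hypothetical sketch, not the repository's sentece2s_bert.py / sentece2cls_bert.py.
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

# 'sbert'-style vector: pooled sentence embedding from a Sentence-BERT model.
s_bert_model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")  # model choice is an assumption

def embed_s_bert(sentence: str) -> np.ndarray:
    return s_bert_model.encode(sentence)

# 'cls_bert'-style vector: the [CLS] position of a vanilla BERT encoder.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")  # model choice is an assumption
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

def embed_cls_bert(sentence: str) -> np.ndarray:
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # last_hidden_state[0, 0] is the [CLS] token; convert to numpy before storing in ZODB.
    return outputs.last_hidden_state[0, 0].numpy()

Each sentence record in the KB would then carry both vectors, which is what the new 'sbert' and 'cls_bert' fields read back in get_content_ctx below rely on.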
@@ -22,7 +22,8 @@ class Bert_Embeddings:
     def word2embedding(self, word):
         input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True)
         output = self.model(input_ids)
-        return output[0][0][0]
+        return output[0][0][0].detach().numpy()

 def add_bert_word_embeddings_word(vocabulary, model, db):
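The only functional change in word2embedding is converting the returned torch tensor to a plain numpy array before it is stored. A minimal usage sketch, assuming Bert_Embeddings wraps a HuggingFace tokenizer and model as the self.tokenizer / self.model attributes above suggest (constructor arguments are not shown in this hunk):

emb = Bert_Embeddings()            # hypothetical call; the real constructor arguments are not shown in the diff
vec = emb.word2embedding("Brno")   # now a numpy.ndarray rather than a torch.Tensor
print(vec.shape)                   # e.g. (768,) for a BERT-base model

Detaching from the autograd graph and converting to numpy keeps the stored vocabulary vectors small and independent of torch when they are later pickled into the ZODB database.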
@@ -52,7 +52,9 @@ def get_content_ctx(url, kb, vocabulary, part='', context_type='', preloaded=False
     for sentence in kb.url2doc.get(url)['text']:
         result.append({'sent': get_senence(sentence['sent'], vocabulary, part=part, preloaded=preloaded),
                        'ctx': get_ctx(sentence['ctx'], vocabulary, part=part, context_type=context_type,
-                                      preloaded=preloaded)})
+                                      preloaded=preloaded),
+                       'sbert': sentence['sbert'],
+                       'cls_bert': sentence['cls_bert']})
     return result
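Each item returned by get_content_ctx now also carries the precomputed sentence-level vectors stored in the KB. A hypothetical consumer might use them like this; rank_by_sbert and the question vector q_vec are illustrative and not part of this commit:

import numpy as np

def rank_by_sbert(records, q_vec):
    # Order document sentences by cosine similarity of their 'sbert' vector to a question vector.
    def cos(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))
    return sorted(records, key=lambda r: cos(r["sbert"], q_vec), reverse=True)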
@@ -87,8 +89,11 @@ def get_record(db, record_id, word_parts='', context_type='', vocabulary=None, q
                                        preloaded=preloaded)
     data['text'] = get_content_ctx(record.text, kb, vocabulary, part=word_parts, context_type=context_type,
                                    preloaded=preloaded)
-    data['contain_answer'] = len(record.similar_answers["sents_containing_ans_ext"])
-    data['not_contain_answer'] = len(data['text'])-len(record.similar_answers["sents_containing_ans_ext"])
+    try:
+        data['contain_answer'] = len(record.similar_answers["sents_containing_ans_ext"])
+        data['not_contain_answer'] = len(data['text'])-len(record.similar_answers["sents_containing_ans_ext"])
+    except KeyError:
+        sys.stderr.write('No sents_containing_ans_ext\n')
     return data
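Databases built before the contains-answer update have no "sents_containing_ans_ext" key, hence the new guard. An equivalent formulation with .get(), assuming record.similar_answers supports dict-style access (sys is already imported in this module), would be:

sents = record.similar_answers.get("sents_containing_ans_ext")
if sents is not None:
    data['contain_answer'] = len(sents)
    data['not_contain_answer'] = len(data['text']) - len(sents)
else:
    sys.stderr.write('No sents_containing_ans_ext\n')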
@@ -142,9 +147,15 @@ def print_record(db, record_id, context_type=''):
         context_previous_senteces.print_ctx(phrs)
     print('No. text sentences that contain answer')
-    print(f'\t{len(record.similar_answers["sents_containing_ans_ext"])}')
+    try:
+        print(f'\t{len(record.similar_answers["sents_containing_ans_ext"])}')
+    except KeyError:
+        print('\tNo info')
     print('No. text sentences that do NOT contain answer')
-    print(f'\t{text_sents_total - len(record.similar_answers["sents_containing_ans_ext"])}')
+    try:
+        print(f'\t{text_sents_total - len(record.similar_answers["sents_containing_ans_ext"])}')
+    except KeyError:
+        print('\tNo info')

 def main():
@@ -43,7 +43,10 @@ def id2word(vocabulary, key, parts='', preloaded=False):
         if 'v500' in word_parts or not parts:
             result['v500'] = vocabulary['vectors'][key]['v500']
         if 'v_bert' in word_parts or not parts:
-            result['v_bert'] = vocabulary['vectors'][key]['v_bert']
+            try:
+                result['v_bert'] = vocabulary['vectors'][key]['v_bert']
+            except KeyError:
+                sys.stderr.write(f'ERROR: no "v_bert" for: {vocabulary["id2wlt"][key]["word"]}\n')
         if 'id' in word_parts or not parts:
             result['id'] = key
     else:
@@ -60,7 +63,10 @@ def id2word(vocabulary, key, parts='', preloaded=False):
         if 'v500' in word_parts or not parts:
             result['v500'] = vocabulary.vectors[key]['v500']
         if 'v_bert' in word_parts or not parts:
-            result['v_bert'] = vocabulary.vectors[key]['v_bert']
+            try:
+                result['v_bert'] = vocabulary.vectors[key]['v_bert']
+            except KeyError:
+                sys.stderr.write(f'ERROR: no "v_bert" for: {vocabulary.id2wlt[key]["word"]}\n')
         if 'id' in word_parts or not parts:
             result['id'] = key
     return result