Commit 08881ebb authored by Marek Medved's avatar Marek Medved
Browse files

Sentence BERT via CLS plus normalization of names

parent 830c3724
Loading
Loading
Loading
Loading
+28 −13
Original line number Diff line number Diff line
# Path for a freshly built DB, stamped with the creation time.
# NOTE(review): recursive `=` assignment re-runs $(shell date ...) on every
# expansion, so two references made in different seconds can name different
# files — confirm each target expands it only once, or switch to `:=`.
DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base
# Current DB version, re-read from the version file on every expansion.
VERSION=$(shell cat ./sqad_db/version)
# Next version number; the $$((...)) arithmetic is evaluated by the shell at
# recipe run time, not by Make.
NEW_VERSION=$$(($(VERSION)+1))
# Name of the updated copy of an existing DB; DB is expected on the command
# line, e.g. `make updates DB=path/to/db` — TODO confirm against callers.
UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")

# Need to specify bash in order for conda activate to work.
SHELL=/bin/bash
@@ -11,32 +12,46 @@ create:
	printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log
	echo $(NEW_VERSION) > ./sqad_db/version
	($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
	echo "$(hostname)" | mail -s "Done sqad_db created" "marek.medved3@gmail.com"
	echo "$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"

# Create a timestamped, annotated copy of an existing database $(DB) (passed
# on the command line; $(UPDB) is derived from it at the top of the file).
# Every step appends to $(UPDB).log and runs inside a named conda env via
# $(CONDA_ACTIVATE).
# NOTE(review): not declared .PHONY — a file named `updates` would silently
# disable this target; confirm intent.
updates:
	UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
# NOTE(review): the recipe line above only sets a *shell* variable in its own
# subshell; it cannot affect the Make variable $(UPDB) used below — presumably
# a leftover, verify it can be removed.
	@echo "creating update $(DB) -> $(UPDB)"
# Copy the DB and its sidecar files, then record this Makefile for provenance.
	cp $(DB) $(UPDB)
	cp $(DB).index $(UPDB).index
	cp $(DB).lock $(UPDB).lock
	cp $(DB).log $(UPDB).log
	cp $(DB).tmp $(UPDB).tmp
	cat ./Makefile >> $(UPDB).log
	# Word Bert embeddings
	printf "add bert embeddings\n=======================\n" >> $(UPDB).log
# NOTE(review): the next two lines run the same script in two different conda
# envs (deeppavlov, then bert) — this looks like an old/new pair left over
# from a merge; confirm that both runs are really intended.
	($(CONDA_ACTIVATE) deeppavlov; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
	($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
	# Contains answer sentece
	printf "Contains answer\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log)
	# Similar sentences
	printf "Similar answers\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log)
# NOTE(review): the next six lines duplicate the parameterised context steps
# below (context_np / context_previous_senteces / context_ner without explicit
# window arguments) — presumably the superseded versions; verify before
# keeping both.
	printf "Contex NP frases\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) 2>> $(UPDB).log)
	printf "Context previous sentece\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) 2>> $(UPDB).log)
	printf "Context wiki entity\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
	# Context NP
	printf "Contex NP phrases context_window 3\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 3 --phr_per_sent "longest" 2>> $(UPDB).log)
	printf "Contex NP phrases context_window 2\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 2 --phr_per_sent "longest" 2>> $(UPDB).log)
	# Context Previous sentences
	printf "Context previous sentece 1\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 1 2>> $(UPDB).log)
	printf "Context previous sentece 2\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 2 2>> $(UPDB).log)
	# Context NER
	printf "Context wiki entity context_window 5\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
	printf "Context wiki entity context_window 2\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
	# Sentece Bert
	printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log)
#	printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
#	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
# NOTE(review): $(hostname) below is an undefined Make variable and expands to
# nothing — probably $$(hostname) was meant so the shell runs `hostname`.
# Also, two completion mails to different addresses follow (plus the CLS step
# duplicated in commented-out form above) — again an old/new merge artefact;
# confirm which recipient/step is current.
	echo "$(hostname)" | mail -s "Done AQA job" "marek.medved3@gmail.com"
	# CLS Bert
	printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
	($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
	echo "$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz"

# Start a ZEO server (network front-end for the ZODB database) with the pinned
# Python 3.6 interpreter and the project's server config; `exec` replaces the
# recipe shell, so this blocks in the foreground until the server exits.
# NOTE(review): not declared .PHONY — a file named `run_ZODB_server` would
# shadow this target; confirm intent.
run_ZODB_server:
	exec "/usr/bin/python3.6" -m "ZEO.runzeo" -C /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/zeo_server.conf
+11 −16
Original line number Diff line number Diff line
@@ -4,29 +4,25 @@ import sys
from sqad_db import SqadDb
import persistent.list
import transaction
from deeppavlov.core.common.file import read_json
from deeppavlov import build_model, configs
from transformers import BertTokenizer, BertConfig, BertModel

# created according http://docs.deeppavlov.ai/en/master/features/models/bert.html
#
class Bert_Embeddings:
    """Per-word BERT embeddings from a locally stored multilingual
    (bg/cs/pl/ru, cased) BERT checkpoint loaded via ``transformers``.

    NOTE(review): the original block was a half-merged diff: ``self.model``
    was assigned twice (the deeppavlov ``build_model(bert_config)`` call
    overwrote the transformers ``BertModel``), and ``word2embedding``
    contained unreachable statements after its ``return``.  This version
    keeps only the transformers/CLS path, matching the commit's intent
    ("Sentence BERT via CLS"); the superseded deeppavlov loader is removed.
    """

    def __init__(self):
        # Load config, weights and tokenizer strictly from the local
        # checkpoint directory (local_files_only=True -> no network access).
        config = BertConfig.from_json_file(
            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json')
        self.model = BertModel.from_pretrained(
            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin',
            config=config, local_files_only=True)
        self.tokenizer = BertTokenizer.from_pretrained(
            '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/')

    def word2embedding(self, word):
        """Return the hidden-state vector at position 0 ([CLS]) for ``word``.

        NOTE(review): "[CLS]" is passed explicitly *and*
        ``add_special_tokens=True`` is set, so the tokenizer may prepend a
        second [CLS] — presumably intentional in the original; confirm.
        """
        input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True)
        output = self.model(input_ids)
        # output[0] is the last hidden state (batch, seq, hidden); take
        # batch 0, sequence position 0.
        return output[0][0][0]


def add_bert_word_embeddings_word(vocabulary, model, db):
@@ -72,7 +68,6 @@ def main():
    vocabulary, _, kb = db.get_dicts()
    try:
        add_bert_word_embeddings_word(vocabulary, model, db)
        # add_bert_word_embeddings_sent(vocabulary, kb, model)
        db.update()
        db._p_changed = True
        transaction.commit()
+1 −1
Original line number Diff line number Diff line
@@ -35,7 +35,7 @@ def find_sentences_containing_answer(db):

def main():
    import argparse
    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
    parser = argparse.ArgumentParser(description='Sentences containing exact answer.')
    parser.add_argument('-u', '--url', type=str,
                        required=False, default='',
                        help='Database URL')
+4 −3
Original line number Diff line number Diff line
@@ -121,10 +121,11 @@ def find_similar_senteces(db, tf_idf):

def main():
    import argparse
    parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
    parser = argparse.ArgumentParser(description='Similar senteces using TF-IDF')
    parser.add_argument('-n', '--number', type=int,
                        required=False, default=100,
                        help='Number of previous sentences as a context. For all similar sentences put "0".')
                        help='Number of previous sentences used for seraching similar senteces.'
                             ' For all similar sentences put "0".')
    parser.add_argument('-u', '--url', type=str,
                        required=False, default='',
                        help='Database URL')
@@ -164,7 +165,7 @@ def main():
            if args.number == 0:
                record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences)
            else:
                record.similar_answers[f'sents_similar_{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
                record.similar_answers[f'sents_similar_w{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
            db._p_changed = True
            transaction.commit()

+4 −4
Original line number Diff line number Diff line
@@ -39,7 +39,7 @@ def ner_phrases(text, context_window, model):
        sent_w = ' '.join([x["word"] for x in sent])

        w_idx = 0
        print(f'sent_w: {sent_w}\n')
        # print(f'sent_w: {sent_w}\n')
        ner_out = model.predict(sent_w)
        ner_out = normalize_out(ner_out, sent)

@@ -109,8 +109,8 @@ def add_ner(db, context_window, model, verbose=False):
                for phrs in phrases[sent_num]:
                    for phr in phrs:
                        print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
            if not sent['ctx'].get(f'ctx_ner_{context_window}'):
                sent['ctx'][f'ctx_ner_{context_window}'] = phrases[sent_num]
            if not sent['ctx'].get(f'ctx_ner_w{context_window}'):
                sent['ctx'][f'ctx_ner_w{context_window}'] = phrases[sent_num]
                db._p_changed = True
                transaction.commit()

@@ -175,7 +175,7 @@ def main():

        add_ner(db, args.context_window, model, args.verbose)

        db.root['__ctx_types__'].append(f'ctx_ner_{args.context_window}')
        db.root['__ctx_types__'].append(f'ctx_ner_w{args.context_window}')
        db.update()
        db._p_changed = True
        transaction.commit()
Loading