Commit 08881ebb authored by Marek Medved

Sentence BERT via CLS plus normalization of names

parent 830c3724
DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base
VERSION=$(shell cat ./sqad_db/version)
NEW_VERSION=$$(($(VERSION)+1))
+ UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
# Need to specify bash in order for conda activate to work.
SHELL=/bin/bash
@@ -11,32 +12,46 @@ create:
printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log
echo $(NEW_VERSION) > ./sqad_db/version
($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log)
- echo "$(hostname)" | mail -s "Done sqad_db created" "marek.medved3@gmail.com"
+ echo "$(hostname)" | mail -s "Done sqad_db created" "xmedved1@fi.muni.cz"
updates:
- UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S")
+ @echo "creating update $(DB) -> $(UPDB)"
cp $(DB) $(UPDB)
cp $(DB).index $(UPDB).index
cp $(DB).lock $(UPDB).lock
+ cp $(DB).log $(UPDB).log
cp $(DB).tmp $(UPDB).tmp
+ cat ./Makefile >> $(UPDB).log
+ # Word Bert embeddings
printf "add bert embeddings\n=======================\n" >> $(UPDB).log
- ($(CONDA_ACTIVATE) deeppavlov; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
+ ($(CONDA_ACTIVATE) bert; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log)
+ # Contains answer sentece
printf "Contains answer\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log)
+ # Similar sentences
printf "Similar answers\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log)
printf "Contex NP frases\n======================\n" >> $(UPDB).log # Context NP
($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) 2>> $(UPDB).log) printf "Contex NP phrases context_window 3\n======================\n" >> $(UPDB).log
printf "Context previous sentece\n======================\n" >> $(UPDB).log ($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 3 --phr_per_sent "longest" 2>> $(UPDB).log)
($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) 2>> $(UPDB).log) printf "Contex NP phrases context_window 2\n======================\n" >> $(UPDB).log
printf "Context wiki entity\n======================\n" >> $(UPDB).log ($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) --context_window 2 --phr_per_sent "longest" 2>> $(UPDB).log)
($(CONDA_ACTIVATE) mypy3; python ./context_ner.py -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log) # Context Previous sentences
printf "Context previous sentece 1\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 1 2>> $(UPDB).log)
printf "Context previous sentece 2\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) --number 2 2>> $(UPDB).log)
# Context NER
printf "Context wiki entity context_window 5\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 5 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
printf "Context wiki entity context_window 2\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) mypy3; python ./context_ner.py --context_window 2 -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log)
# Sentece Bert
printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log
($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log) ($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log)
# printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log # CLS Bert
# ($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log) printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log
echo "$(hostname)" | mail -s "Done AQA job" "marek.medved3@gmail.com" ($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log)
echo "$(hostname)" | mail -s "Done AQA job" "xmedved1@fi.muni.cz"
run_ZODB_server:
exec "/usr/bin/python3.6" -m "ZEO.runzeo" -C /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/zeo_server.conf
...
@@ -4,29 +4,25 @@ import sys
from sqad_db import SqadDb
import persistent.list
import transaction
- from deeppavlov.core.common.file import read_json
- from deeppavlov import build_model, configs
+ from transformers import BertTokenizer, BertConfig, BertModel
# created according http://docs.deeppavlov.ai/en/master/features/models/bert.html
#
class Bert_Embeddings:
def __init__(self):
- bert_config = read_json(configs.embedder.bert_embedder)
- bert_config['metadata']['variables'][
- 'BERT_PATH'] = '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt'
- # tokens, token_embs, subtokens, subtoken_embs, sent_max_embs, sent_mean_embs, bert_pooler_outputs = self.model
- self.model = build_model(bert_config)
+ config = BertConfig.from_json_file(
+ '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/bert_config.json')
+ self.model = BertModel.from_pretrained(
+ '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/pytorch_model.bin',
+ config=config, local_files_only=True)
+ self.tokenizer = BertTokenizer.from_pretrained(
+ '/nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/bert_embeder_models/bg_cs_pl_ru_cased_L-12_H-768_A-12_pt/')
def word2embedding(self, word):
- tokens, token_embs, _, _, _, _, _ = self.model(word)
- return token_embs[0][0]
+ input_ids = self.tokenizer.encode(["[CLS]", word], return_tensors="pt", add_special_tokens=True)
+ output = self.model(input_ids)
+ return output[0][0][0]
- # def sent2embedding(self, sent):
- #     sent = ''
- #     tokens, _, _, _, _, sent_mean_embs, _ = self.model(sent)
- #     pass
- #
def add_bert_word_embeddings_word(vocabulary, model, db):
@@ -72,7 +68,6 @@ def main():
vocabulary, _, kb = db.get_dicts()
try:
add_bert_word_embeddings_word(vocabulary, model, db)
- # add_bert_word_embeddings_sent(vocabulary, kb, model)
db.update()
db._p_changed = True
transaction.commit()
...
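Note on the change above: the rewritten Bert_Embeddings class drops the DeepPavlov bert_embedder and loads the local bg_cs_pl_ru Slavic-BERT files directly through Hugging Face transformers. The value word2embedding now returns, output[0][0][0], is the last-hidden-state vector at position 0 of the single input sequence, i.e. the slot that add_special_tokens=True fills with [CLS]. The sketch below reproduces that pattern in isolation; it substitutes the public bert-base-multilingual-cased checkpoint for the local model path, so treat the model name purely as a stand-in, not as the repository's configuration.

# Minimal sketch of the CLS-vector extraction, assuming a public checkpoint
# as a stand-in for the local Slavic-BERT files referenced in the commit.
import torch
from transformers import BertModel, BertTokenizer

MODEL = "bert-base-multilingual-cased"  # assumption: placeholder model name
tokenizer = BertTokenizer.from_pretrained(MODEL)
model = BertModel.from_pretrained(MODEL)
model.eval()

def word2embedding(word):
    # add_special_tokens=True prepends [CLS] and appends [SEP]
    input_ids = tokenizer.encode(word, return_tensors="pt", add_special_tokens=True)
    with torch.no_grad():
        output = model(input_ids)
    # output[0] is last_hidden_state with shape (1, seq_len, 768);
    # [0][0] selects the vector at position 0, i.e. the [CLS] slot.
    return output[0][0][0]

vec = word2embedding("Praha")
print(vec.shape)  # torch.Size([768])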
@@ -35,7 +35,7 @@ def find_sentences_containing_answer(db):
def main():
import argparse
- parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+ parser = argparse.ArgumentParser(description='Sentences containing exact answer.')
parser.add_argument('-u', '--url', type=str,
required=False, default='',
help='Database URL')
...
@@ -121,10 +121,11 @@ def find_similar_senteces(db, tf_idf):
def main():
import argparse
- parser = argparse.ArgumentParser(description='Add noun phrases as context to sentences')
+ parser = argparse.ArgumentParser(description='Similar senteces using TF-IDF')
parser.add_argument('-n', '--number', type=int,
required=False, default=100,
- help='Number of previous sentences as a context. For all similar sentences put "0".')
+ help='Number of previous sentences used for seraching similar senteces.'
+ ' For all similar sentences put "0".')
parser.add_argument('-u', '--url', type=str,
required=False, default='',
help='Database URL')
@@ -164,7 +165,7 @@ def main():
if args.number == 0:
record.similar_answers[f'sents_similar'] = persistent.list.PersistentList(sorted_sim_sentences)
else:
- record.similar_answers[f'sents_similar_{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
+ record.similar_answers[f'sents_similar_w{args.number}'] = persistent.list.PersistentList(sorted_sim_sentences[:args.number])
db._p_changed = True
transaction.commit()
...
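Note: in this script the only functional change is the storage key, sents_similar_{N} becoming sents_similar_w{N}, holding the N most similar sentences found via TF-IDF (with -n 0 keeping them all, as the Makefile now calls it). The snippet below is an illustrative re-statement of that ranking-and-truncation step with scikit-learn on plain strings; it is not the repository's implementation, which operates on the ZODB records and its own TF-IDF model.

# Illustrative only: rank sentences by TF-IDF cosine similarity and keep the
# top N, mirroring the sents_similar_w{N} / sents_similar keys written above.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def similar_sentences(query, sentences, number=0):
    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform([query] + sentences)   # row 0 = query
    scores = cosine_similarity(matrix[0], matrix[1:])[0]     # similarity of query to each sentence
    ranked = sorted(zip(sentences, scores), key=lambda x: x[1], reverse=True)
    return ranked if number == 0 else ranked[:number]        # number=0 keeps all, as in -n 0

print(similar_sentences("Kdy se narodil Karel IV.?",
                        ["Karel IV. se narodil roku 1316.",
                         "Praha je hlavní město České republiky."],
                        number=1))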
@@ -39,7 +39,7 @@ def ner_phrases(text, context_window, model):
sent_w = ' '.join([x["word"] for x in sent])
w_idx = 0
- print(f'sent_w: {sent_w}\n')
+ # print(f'sent_w: {sent_w}\n')
ner_out = model.predict(sent_w)
ner_out = normalize_out(ner_out, sent)
@@ -109,8 +109,8 @@ def add_ner(db, context_window, model, verbose=False):
for phrs in phrases[sent_num]:
for phr in phrs:
print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
- if not sent['ctx'].get(f'ctx_ner_{context_window}'):
- sent['ctx'][f'ctx_ner_{context_window}'] = phrases[sent_num]
+ if not sent['ctx'].get(f'ctx_ner_w{context_window}'):
+ sent['ctx'][f'ctx_ner_w{context_window}'] = phrases[sent_num]
db._p_changed = True
transaction.commit()
@@ -175,7 +175,7 @@ def main():
add_ner(db, args.context_window, model, args.verbose)
- db.root['__ctx_types__'].append(f'ctx_ner_{args.context_window}')
+ db.root['__ctx_types__'].append(f'ctx_ner_w{args.context_window}')
db.update()
db._p_changed = True
transaction.commit()
...
@@ -37,8 +37,27 @@ class SetInterface:
return s.get_marx_phrases_vert(filter_phr=True)
# ==================================================
- def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v):
+ def filter_longest(phrases):
+ """
+ Filter only the longest phrases. No intersection between results
+ :param phrases: list of phases
+ :return:
+ """
+ to_remove_idx = []
+ for idx_i, i in enumerate(phrases):
+ for idx_j, j in enumerate(phrases):
+ if idx_i != idx_j:
+ if i in j:
+ to_remove_idx.append(idx_i)
+ result = []
+ for idx, x in enumerate(phrases):
+ if idx not in to_remove_idx:
+ result.append(x)
+ return result
+ def name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v):
set_parser = SetInterface("/nlp/projekty/set/set/grammar.set")
text_context = persistent.list.PersistentList()
# Read file and create phrases for all sentences
@@ -62,15 +81,17 @@ def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
wid = word2id(vocabulary, word, lemma, tag, w2v)
phr.append(wid)
phrases.append(phr)
- phrases_per_sentence.append(phrases)
+ if phr_per_sent == 'longest':
+ phrases_per_sentence.append(filter_longest(phrases))
+ else:
+ phrases_per_sentence.append(phrases)
# Crating context according to args.context_length and args.number
for curr_sent_pos in range(len(phrases_per_sentence)):
context_phrases = persistent.list.PersistentList()
context_position = curr_sent_pos - 1
while (context_position >= 0) and (curr_sent_pos - context_position <= context_window):
- context_phrases.append(phrases_per_sentence[context_position][:num_phr_per_sent])
+ context_phrases.append(phrases_per_sentence[context_position])
context_position -= 1
# Title as a context for first sentence in document
@@ -88,8 +109,8 @@ def name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
return text_context
- def get_context(text, title, vocabulary, context_window, num_phr_per_sent, w2v):
- nps = name_phrases(text, title, vocabulary, context_window, num_phr_per_sent, w2v)
+ def get_context(text, title, vocabulary, context_window, phr_per_sent, w2v):
+ nps = name_phrases(text, title, vocabulary, context_window, phr_per_sent, w2v)
return nps
@@ -117,7 +138,7 @@ def get_title(text_by_url, vocabulary):
return vert
- def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
+ def add_np_phrases(db, context_window, phr_per_sent, w2v, verbose=False):
vocabulary, qa_type_dict, kb = db.get_dicts()
for url, text in kb.url2doc.items():
if verbose:
@@ -126,7 +147,7 @@ def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
text_vert = get_text_vert(text['text'], vocabulary)
phrases = get_context(text_vert, text_title_vert, vocabulary,
- context_window, num_phr_per_sent, w2v)
+ context_window, phr_per_sent, w2v)
for sent_num, sent in enumerate(text['text']):
if verbose:
@@ -134,8 +155,8 @@ def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
for phrs in phrases[sent_num]:
for phr in phrs:
print(f'\t\tp:{" ".join([id2word(vocabulary, x)["word"] for x in phr])}')
- if not sent['ctx'].get(f'name_phrs_w{context_window}_n{num_phr_per_sent}'):
- sent['ctx'][f'name_phrs_w{context_window}_n{num_phr_per_sent}'] = phrases[sent_num]
+ if not sent['ctx'].get(f'name_phrs_w{context_window}_t{phr_per_sent}'):
+ sent['ctx'][f'name_phrs_w{context_window}_t{phr_per_sent}'] = phrases[sent_num]
db._p_changed = True
transaction.commit()
@@ -168,9 +189,9 @@ def main():
parser.add_argument('--context_window', type=int,
required=False, default=5,
help='How many sentences account for context.')
- parser.add_argument('--num_phr_per_sent', type=int,
- required=False, default=5,
- help='How many phrases are taken form each sentence within context window.')
+ parser.add_argument('--phr_per_sent', type=str,
+ required=False, default='longest',
+ help='Category of phreses as a context. Available: all, longest')
parser.add_argument('--test', action='store_true',
required=False, default=False,
help='Dont load vectors - for testing purposes.')
@@ -199,9 +220,9 @@ def main():
sys.exit()
try:
- add_np_phrases(db, args.context_window, args.num_phr_per_sent, w2v, args.verbose)
- db.root['__ctx_types__'].append(f'name_phrs_w{args.context_window}_n{args.num_phr_per_sent}')
+ add_np_phrases(db, args.context_window, args.phr_per_sent, w2v, args.verbose)
+ db.root['__ctx_types__'].append(f'name_phrs_w{args.context_window}_t{args.phr_per_sent}')
db.update()
db._p_changed = True
transaction.commit()
...
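Note: with the new --phr_per_sent "longest" option (the Makefile passes it for both context windows), the phrases produced by the SET parser are routed through filter_longest before they become context. The committed helper tests containment with Python's in operator; the sketch below spells the same idea out as an explicit contiguous-subsequence check over token-id lists, which is one reading of the docstring's "no intersection between results" — an assumption for illustration, not a copy of the repository's code.

# Sketch of "keep only the longest phrases" over phrases represented as lists
# of vocabulary ids, as in name_phrases. The subsequence test is an assumption
# about the intended semantics, not the committed filter_longest implementation.
def is_subphrase(short, long):
    """True if `short` occurs as a contiguous run inside `long`."""
    n, m = len(short), len(long)
    return n < m and any(long[i:i + n] == short for i in range(m - n + 1))

def keep_longest(phrases):
    return [p for p in phrases
            if not any(is_subphrase(p, other) for other in phrases)]

# Toy word-id phrases: [5, 9] is contained in [5, 9, 12], so only the longer one survives.
print(keep_longest([[5, 9], [5, 9, 12], [3, 7]]))  # [[5, 9, 12], [3, 7]]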
@@ -15,15 +15,15 @@ def add_ctx(db, number, verbose=False):
if verbose:
print(f"s:{' '.join([id2word(vocabulary, x)['word'] for x in sent['sent']])}")
- if not sent['ctx'].get(f'prev_sent_n{number}'):
+ if not sent['ctx'].get(f'prev_sent_w{number}'):
if sent_num == 0:
if verbose:
print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['title'][0]])}")
- sent['ctx'][f'prev_sent_n{number}'] = persistent.list.PersistentList([text['title'][0]])
+ sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList([text['title'][0]])
else:
if verbose:
print(f"\tc:{' '.join([id2word(vocabulary, x)['word'] for x in text['text'][sent_num - 1]['sent']])}")
- sent['ctx'][f'prev_sent_n{number}'] = persistent.list.PersistentList([text['text'][sent_num - 1]['sent']])
+ sent['ctx'][f'prev_sent_w{number}'] = persistent.list.PersistentList([text['text'][sent_num - 1]['sent']])
db._p_changed = True
transaction.commit()
@@ -77,7 +77,7 @@ def main():
try:
add_ctx(db, args.number, args.verbose)
- db.root['__ctx_types__'].append(f'prev_sent_n{args.number}')
+ db.root['__ctx_types__'].append(f'prev_sent_w{args.number}')
db.update()
db._p_changed = True
transaction.commit()
...
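Note: the second half of the commit title, "normalization of names", is visible across the last four diffs: every stored key now carries a uniform suffix — ctx_ner_w{window}, prev_sent_w{number}, name_phrs_w{window}_t{phr_per_sent}, sents_similar_w{number} — and the sentence-context variants are registered in db.root['__ctx_types__']. A hypothetical consumer-side helper (not part of the repository) that rebuilds these keys and reads a sentence's context might look like this:

# Hypothetical sketch of the normalized context-key scheme introduced here.
def ctx_key(kind, window=None, phr_per_sent=None):
    """Reproduce the normalized key names: *_w{window} plus optional _t{category}."""
    key = kind
    if window is not None:
        key += f'_w{window}'
    if phr_per_sent is not None:
        key += f'_t{phr_per_sent}'
    return key

assert ctx_key('ctx_ner', 5) == 'ctx_ner_w5'
assert ctx_key('prev_sent', 1) == 'prev_sent_w1'
assert ctx_key('name_phrs', 3, 'longest') == 'name_phrs_w3_tlongest'

def sentence_context(sent, kind, window, phr_per_sent=None):
    # sent is a sentence record with a 'ctx' mapping, as used throughout the diff
    return sent['ctx'].get(ctx_key(kind, window, phr_per_sent))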