diff --git a/Makefile b/Makefile index 5e1d37602a71a4c5da9d54c56e07f92cc9b1659c..4106f794a9eb40741f841d0ffd9781a8eba8f769 100644 --- a/Makefile +++ b/Makefile @@ -1,21 +1,42 @@ -DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S") -#VERSION=$(file < ./sqad_db/version) -VERSION=1 +DB_NAME=sqad_db/devel/sqad_v3_$(shell date +"%d-%m-%Y_%H-%M-%S")_base +VERSION=$(shell cat ./sqad_db/version) NEW_VERSION=$$(($(VERSION)+1)) + +# Need to specify bash in order for conda activate to work. +SHELL=/bin/bash +# Note that the extra activate is needed to ensure that the activate floats env to the front of PATH +CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate + create: - printf "SQAD to DB\n=======================" >> $(DB_NAME).log + printf "SQAD to DB\n=======================\n" >> $(DB_NAME).log echo $(NEW_VERSION) > ./sqad_db/version - ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log - printf "Contains answer\n======================" >> $(DB_NAME).log - ./add_contains_answer_sentences.py -d $(DB_NAME) 2>> $(DB_NAME).log - printf "Similar answers\n======================" >> $(DB_NAME).log - ./add_similar_senteces.py -d $(DB_NAME) -n 0 2>> $(DB_NAME).log - printf "Contex NP frases\n======================" >> $(DB_NAME).log - ./context_np.py -d $(DB_NAME) 2>> $(DB_NAME).log - printf "Context previous sentece\n======================" >> $(DB_NAME).log - ./context_previous_senteces.py -d $(DB_NAME) 2>> $(DB_NAME).log - printf "Context wiki entity\n======================" >> $(DB_NAME).log - #conda activate mypy3; python ./context_ner.py -d $(DB_NAME) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(DB_NAME).log + ($(CONDA_ACTIVATE) base; ./sqad2database.py -p /nlp/projekty/sqad/sqad_v3/data -n $(DB_NAME) -v $(NEW_VERSION) 2>> $(DB_NAME).log) + echo "$(hostname)" | mail -s "Done sqad_db created" "marek.medved3@gmail.com" + +updates: + UPDB=$(DB)_with_updates_$(shell date +"%d-%m-%Y_%H-%M-%S") + cp $(DB) $(UPDB) + cp $(DB).index $(UPDB).index + cp $(DB).lock $(UPDB).lock + cp $(DB).log $(UPDB).log + cp $(DB).tmp $(UPDB).tmp + printf "add bert embeddings\n=======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) deeppavlov; ./add_bert_emberdings.py -d $(UPDB) 2>> $(UPDB).log) + printf "Contains answer\n======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) base; ./add_contains_answer_sentences.py -d $(UPDB) 2>> $(UPDB).log) + printf "Similar answers\n======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) base; ./add_similar_senteces.py -d $(UPDB) -n 0 2>> $(UPDB).log) + printf "Contex NP frases\n======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) base; ./context_np.py -d $(UPDB) 2>> $(UPDB).log) + printf "Context previous sentece\n======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) base; ./context_previous_senteces.py -d $(UPDB) 2>> $(UPDB).log) + printf "Context wiki entity\n======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) mypy3; python ./context_ner.py -d $(UPDB) -m named_entity_recognition/BERT-NER/ner_model_cz/ 2>> $(UPDB).log) + printf "Sentece to sentece bert embedding\n======================\n" >> $(UPDB).log + ($(CONDA_ACTIVATE) base; python ./sentece2s_bert.py -d $(UPDB) 2>> $(UPDB).log) +# printf "Sentece to cls bert embedding\n======================\n" >> $(UPDB).log +# ($(CONDA_ACTIVATE) bert; python ./sentece2cls_bert.py -d $(UPDB) 2>> $(UPDB).log) + echo "$(hostname)" | mail -s "Done AQA job" "marek.medved3@gmail.com" run_ZODB_server: exec "/usr/bin/python3.6" -m "ZEO.runzeo" -C /nlp/projekty/question_answering/AQA_v2/sqad_tools/sqad2database/zeo_server.conf diff --git a/add_bert_emberdings.py b/add_bert_emberdings.py index 28a41d9d2765677a1de465c4dac8a7650435f3af..9c75d8a1da55f9797398ff62ac8efee65a064c97 100755 --- a/add_bert_emberdings.py +++ b/add_bert_emberdings.py @@ -1,4 +1,4 @@ -#! /usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 import sys from sqad_db import SqadDb @@ -7,6 +7,8 @@ import transaction from deeppavlov.core.common.file import read_json from deeppavlov import build_model, configs +# created according http://docs.deeppavlov.ai/en/master/features/models/bert.html +# class Bert_Embeddings: def __init__(self): bert_config = read_json(configs.embedder.bert_embedder) @@ -28,14 +30,14 @@ class Bert_Embeddings: def add_bert_word_embeddings_word(vocabulary, model, db): - vocab_size = len(vocabulary.id2wlt.keys()) + # vocab_size = len(vocabulary.id2wlt.keys()) progress = 0 for w_id, value in vocabulary.id2wlt.items(): progress += 1 word = value['word'] bert_embedding = model.word2embedding(word) - sys.stderr.write(f'{progress}/{vocab_size}\r') - vocabulary.vectors[w_id].append(persistent.list.PersistentList(bert_embedding)) + # sys.stderr.write(f'{progress}/{vocab_size}\n') + vocabulary.vectors[w_id]['v_bert'] = persistent.list.PersistentList(bert_embedding) db._p_changed = True transaction.commit() diff --git a/add_contains_answer_sentences.py b/add_contains_answer_sentences.py index 7c59113e8905df4dee7c295c4b8e196532f2859b..edf52af8c91d58019b8cc7213364398a535a2477 100755 --- a/add_contains_answer_sentences.py +++ b/add_contains_answer_sentences.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 from sqad_db import SqadDb import persistent.list diff --git a/add_similar_senteces.py b/add_similar_senteces.py index fd9ec611eee45b7e08caae57599f9a11507f6d8c..ffc3a4180b4fdfc6f60fee9ae97af88d75647fbc 100755 --- a/add_similar_senteces.py +++ b/add_similar_senteces.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 from sqad_db import SqadDb from sqad_db import id2word @@ -83,7 +83,7 @@ def find_similar_senteces(db, tf_idf): for rid in db.get_all_records(): similar_senteces = [] record = db.get_record(rid) - sys.stderr.write(f'{rid}\n') + # sys.stderr.write(f'{rid}\n') for answer_selection_sent in get_content(record.answer_selection, vocabulary): diff --git a/context_ner.py b/context_ner.py index 54917c673ab3f8d9330dc36fa8ea9714cb1e4680..8d7b615aa0ddccb7e096485df6535619fe3c12e8 100755 --- a/context_ner.py +++ b/context_ner.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python3 # coding: utf-8 import re from sqad_db import SqadDb diff --git a/context_np.py b/context_np.py index 893d505f8f800ada953e9dae2d7ec936c32ceec7..1555463a1891a8d83933265e328bbc0776e9cea9 100755 --- a/context_np.py +++ b/context_np.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 import os import re diff --git a/context_previous_senteces.py b/context_previous_senteces.py index c0fb8b419f3e6d0cf954d1bfd2086d87d4f27837..4ec0b227dce9cd0e91e8643083260bafcd433cf7 100755 --- a/context_previous_senteces.py +++ b/context_previous_senteces.py @@ -1,4 +1,4 @@ -#!/usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 import sys from sqad_db import SqadDb diff --git a/get_vector.py b/get_vector.py index e9861f2df132bc5d063239f744eeb9697a8bceb9..27795f91796190b1a042c1ccd73b3a1bb23dacdf 100644 --- a/get_vector.py +++ b/get_vector.py @@ -1,4 +1,6 @@ -#! /usr/bin/python3 + +#!/usr/bin/env python3 +# coding: utf-8 import fasttext import os import sys diff --git a/query_database.py b/query_database.py index 84f24339a4e7b0a569f68b6522392a1c0ce2ef53..02d1fa7993bf3d1f2ffb49b5ca07e9fbc0e0e706 100755 --- a/query_database.py +++ b/query_database.py @@ -1,4 +1,4 @@ -#! /usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 from sqad_db import SqadDb from sqad_db import id2word diff --git a/sentece2s_bert.py b/sentece2s_bert.py new file mode 100755 index 0000000000000000000000000000000000000000..38a2d42601c1f9f4849597cfb561e076fb16d66d --- /dev/null +++ b/sentece2s_bert.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +# created according: https://github.com/UKPLab/sentence-transformers +# models https://www.sbert.net/docs/pretrained_models.html +import sys +from sentence_transformers import SentenceTransformer +import persistent.list +import transaction +from sqad_db import SqadDb +from sqad_db import id2word + + +def add_vector(db, model, verbose=False): + vocabulary, _, kb = db.get_dicts() + + for url, text in kb.url2doc.items(): + for sentece in text['text']: + sent = [] + for w_id in sentece['sent']: + sent.append(id2word(vocabulary, w_id, parts='w')['word']) + sent_v = model.encode([' '.join(sent)]) + sentece['sbert'] = persistent.list.PersistentList(sent_v) + if verbose: + print(f"{' '.join(sent)}\t{sent_v}") + db._p_changed = True + transaction.commit() + + +def main(): + import argparse + parser = argparse.ArgumentParser(description='Compute S-BERT sentence embeddings') + parser.add_argument('-u', '--url', type=str, + required=False, default='', + help='Database URL') + parser.add_argument('-p', '--port', type=int, + required=False, default=None, + help='Server port') + parser.add_argument('-d', '--db_path', type=str, + required=False, default='', + help='Database path') + parser.add_argument('-v', '--verbose', action='store_true', + required=False, + help='Verbose mode') + + args = parser.parse_args() + + if (args.url and args.port) or args.db_path: + if args.url and args.port: + db = SqadDb(url=args.url, port=args.port) + elif args.db_path: + db = SqadDb(file_name=args.db_path) + else: + sys.stderr.write('Please specify --db_path or (--port and --url)') + sys.exit() + + model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1') + try: + add_vector(db, model, verbose=args.verbose) + + db.update() + db._p_changed = True + transaction.commit() + db.close() + except KeyboardInterrupt: + db.close() + sys.exit() + +if __name__ == "__main__": + main() diff --git a/sqad2database.py b/sqad2database.py index 38a245be3d9752aea3fe0a5595762f5e09d68d9a..5832a56c3823126c6e598885c7c827d126815a1e 100755 --- a/sqad2database.py +++ b/sqad2database.py @@ -1,4 +1,4 @@ -#! /usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 import re import os @@ -149,7 +149,7 @@ def main(): answer_selection_content = '' title = '' rec_id = rec_id_re.match(root.split('/')[-1]).group(1) - sys.stderr.write(f'{rec_id}({counter})\n') + # sys.stderr.write(f'{rec_id}({counter})\n') record = Record(rec_id) for file_name in files: if file_name in ['01question.vert', '03text.vert', diff --git a/sqad_db.py b/sqad_db.py index da6ba40e63054e2f0c8c5b4a5959484694d71b19..9d83a167cb532faca9af80aa570ab1e0b4ff969a 100755 --- a/sqad_db.py +++ b/sqad_db.py @@ -1,4 +1,4 @@ -#! /usr/bin/python3 +#!/usr/bin/env python3 # coding: utf-8 import ZODB import ZODB.FileStorage @@ -37,13 +37,13 @@ def id2word(vocabulary, key, parts='', preloaded=False): if 't' in word_parts or not parts: result['tag'] = vocabulary['id2wlt'][key]['tag'] if 'v100' in word_parts or not parts: - result['v100'] = vocabulary['vectors'][key][0] + result['v100'] = vocabulary['vectors'][key]['v100'] if 'v300' in word_parts or not parts: - result['v300'] = vocabulary['vectors'][key][1] + result['v300'] = vocabulary['vectors'][key]['v300'] if 'v500' in word_parts or not parts: - result['v500'] = vocabulary['vectors'][key][2] + result['v500'] = vocabulary['vectors'][key]['v500'] if 'v_bert' in word_parts or not parts: - result['v_bert'] = vocabulary['vectors'][key][3] + result['v_bert'] = vocabulary['vectors'][key]['v_bert'] if 'id' in word_parts or not parts: result['id'] = key else: @@ -54,13 +54,13 @@ def id2word(vocabulary, key, parts='', preloaded=False): if 't' in word_parts or not parts: result['tag'] = vocabulary.id2wlt[key]['tag'] if 'v100' in word_parts or not parts: - result['v100'] = vocabulary.vectors[key][0] + result['v100'] = vocabulary.vectors[key]['v100'] if 'v300' in word_parts or not parts: - result['v300'] = vocabulary.vectors[key][1] + result['v300'] = vocabulary.vectors[key]['v300'] if 'v500' in word_parts or not parts: - result['v500'] = vocabulary.vectors[key][2] + result['v500'] = vocabulary.vectors[key]['v500'] if 'v_bert' in word_parts or not parts: - result['v_bert'] = vocabulary.vectors[key][3] + result['v_bert'] = vocabulary.vectors[key]['v_bert'] if 'id' in word_parts or not parts: result['id'] = key return result @@ -73,15 +73,15 @@ class W2V: self.w2v = Word2vec() def get_vect(self, word): - result = persistent.list.PersistentList() + result = BTree() if self.test: - result.append(None) - result.append(None) - result.append(None) + result['v100'] = None + result['v300'] = None + result['v500'] = None else: - result.append(persistent.list.PersistentList(self.w2v.get_vector(word, 100))) - result.append(persistent.list.PersistentList(self.w2v.get_vector(word, 300))) - result.append(persistent.list.PersistentList(self.w2v.get_vector(word, 500))) + result['v100'] = persistent.list.PersistentList(self.w2v.get_vector(word, 100)) + result['v300'] = persistent.list.PersistentList(self.w2v.get_vector(word, 300)) + result['v500'] = persistent.list.PersistentList(self.w2v.get_vector(word, 500)) return result def add_vector(self, vocabulary, key, word): @@ -93,7 +93,7 @@ class Vocabulary(Persistent): self.id2wlt = BTree() # key: id, value: word, lemma, tag self.wlt2id = BTree() # key: word, value: id self.key = 0 - self.vectors = BTree() # key: word_id, value: v100, v300, v300 + self.vectors = BTree() # key: word_id, value: v100, v300, v500 def new_id(self): self.key += 1