diff --git a/context_ner.py b/context_ner.py
index 33dba8b609c864ee74384c237117ae38e270686a..a43ad8586297410584a9071127be6c3c1a103c71 100755
--- a/context_ner.py
+++ b/context_ner.py
@@ -117,7 +117,7 @@ def add_ner(db, context_window, model, verbose=False):
         transaction.commit()
 
 
-def get_ctx(phrs, vocabulary, part):
+def get_ctx(phrs, vocabulary, part='', preloaded=False):
     sentence_phrases = []
     for sent_phr in phrs:
         phr_per_sent = []
@@ -125,9 +125,9 @@ def get_ctx(phrs, vocabulary, part):
             p_content = []
             for w_id_cx in p:
                 if part:
-                    p_content.append(id2word(vocabulary, w_id_cx, part))
+                    p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded))
                 else:
-                    p_content.append(id2word(vocabulary, w_id_cx))
+                    p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded))
             phr_per_sent.append(p_content)
         sentence_phrases.append(phr_per_sent)
     return sentence_phrases
diff --git a/context_np.py b/context_np.py
index da676a4b8be7595a32d11b9b1257e57c97119391..e3e66fcf13e5d461f9ad33ac92369ded58b48a34 100755
--- a/context_np.py
+++ b/context_np.py
@@ -140,7 +140,7 @@ def add_np_phrases(db, context_window, num_phr_per_sent, w2v, verbose=False):
         transaction.commit()
 
 
-def get_ctx(phrs, vocabulary, part):
+def get_ctx(phrs, vocabulary, part='', preloaded=False):
     sentence_phrases = []
     for sent_phr in phrs:
         phr_per_sent = []
@@ -148,9 +148,9 @@ def get_ctx(phrs, vocabulary, part):
             p_content = []
             for w_id_cx in p:
                 if part:
-                    p_content.append(id2word(vocabulary, w_id_cx, part))
+                    p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded))
                 else:
-                    p_content.append(id2word(vocabulary, w_id_cx))
+                    p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded))
             phr_per_sent.append(p_content)
         sentence_phrases.append(phr_per_sent)
     return sentence_phrases
diff --git a/context_previous_senteces.py b/context_previous_senteces.py
index ca1e91d4f7113774501a47d9d439e1815a9e9077..c5389fd79781584845ebd45ee793d63588391876 100755
--- a/context_previous_senteces.py
+++ b/context_previous_senteces.py
@@ -26,15 +26,15 @@ def add_ctx(db, number, verbose=False):
     db._p_changed = True
     transaction.commit()
 
 
-def get_ctx(phrs, vocabulary, part):
+def get_ctx(phrs, vocabulary, part='', preloaded=False):
     content = []
     for p in phrs:
         p_content = []
         for w_id_cx in p:
             if part:
-                p_content.append(id2word(vocabulary, w_id_cx, part))
+                p_content.append(id2word(vocabulary, w_id_cx, parts=part, preloaded=preloaded))
             else:
-                p_content.append(id2word(vocabulary, w_id_cx))
+                p_content.append(id2word(vocabulary, w_id_cx, preloaded=preloaded))
         content.append(p_content)
     return content
diff --git a/query_database.py b/query_database.py
index 973c9a0754241207b6e97aa79dfffc29d9f83a5e..b63a131e59a92d4ec0228231c3006fdfb18b5cc1 100755
--- a/query_database.py
+++ b/query_database.py
@@ -10,7 +10,7 @@ import context_previous_senteces
 import context_ner
 
 
-def get_ctx(data, vocabulary, part='', context_type=''):
+def get_ctx(data, vocabulary, part='', context_type='', preloaded=False):
     sentence_phrases = {}
     if context_type:
         required_ctx = context_type.strip().split(';')
@@ -19,44 +19,48 @@
     for ctx_type, phrs in data.items():
         if ctx_type in required_ctx or 'all' in required_ctx:
             if ctx_type.startswith('name_phrs'):
-                sentence_phrases[ctx_type] = context_np.get_ctx(phrs, vocabulary, part)
+                sentence_phrases[ctx_type] = context_np.get_ctx(phrs, vocabulary, part=part, preloaded=preloaded)
             elif ctx_type.startswith('ctx_ner'):
-                sentence_phrases[ctx_type] = context_ner.get_ctx(phrs, vocabulary, part)
+                sentence_phrases[ctx_type] = context_ner.get_ctx(phrs, vocabulary, part=part, preloaded=preloaded)
             else:
-                sentence_phrases[ctx_type] = context_previous_senteces.get_ctx(phrs, vocabulary, part)
+                sentence_phrases[ctx_type] = context_previous_senteces.get_ctx(phrs, vocabulary, part=part,
+                                                                               preloaded=preloaded)
     return sentence_phrases
 
 
-def get_senence(data, vocabulary, part=''):
+def get_senence(data, vocabulary, part='', preloaded=False):
     sent = []
     for w_id in data:
         if part:
-            sent.append(id2word(vocabulary, w_id, part))
+            sent.append(id2word(vocabulary, w_id, parts=part, preloaded=preloaded))
         else:
-            sent.append(id2word(vocabulary, w_id))
+            sent.append(id2word(vocabulary, w_id, preloaded=preloaded))
     return sent
 
 
-def get_content(data, vocabulary, part=''):
+def get_content(data, vocabulary, part='', preloaded=False):
     result = []
     for sentence in data:
-        result.append(get_senence(sentence, vocabulary, part))
+        result.append(get_senence(sentence, vocabulary, part=part, preloaded=preloaded))
     return result
 
 
-def get_content_ctx(url, kb, vocabulary, part='', context_type=''):
+def get_content_ctx(url, kb, vocabulary, part='', context_type='', preloaded=False):
     result = []
     for sentence in kb.url2doc.get(url)['text']:
-        result.append({'sent': get_senence(sentence['sent'], vocabulary, part),
-                       'ctx': get_ctx(sentence['ctx'], vocabulary, part, context_type)})
+        result.append({'sent': get_senence(sentence['sent'], vocabulary, part=part, preloaded=preloaded),
+                       'ctx': get_ctx(sentence['ctx'], vocabulary, part=part, context_type=context_type,
+                                      preloaded=preloaded)})
     return result
 
 
-def get_record(db, record_id, word_parts='', context_type=''):
+def get_record(db, record_id, word_parts='', context_type='', vocabulary=None, qa_type_dict=None, kb=None, preloaded=False):
     record = db.get_record(record_id)
-    vocabulary, qa_type_dict, kb = db.get_dicts()
+    if not vocabulary and not qa_type_dict and not kb:
+        print('Not preloaded data')
+        vocabulary, qa_type_dict, kb = db.get_dicts()
     """
     result data structure
     {rec_id: str,
@@ -74,13 +78,15 @@
     data['rec_id'] = record.rec_id
     data['q_type'] = id2qt(qa_type_dict, record.q_type)
     data['a_type'] = id2qt(qa_type_dict, record.a_type)
-    data['question'] = get_content(record.question, vocabulary, word_parts)
-    data['a_sel'] = get_content(record.answer_selection, vocabulary, word_parts)
+    data['question'] = get_content(record.question, vocabulary, part=word_parts, preloaded=preloaded)
+    data['a_sel'] = get_content(record.answer_selection, vocabulary, part=word_parts, preloaded=preloaded)
     data['a_sel_pos'] = record.text_answer_position
-    data['a_ext'] = get_content(record.answer_extraction, vocabulary, word_parts)
+    data['a_ext'] = get_content(record.answer_extraction, vocabulary, part=word_parts, preloaded=preloaded)
     data['similar_answers'] = record.similar_answers
-    data['text_title'] = get_content(kb.url2doc.get(record.text)["title"], vocabulary, word_parts)
-    data['text'] = get_content_ctx(record.text, kb, vocabulary, word_parts, context_type)
+    data['text_title'] = get_content(kb.url2doc.get(record.text)["title"], vocabulary, part=word_parts,
+                                     preloaded=preloaded)
+    data['text'] = get_content_ctx(record.text, kb, vocabulary, part=word_parts, context_type=context_type,
+                                   preloaded=preloaded)
     data['contain_answer'] = len(record.similar_answers["sents_containing_ans_ext"])
     data['not_contain_answer'] = len(data['text'])-len(record.similar_answers["sents_containing_ans_ext"])
@@ -131,7 +137,7 @@
         if key.startswith('name_phrs'):
             context_np.print_ctx(phrs)
         elif key.startswith('ctx_ner'):
-            context_ner.print_ctx(phrs)
+            context_ner.print_ctx(phrs)
         else:
             context_previous_senteces.print_ctx(phrs)
diff --git a/sqad_db.py b/sqad_db.py
index a93dace51cfd6fab781ae845e56bdf7c4efcca6d..b498dabbb921b10efde554570066f0d8e149f4a0 100755
--- a/sqad_db.py
+++ b/sqad_db.py
@@ -25,24 +25,40 @@ def word2id(vocabulary, word, lemma, tag, w2v):
     return key
 
 
-def id2word(vocabulary, key, parts=''):
+def id2word(vocabulary, key, parts='', preloaded=False):
     result = {}
     word_parts = parts.strip().split(';')
-    if 'w' in word_parts or not parts:
-        result['word'] = vocabulary.id2wlt[key]['word']
-    if 'l' in word_parts or not parts:
-        result['lemma'] = vocabulary.id2wlt[key]['lemma']
-    if 't' in word_parts or not parts:
-        result['tag'] = vocabulary.id2wlt[key]['tag']
-    if 'v100' in word_parts or not parts:
-        result['v100'] = vocabulary.vectors[key][0]
-    if 'v300' in word_parts or not parts:
-        result['v300'] = vocabulary.vectors[key][1]
-    if 'v500' in word_parts or not parts:
-        result['v500'] = vocabulary.vectors[key][2]
-    if 'id' in word_parts or not parts:
-        result['id'] = key
+    if preloaded:
+        if 'w' in word_parts or not parts:
+            result['word'] = vocabulary['id2wlt'][key]['word']
+        if 'l' in word_parts or not parts:
+            result['lemma'] = vocabulary['id2wlt'][key]['lemma']
+        if 't' in word_parts or not parts:
+            result['tag'] = vocabulary['id2wlt'][key]['tag']
+        if 'v100' in word_parts or not parts:
+            result['v100'] = vocabulary['vectors'][key][0]
+        if 'v300' in word_parts or not parts:
+            result['v300'] = vocabulary['vectors'][key][1]
+        if 'v500' in word_parts or not parts:
+            result['v500'] = vocabulary['vectors'][key][2]
+        if 'id' in word_parts or not parts:
+            result['id'] = key
+    else:
+        if 'w' in word_parts or not parts:
+            result['word'] = vocabulary.id2wlt[key]['word']
+        if 'l' in word_parts or not parts:
+            result['lemma'] = vocabulary.id2wlt[key]['lemma']
+        if 't' in word_parts or not parts:
+            result['tag'] = vocabulary.id2wlt[key]['tag']
+        if 'v100' in word_parts or not parts:
+            result['v100'] = vocabulary.vectors[key][0]
+        if 'v300' in word_parts or not parts:
+            result['v300'] = vocabulary.vectors[key][1]
+        if 'v500' in word_parts or not parts:
+            result['v500'] = vocabulary.vectors[key][2]
+        if 'id' in word_parts or not parts:
+            result['id'] = key
     return result
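Usage note (not part of the patch): the sketch below shows how a caller might use the new preloaded path added in this diff, fetching the dictionaries once with db.get_dicts() and then passing them into get_record() so id2word() reads from plain dicts instead of the persistent vocabulary on every token. The way the database object `db` is opened, the `record_ids` list, and the dict conversion of the vocabulary are illustrative assumptions; only get_record, get_dicts, id2word and the keyword arguments introduced above come from this diff.

    from query_database import get_record

    # Assumption: `db` is an already opened sqad_db database object; how it is
    # opened is outside this diff. get_dicts() is the accessor that get_record()
    # itself falls back to when nothing is preloaded.
    vocabulary, qa_type_dict, kb = db.get_dicts()

    # Assumption: a preloaded vocabulary is a plain dict with the layout that
    # id2word() expects when preloaded=True ('id2wlt' and 'vectors' keys).
    # dict() is one possible way to materialise the persistent mappings.
    preloaded_vocab = {'id2wlt': dict(vocabulary.id2wlt),
                       'vectors': dict(vocabulary.vectors)}

    # Fetch many records without re-reading the dictionaries from the database
    # on every call; record_ids is a hypothetical list of record identifiers.
    for rec_id in record_ids:
        record = get_record(db, rec_id, word_parts='w;l', context_type='all',
                            vocabulary=preloaded_vocab,
                            qa_type_dict=qa_type_dict,
                            kb=kb,
                            preloaded=True)

Passing preloaded=True without also passing the preloaded vocabulary (or vice versa) would make id2word() take the wrong branch, so the two should always be supplied together.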