Loading add_similar_senteces.py +10 −9 Original line number Diff line number Diff line Loading @@ -7,6 +7,7 @@ import persistent.list from query_database import get_content import transaction import sys import math import numpy as np from scipy import spatial Loading @@ -17,8 +18,10 @@ def precompute_tfidf(kb, vocabulary): for url, text in kb.url2doc.items(): number_of_documents += 1 number_of_words_per_document = 0 for sentence in text['text']: for word_id in sentence['sent']: number_of_words_per_document += 1 word_lemma = id2word(vocabulary, word_id)['lemma'] if idf_per_doc.get(url): Loading @@ -27,12 +30,14 @@ def precompute_tfidf(kb, vocabulary): idf_per_doc[url] = {word_lemma} if tf_per_doc.get(url): if tf_per_doc.get(url).get(word_lemma): if tf_per_doc[url].get(word_lemma): tf_per_doc[url][word_lemma] += 1 else: tf_per_doc[url][word_lemma] = 1 else: tf_per_doc[url] = {word_lemma: 1} for word_lemma in tf_per_doc[url].keys(): tf_per_doc[url][word_lemma] = float(tf_per_doc[url][word_lemma]) / number_of_words_per_document return number_of_documents, idf_per_doc, tf_per_doc Loading @@ -56,18 +61,16 @@ def compute_tfidf(kb, vocabulary, verbose=False): print('Error: multiple tf-idf') sys.exit() else: tf_idf[url][word_lemma] = word_tf * idf[word_lemma] tf_idf[url][word_lemma] = word_tf * math.log(float(number_of_documents) / idf[word_lemma]) else: tf_idf[url] = {word_lemma: word_tf * idf[word_lemma]} tf_idf[url] = {word_lemma: word_tf * math.log(float(number_of_documents) / idf[word_lemma])} if verbose: # Check TF-IDF score manually for url, value in tf_per_doc.items(): print(f'{url}') for w, c in value.items(): word = id2word(vocabulary, w) print(f'{word["word"]}\t{word["lemma"]}\t' f'{word["tag"]}: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}') print(f'{w}\t: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}') sys.exit() return tf_idf Loading Loading @@ -114,8 +117,6 @@ def find_similar_senteces(db, tf_idf): cos_sim = 1 - spatial.distance.cosine(v_as, v_sent) # Filter exact answers if idx not in record.similar_answers['sents_containing_ans_ext']: similar_senteces.append((idx, cos_sim)) yield rid, similar_senteces Loading Loading
add_similar_senteces.py +10 −9 Original line number Diff line number Diff line Loading @@ -7,6 +7,7 @@ import persistent.list from query_database import get_content import transaction import sys import math import numpy as np from scipy import spatial Loading @@ -17,8 +18,10 @@ def precompute_tfidf(kb, vocabulary): for url, text in kb.url2doc.items(): number_of_documents += 1 number_of_words_per_document = 0 for sentence in text['text']: for word_id in sentence['sent']: number_of_words_per_document += 1 word_lemma = id2word(vocabulary, word_id)['lemma'] if idf_per_doc.get(url): Loading @@ -27,12 +30,14 @@ def precompute_tfidf(kb, vocabulary): idf_per_doc[url] = {word_lemma} if tf_per_doc.get(url): if tf_per_doc.get(url).get(word_lemma): if tf_per_doc[url].get(word_lemma): tf_per_doc[url][word_lemma] += 1 else: tf_per_doc[url][word_lemma] = 1 else: tf_per_doc[url] = {word_lemma: 1} for word_lemma in tf_per_doc[url].keys(): tf_per_doc[url][word_lemma] = float(tf_per_doc[url][word_lemma]) / number_of_words_per_document return number_of_documents, idf_per_doc, tf_per_doc Loading @@ -56,18 +61,16 @@ def compute_tfidf(kb, vocabulary, verbose=False): print('Error: multiple tf-idf') sys.exit() else: tf_idf[url][word_lemma] = word_tf * idf[word_lemma] tf_idf[url][word_lemma] = word_tf * math.log(float(number_of_documents) / idf[word_lemma]) else: tf_idf[url] = {word_lemma: word_tf * idf[word_lemma]} tf_idf[url] = {word_lemma: word_tf * math.log(float(number_of_documents) / idf[word_lemma])} if verbose: # Check TF-IDF score manually for url, value in tf_per_doc.items(): print(f'{url}') for w, c in value.items(): word = id2word(vocabulary, w) print(f'{word["word"]}\t{word["lemma"]}\t' f'{word["tag"]}: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}') print(f'{w}\t: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}') sys.exit() return tf_idf Loading Loading @@ -114,8 +117,6 @@ def find_similar_senteces(db, tf_idf): cos_sim = 1 - spatial.distance.cosine(v_as, v_sent) # Filter exact answers if idx not in record.similar_answers['sents_containing_ans_ext']: similar_senteces.append((idx, cos_sim)) yield rid, similar_senteces Loading