Commit c963ec12 authored by Marek Medved
Browse files

Fix TF-IDF computing

parent 09f7d0db
Loading
Loading
Loading
Loading
+10 −9
Original line number Diff line number Diff line
@@ -7,6 +7,7 @@ import persistent.list
from query_database import get_content
import transaction
import sys
import math
import numpy as np
from scipy import spatial

@@ -17,8 +18,10 @@ def precompute_tfidf(kb, vocabulary):

    for url, text in kb.url2doc.items():
        number_of_documents += 1
        number_of_words_per_document = 0
        for sentence in text['text']:
            for word_id in sentence['sent']:
                number_of_words_per_document += 1
                word_lemma = id2word(vocabulary, word_id)['lemma']

                if idf_per_doc.get(url):
@@ -27,12 +30,14 @@ def precompute_tfidf(kb, vocabulary):
                    idf_per_doc[url] = {word_lemma}

                if tf_per_doc.get(url):
                    if tf_per_doc.get(url).get(word_lemma):
                    if tf_per_doc[url].get(word_lemma):
                        tf_per_doc[url][word_lemma] += 1
                    else:
                        tf_per_doc[url][word_lemma] = 1
                else:
                    tf_per_doc[url] = {word_lemma: 1}
        for word_lemma in tf_per_doc[url].keys():
            tf_per_doc[url][word_lemma] = float(tf_per_doc[url][word_lemma]) / number_of_words_per_document

    return number_of_documents, idf_per_doc, tf_per_doc

@@ -56,18 +61,16 @@ def compute_tfidf(kb, vocabulary, verbose=False):
                    print('Error: multiple tf-idf')
                    sys.exit()
                else:
                    tf_idf[url][word_lemma] = word_tf * idf[word_lemma]
                    tf_idf[url][word_lemma] = word_tf * math.log(float(number_of_documents) / idf[word_lemma])
            else:
                tf_idf[url] = {word_lemma: word_tf * idf[word_lemma]}
                tf_idf[url] = {word_lemma: word_tf * math.log(float(number_of_documents) / idf[word_lemma])}

    if verbose:
        # Check TF-IDF score manually
        for url, value in tf_per_doc.items():
            print(f'{url}')
            for w, c in value.items():
                word = id2word(vocabulary, w)
                print(f'{word["word"]}\t{word["lemma"]}\t'
                      f'{word["tag"]}: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}')
                print(f'{w}\t: tf:{c}, idf: {idf[w]}, tf-idf: {tf_idf[url][w]}')
        sys.exit()

    return tf_idf
@@ -114,8 +117,6 @@ def find_similar_senteces(db, tf_idf):

                    cos_sim = 1 - spatial.distance.cosine(v_as, v_sent)

                    # Filter exact answers
                    if idx not in record.similar_answers['sents_containing_ans_ext']:
                    similar_senteces.append((idx, cos_sim))

        yield rid, similar_senteces