# recommender.py
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import operator
import numpy as np


def create_interaction_matrix(df, user_col, city_col, rating_col):
    """Create an interaction matrix dataframe.
        - df = pandas DataFrame containing user-city interactions
        - user_col = column name containing the user's identifier
        - city_col = column name containing the city's identifier
        - rating_col = column name containing the rating to aggregate
    Output -
        - DataFrame indexed by user with one column per city, holding the
          summed ratings (0 where a user never rated a city)
    """
    return (
        df.groupby([user_col, city_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)
    )
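
# A minimal usage sketch for create_interaction_matrix on a hypothetical toy
# dataframe; the column names "user", "city" and "rating" are assumptions for
# illustration, not necessarily the ones used in trips.csv.
def _example_interaction_matrix():
    toy = pd.DataFrame({
        "user": ["u1", "u1", "u2", "u2"],
        "city": ["prague", "berlin", "prague", "vienna"],
        "rating": [5, 3, 4, 2],
    })
    # rows = users, columns = cities, values = summed ratings (0 if missing)
    return create_interaction_matrix(toy, "user", "city", "rating")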

def get_index_city_dict(df):
    """Return a mapping of city name -> row index for the cities dataframe."""
    cities_dict = df[["city"]].to_dict()["city"]
    # to_dict gives index -> city; invert it to get city -> index
    inv_map = {v: k for k, v in cities_dict.items()}
    return inv_map
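
# A minimal sketch of get_index_city_dict on a hypothetical cities dataframe;
# it simply inverts the index -> city mapping of the "city" column.
def _example_city_dict():
    toy = pd.DataFrame({"city": ["prague", "berlin", "vienna"]})
    return get_index_city_dict(toy)  # {'prague': 0, 'berlin': 1, 'vienna': 2}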

def calculate_cosine_similarity(features):
    """Return the pairwise cosine similarity matrix of the feature rows."""
    return cosine_similarity(features, features)

def cos_similarity(df, idx, how_many=10, drop=None):
    """Return the `how_many` cities most similar to the city at row `idx`,
    as (city, similarity) pairs sorted by descending cosine similarity.
    Columns listed in `drop` are excluded from the feature matrix.
    """
    # avoid a mutable default argument; "city" is always excluded from features
    drop = list(drop) if drop else []
    drop.append("city")
    features = df.drop(drop, axis=1)
    cosine_sim = calculate_cosine_similarity(features)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # skip the first entry: it is the city itself (similarity 1.0)
    sim_scores = sim_scores[1:1 + how_many]
    indices = [i[0] for i in sim_scores]
    d = dict(zip(df["city"].iloc[indices].values, [i[1] for i in sim_scores]))
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
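
# A minimal usage sketch for cos_similarity on a hypothetical city-features
# dataframe; the feature columns ("cost", "temperature") are made up and only
# illustrate the expected shape: one row per city plus a "city" name column.
def _example_cos_similarity():
    toy = pd.DataFrame({
        "city": ["prague", "berlin", "vienna", "lisbon"],
        "cost": [0.4, 0.6, 0.5, 0.3],
        "temperature": [0.5, 0.4, 0.5, 0.8],
    })
    # cities most similar to the city at row 0 ("prague"), excluding itself
    return cos_similarity(toy, 0, how_many=2)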

def get_cities_overlap(df, A, B):
    """Count how many users have trips to both city A and city B."""
    grouped = df.groupby("user")
    users_list = []
    for i, g in grouped:
        if (g["city"] == A).any() and (g["city"] == B).any():
            users_list.append(i)
    return len(users_list)

def get_city_count(df, A):
    """Count how many times city A occurs in the dataset."""
    return df[df["city"] == A].shape[0]

def sim_coefficient(df, A, B):
    """Co-occurrence coefficient: number of users who visited both A and B,
    normalised by the product of the individual visit counts of A and B.
    """
    return get_cities_overlap(df, A, B) / (get_city_count(df, A) * get_city_count(df, B))
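
# A minimal sketch of the co-occurrence coefficient on hypothetical trip data;
# the "user" and "city" column names match what get_cities_overlap expects.
def _example_sim_coefficient():
    toy = pd.DataFrame({
        "user": ["u1", "u1", "u2", "u2", "u3"],
        "city": ["prague", "berlin", "prague", "berlin", "prague"],
    })
    # 2 users visited both cities; prague appears 3 times, berlin 2 times
    return sim_coefficient(toy, "prague", "berlin")  # 2 / (3 * 2) = 0.333...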

def get_similar_cities(df, city_dict, cities, n_of_matches):
    """Return similarity coefficients for the recommended cities, calculated
    by cosine similarity and sorted in descending order, together with the
    city that led to each recommendation.
    """
    d = []
    recommender_cities = []
    for c in cities:
        cos_sim = cos_similarity(df, city_dict[c], n_of_matches)
        for sim in cos_sim:
            if sim[0] in recommender_cities:
                # if the city was already recommended, keep the higher score
                if d[recommender_cities.index(sim[0])][1] < sim[1]:
                    d[recommender_cities.index(sim[0])][1] = sim[1]
                    d[recommender_cities.index(sim[0])][2] = c
            elif sim[0] not in cities:
                d.append([sim[0], sim[1], c])
                recommender_cities.append(sim[0])
    d = np.array(d)
    # sort by similarity value; cast to float because the mixed-type array is
    # stored as strings, [::-1] for descending order
    sorted_d = d[d[:, 1].astype(float).argsort()][::-1]
    return sorted_d
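
# A minimal usage sketch for get_similar_cities, assuming df_cities is the
# city-features dataframe returned by load_data (a "city" column plus numeric
# feature columns) and that the hypothetical seed cities exist in it.
def _example_similar_cities(df_cities):
    city_dict = get_index_city_dict(df_cities)
    seed_cities = ["prague", "berlin"]  # assumed to exist in df_cities
    # each row: [recommended city, best cosine similarity, seed city that produced it]
    return get_similar_cities(df_cities, city_dict, seed_cities, n_of_matches=5)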

def load_data(folder="../data/"):
    # sep is a regex (strips whitespace around commas), which requires the python engine
    df_trips = pd.read_csv(f"{folder}trips.csv", sep=r"\s*,\s*", engine="python", encoding="utf-8")
    # normalise city names: lowercase with spaces replaced by dashes
    df_trips["city"] = df_trips["city"].astype(str).str.lower().str.replace(" ", "-")
    df_trips["trip_duration"] = pd.to_timedelta(df_trips["trip_duration"])
    df_cities = pd.read_csv(f"{folder}cities_stats_full.csv", encoding="utf-8")
    return df_trips, df_cities
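
# End-to-end sketch of how these pieces might be wired together; the guard
# keeps it from running on import. The "user"/"city"/"rating" column names and
# the "prague" seed city are assumptions about the contents of the CSV files.
if __name__ == "__main__":
    df_trips, df_cities = load_data()
    interactions = create_interaction_matrix(df_trips, "user", "city", "rating")
    city_dict = get_index_city_dict(df_cities)
    recommendations = get_similar_cities(df_cities, city_dict, ["prague"], n_of_matches=10)
    print(recommendations[:5])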