import operator

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def create_interaction_matrix(df, user_col, city_col, rating_col):
    """Create an interaction matrix dataframe.

    Input -
    - df = pandas DataFrame containing user-city interactions
    - user_col = column name containing the user's identifier
    - city_col = column name containing the city's identifier
    - rating_col = column name containing the rating to aggregate

    Output -
    - DataFrame of users (rows) x cities (columns) filled with summed ratings
    """
    return (
        df.groupby([user_col, city_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)
    )


def get_index_city_dict(df):
    """Map each city name to its row index in df."""
    cities_dict = df[["city"]].to_dict()["city"]  # index -> city
    # Invert to city -> index.
    inv_map = {v: k for k, v in cities_dict.items()}
    return inv_map


def calculate_cosine_similarity(features):
    return cosine_similarity(features, features)


def cos_similarity(df, idx, how_many=10, drop=None):
    """Return the how_many cities most similar to the city at row idx,
    as (city, similarity) pairs sorted by descending similarity."""
    # Avoid a mutable default argument; always exclude the "city" column
    # from the feature matrix.
    drop = list(drop) if drop else []
    drop.append("city")
    features = df.drop(drop, axis=1)
    cosine_sim = calculate_cosine_similarity(features)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Skip position 0: the city most similar to idx is idx itself.
    sim_scores = sim_scores[1:1 + how_many]
    indices = [i[0] for i in sim_scores]
    d = dict(zip(df["city"].iloc[indices].values, [i[1] for i in sim_scores]))
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)


def get_cities_overlap(df, A, B):
    """Count how many users visited both city A and city B."""
    grouped = df.groupby("user")
    users_list = []
    for i, g in grouped:
        if (g["city"] == A).any() and (g["city"] == B).any():
            users_list.append(i)
    return len(users_list)


def get_city_count(df, A):
    """Count how many times city A occurred in the dataset."""
    return df[df["city"] == A].shape[0]


def sim_coefficient(df, A, B):
    """Co-occurrence count of A and B, normalised by their individual counts."""
    return get_cities_overlap(df, A, B) / (get_city_count(df, A) * get_city_count(df, B))


def get_similar_cities(df, city_dict, cities, n_of_matches):
    """Return cosine-similarity recommendations for a list of seed cities,
    sorted by descending similarity, together with the seed city that
    led to each recommendation.
    """
    d = []
    recommender_cities = []
    for c in cities:
        cos_sim = cos_similarity(df, city_dict[c], n_of_matches)
        for sim in cos_sim:
            if sim[0] in recommender_cities:
                # City already recommended: keep the higher similarity score
                # and the seed city that produced it.
                if d[recommender_cities.index(sim[0])][1] < sim[1]:
                    d[recommender_cities.index(sim[0])][1] = sim[1]
                    d[recommender_cities.index(sim[0])][2] = c
            elif sim[0] not in cities:
                d.append([sim[0], sim[1], c])
                recommender_cities.append(sim[0])
    d = np.array(d)
    # The array holds strings (mixed city names and scores), so cast the
    # similarity column to float before sorting; [::-1] reverses to
    # descending order.
    sorted_d = d[d[:, 1].astype(float).argsort()][::-1]
    return sorted_d


def load_data(folder="../data/"):
    # A regex separator requires the python parser engine; the raw string
    # avoids an invalid-escape warning for "\s".
    df_trips = pd.read_csv(f"{folder}trips.csv", sep=r"\s*,\s*",
                           engine="python", encoding="utf-8")
    # Normalise city names: lowercase, spaces to hyphens.
    df_trips.city = df_trips.city.astype(str).str.lower().str.replace(" ", "-")
    df_trips["trip_duration"] = pd.to_timedelta(df_trips["trip_duration"])
    df_cities = pd.read_csv(f"{folder}cities_stats_full.csv", encoding="utf-8")
    return df_trips, df_cities
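

# --- Usage sketch ---
# A minimal, hedged example of wiring the helpers above together. The
# column names "user", "city", and "rating", and the presence of "berlin"
# and "prague" in cities_stats_full.csv, are assumptions for illustration,
# not confirmed by this module; adjust them to the actual CSV schemas.
if __name__ == "__main__":
    df_trips, df_cities = load_data()

    # Hypothetical rating column; replace "rating" with whatever column
    # trips.csv actually uses to score a user-city interaction.
    # interactions = create_interaction_matrix(df_trips, "user", "city", "rating")

    city_dict = get_index_city_dict(df_cities)

    # Top-10 cities most similar to one city by cosine similarity over the
    # feature columns of df_cities. Pass any non-numeric columns besides
    # "city" via drop=[...] so the feature matrix stays numeric.
    print(cos_similarity(df_cities, city_dict["berlin"], how_many=10))

    # Recommendations seeded by several cities at once, each row carrying
    # the seed city that produced the match.
    print(get_similar_cities(df_cities, city_dict, ["berlin", "prague"], 5))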