Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import operator
import numpy as np
def create_interaction_matrix(df, user_col, city_col, rating_col):
    """Create a user-city interaction matrix.

    Parameters
    ----------
    df : pandas.DataFrame containing user-city interactions (one row per event)
    user_col : column name containing the user's identifier
    city_col : column name containing the city's identifier
    rating_col : column name containing the rating/score to aggregate

    Returns
    -------
    pandas.DataFrame indexed by user, one column per city, where each cell is
    the summed rating for that (user, city) pair; missing pairs are 0.
    """
    # groupby + sum aggregates repeated (user, city) events; unstack pivots
    # cities into columns; fillna(0) marks never-visited cities.
    return df.groupby([user_col, city_col])[rating_col].sum().unstack().reset_index().fillna(0).set_index(user_col)
def get_index_city_dict(df):
    """Map each city name to its row label in *df* (city -> index).

    Note: if a city appears more than once, the last occurrence wins,
    matching the original index->city inversion.
    """
    return {city: idx for idx, city in df["city"].items()}
def calculate_cosine_similarity(features):
    """Return the pairwise cosine-similarity matrix of the rows of *features*."""
    similarity_matrix = cosine_similarity(features, features)
    return similarity_matrix
def cos_similarity(df, idx, how_many=10, drop=None):
    """Return the *how_many* cities most similar to the city at row *idx*.

    Parameters
    ----------
    df : pandas.DataFrame with a "city" column plus numeric feature columns
    idx : row position of the reference city in the similarity matrix
    how_many : number of matches to return
    drop : optional list of extra column names to exclude from the feature
        matrix ("city" is always excluded)

    Returns
    -------
    list of (city, score) tuples, highest similarity first.
    """
    # Copy rather than mutate: the original used a mutable default ([]) and
    # appended "city" to it, so the shared default list grew on every call.
    cols_to_drop = list(drop) if drop is not None else []
    cols_to_drop.append("city")
    features = df.drop(cols_to_drop, axis=1)
    cosine_sim = calculate_cosine_similarity(features)
    # Rank all cities by similarity to the reference row; skip position 0,
    # which is the reference city itself (similarity 1.0).
    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:1 + how_many]
    indices = [i for i, _ in sim_scores]
    scores = [s for _, s in sim_scores]
    d = dict(zip(df['city'].iloc[indices].values, scores))
    return sorted(d.items(), key=operator.itemgetter(1), reverse=True)
def get_cities_overlap(df, A, B):
    """Count the users whose trips include both city *A* and city *B*."""
    overlap_count = 0
    for _, user_trips in df.groupby("user"):
        visited = user_trips["city"]
        if (visited == A).any() and (visited == B).any():
            overlap_count += 1
    return overlap_count
def get_city_count(df, A):
    """Return how many rows of *df* have city equal to *A*."""
    matches = df.loc[df["city"] == A]
    return len(matches)
def sim_coefficient(df, A, B):
    """Co-occurrence of cities *A* and *B*, normalized by the product of
    their individual occurrence counts.

    Returns 0.0 when either city never appears in *df* — the original
    raised ZeroDivisionError in that case.
    """
    denominator = get_city_count(df, A) * get_city_count(df, B)
    if denominator == 0:
        return 0.0
    return get_cities_overlap(df, A, B) / denominator
def get_similar_cities(df, city_dict, cities, n_of_matches):
    """Return similarity coefficients for recommended cities, computed by
    cosine similarity, in descending score order, together with the input
    city that led to each recommendation.

    Parameters
    ----------
    df : feature DataFrame passed through to cos_similarity
    city_dict : mapping city name -> row index (see get_index_city_dict)
    cities : list of seed city names to recommend from
    n_of_matches : number of matches requested per seed city

    Returns
    -------
    numpy string array of shape (k, 3): [recommended_city, score, seed_city],
    sorted by score descending; empty (0, 3) array when nothing matched.
    """
    d = []
    recommender_cities = []
    for c in cities:
        cos_sim = cos_similarity(df, city_dict[c], n_of_matches)
        for city, score in cos_sim:
            if city in recommender_cities:
                # Already recommended: keep the higher score and remember
                # which seed city produced it.
                pos = recommender_cities.index(city)
                if d[pos][1] < score:
                    d[pos][1] = score
                    d[pos][2] = c
            elif city not in cities:
                d.append([city, score, c])
                recommender_cities.append(city)
    if not d:
        # No recommendations: return an empty array instead of crashing
        # on the slice/sort below.
        return np.empty((0, 3), dtype=str)
    # Sort numerically BEFORE converting to a numpy array. The original did
    # np.array(d) first, which stringified the scores, so argsort compared
    # them lexicographically (e.g. "1e-05" sorted above "0.5").
    d.sort(key=lambda row: row[1], reverse=True)
    return np.array(d)
def load_data(folder="../data/"):
    """Load the trips and city-stats CSV files from *folder*.

    Parameters
    ----------
    folder : path prefix (must end with a separator) containing
        "trips.csv" and "cities_stats_full.csv".

    Returns
    -------
    (df_trips, df_cities) tuple of DataFrames. In df_trips, city names are
    normalized to lowercase-with-hyphens and trip_duration is a Timedelta.
    """
    # Raw string for the regex separator (the original '\s*,\s*' relied on an
    # invalid escape), and engine='python' because the default C engine does
    # not support regex separators.
    df_trips = pd.read_csv(f"{folder}trips.csv", sep=r'\s*,\s*',
                           engine='python', encoding='utf-8')
    # Normalize city names: "New York" -> "new-york".
    df_trips.city = pd.Series([str(city).lower().replace(' ', '-') for city in df_trips.city.values])
    df_trips["trip_duration"] = pd.to_timedelta(df_trips["trip_duration"])
    df_cities = pd.read_csv(f"{folder}cities_stats_full.csv", encoding='utf-8')
    return df_trips, df_cities