Compare revisions: xslanin/pv254-city-recommender (10 commits on source)

Changes are shown as if the source revision was being merged into the target revision.
.idea/
src/handler/__pycache__/
src/scraper/__pycache__/
@@ -2,6 +2,32 @@
Project for PV254 (Fall 2019). Recommends a city based on your travel history.
# Motivation
## Problem statement
We recommend cities to visit as travel destinations based on a person's travel history and other preferences.
- **Use case:** As a user, I want to find interesting cities to visit based on cities I liked in the past and my personal preferences (e.g. cost, peacefulness).
## Dataset
The dataset, found in the `data/` folder, is scraped from user pages of [nomadlist.com](https://nomadlist.com). It contains 72k trips, 3,700 users, and 599 cities.
## Algorithms
- cosine similarity on features of the `cities` dataset (sketched below)
- SVD, NMF, KNN, Baseline
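The content-based part ranks cities by cosine similarity over the scraped feature columns. A minimal sketch, assuming the `df_cities` table built in the loading example below (the function name and the `k` parameter are illustrative, not the project's exact code):

```
import numpy as np
import pandas as pd

def recommend_similar(df_cities: pd.DataFrame, liked_city: str, k: int = 5) -> pd.Series:
    """Rank cities by cosine similarity to a city the user liked."""
    # scraped scores may arrive as strings, so cast the feature columns to float
    features = df_cities.set_index('city').astype(float)
    v = features.loc[liked_city].to_numpy()
    sims = features.to_numpy() @ v / (
        np.linalg.norm(features.to_numpy(), axis=1) * np.linalg.norm(v))
    # drop the query city itself and return the k most similar ones
    return pd.Series(sims, index=features.index).drop(liked_city).nlargest(k)
```

SVD, NMF, KNN, and Baseline match the model names in the scikit-surprise package; whether that package was used here is an assumption. A hedged sketch, assuming trips are first aggregated into a hypothetical `df_ratings` frame of (user, city, rating) triples:

```
from surprise import SVD, Dataset, Reader

# Hypothetical input: df_ratings has columns user, city, rating, where the
# rating could be, e.g., the number of visits clipped to a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_ratings[['user', 'city', 'rating']], reader)

algo = SVD()  # NMF, KNNBasic, or BaselineOnly plug in the same way
algo.fit(data.build_full_trainset())
print(algo.predict('some-user', 'prague').est)  # estimated preference
```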
### Loading the dataset with pandas
Some boilerplate code is needed to load the dataset properly. Example:
```
import json
import pandas as pd

PATH = "..\\data"
# the regex separator needs the python engine; it trims the spaces after commas
df_trips = pd.read_csv(f"{PATH}\\trips.csv", sep=r'\s*,\s*', engine='python', encoding='utf-8')
# normalize city names to nomadlist slugs, e.g. "New York City" -> "new-york-city"
df_trips.city = pd.Series([str(city).lower().replace(' ', '-') for city in df_trips.city.values])

# assumption: city stats were scraped into data/cities_stats.json by the crawler
with open(f"{PATH}\\cities_stats.json", encoding='utf-8') as f:
    df_cities = pd.DataFrame.from_dict(json.load(f))

# flatten the nested per-city score dicts, skipping cities with no scores
dict_df = {'city': [], 'hospital_score': [], 'english_speaking': [], 'nightlife': [],
           'female_friendly': [], 'racial_tolerance': [], 'peace_score': []}
for (i, row) in df_cities[df_cities.cities.map(bool)].iterrows():
    for key in dict_df.keys():
        if key != 'city':
            dict_df[key].append(row.values[0][key])
    dict_df['city'].append(row.name)
df_cities = pd.DataFrame.from_dict(dict_df)
```
\ No newline at end of file
@@ -825,7 +825,7 @@ adamnowek, London, United Kingdom, 2006-01-18, 2006-01-19, 1.0, 51.5073509, -0.1
adamnowek, Exeter, United Kingdom, 2006-01-17, 2006-01-18, 1.0, 50.718412, -3.5338990000000194
adamnowek, Bristol, United Kingdom, 2006-01-16, 2006-01-17, 1.0, 51.454513, -2.5879099999999653
adamnowek, Cardiff, United Kingdom, 2006-01-15, 2006-01-16, 1.0, 51.48158100000001, -3.1790899999999738
adamnowek, London, United Kingdom, 20016-01-12, 2006-01-15, 2.0, 51.5073509, -0.12775829999998223
adamnowek, London, United Kingdom, 2016-01-12, 2006-01-15, 2.0, 51.5073509, -0.12775829999998223
adamnowek, Vancouver, Canada, 2005-12-15, 2006-01-12, 28.0, 49.2827291, -123.12073750000002
adamnowek, New York City, United States, 2005-12-09, 2005-12-15, 6.0, 40.7127837, -74.00594130000002
adamnowek, Vancouver, Canada, 2005-11-13, 2005-12-09, 26.0, 49.2827291, -123.12073750000002
@@ -19740,7 +19740,6 @@ jeffmadduxcpa, Hoi An, Vietnam, 2015-12-08, 2015-12-08, 1.0, 15.8800584, 108.338
jeffmadduxcpa, Da Nang, Vietnam, 2015-12-03, 2015-12-17, 14.0, 16.0544068, 108.20216670000002
jeffmadduxcpa, San Francisco, United States, 2015-12-02, 2015-12-02, 1.0, 37.7749295, -122.41941550000001
jeffmadduxcpa, Seattle, United States, 2015-10-24, 2015-10-26, 2.0, 47.6062095, -122.3320708
jeffmadduxcpa, Napa, United States, Oct 18, 2015, Oct 18, 2015, 1.0, 38.2971, -122.2855
jeffmadduxcpa, South Lake Tahoe, United States, 2015-08-30, 2015-08-31, 1.0, 38.9332, -119.9843
jeffmadduxcpa, San Francisco, United States, 2015-06-15, 2015-06-17, 2.0, 37.7749295, -122.41941550000001
jeffmadduxcpa, Puerto Vallarta, Mexico, 2015-05-10, 2015-05-27, 17.0, 20.65340699999999, -105.2253316
@@ -19751,16 +19750,12 @@ jeffmadduxcpa, Puerto Vallarta, Mexico, 2014-11-16, 2014-11-20, 4.0, 20.65340699
jeffmadduxcpa, Seattle, United States, 2014-09-27, 2014-09-29, 2.0, 47.6062095, -122.3320708
jeffmadduxcpa, Napa, United States, 2014-09-06, 2014-09-06, 1.0, 38.2975381, -122.28686500000003
jeffmadduxcpa, San Francisco, United States, 2014-08-24, 2014-08-26, 2.0, 37.7749295, -122.41941550000001
jeffmadduxcpa, Lake Tahoe, United States, Apr 11, 2014, Apr 13, 2014, 2.0, 39.09684929999999, -120.0323507
jeffmadduxcpa, Maui, United States, 2014-01-15, 2014-01-20, 5.0, 20.7983626, -156.33192529999997
jeffmadduxcpa, Portland, United States, 2013-09-20, 2013-09-22, 2.0, 45.52306220000001, -122.67648159999999
jeffmadduxcpa, Seattle, United States, 2013-06-22, 2013-06-26, 4.0, 47.6062095, -122.3320708
jeffmadduxcpa, Berkeley, United States, 2013-06-02, 2013-06-02, 1.0, 37.8715926, -122.27274699999998
jeffmadduxcpa, Las Vegas, United States, 2012-11-28, 2012-11-30, 2.0, 36.1699412, -115.13982959999998
jeffmadduxcpa, San Luis Obispo, United States, 2012-09-15, 2012-09-16, 1.0, 35.28275240000001, -120.6596156
jeffmadduxcpa, Lake Tahoe, United States, Jul 20, 2012, Jul 22, 2012, 2.0, 39.09684929999999, -120.0323507
jeffmadduxcpa, Lake Tahoe, United States, Jul 4, 2012, Jul 4, 2012, 1.0, 39.09684929999999, -120.0323507
jeffmadduxcpa, Reno, United States, Aug 23, 2011, Aug 25, 2011, 2.0, 39.5296329, -119.8138027
jeffmadduxcpa, Santa Monica, United States, 2011-02-01, 2011-02-02, 1.0, 34.0194543, -118.4911912
jeffmadduxcpa, Patong, Thailand, 2010-04-27, 2010-04-29, 2.0, 7.8969151, 98.3020123
jeffmadduxcpa, Ko Lanta, Thailand, 2010-04-25, 2010-04-27, 2.0, 7.624367700000001, 99.07922630000007
@@ -19768,7 +19763,6 @@ jeffmadduxcpa, Ao Nang, Thailand, 2010-04-22, 2010-04-25, 3.0, 8.0120253, 98.837
jeffmadduxcpa, Ko Phi Phi, Thailand, 2010-04-16, 2010-04-22, 6.0, 7.747049899999999, 98.78596519999996
jeffmadduxcpa, Phuket, Thailand, 2010-04-15, 2010-04-16, 1.0, 7.8804479, 98.39225039999997
jeffmadduxcpa, Chiang Mai, Thailand, 2010-04-13, 2010-04-15, 2.0, 18.7888472, 98.9858313
jeffmadduxcpa, Phra Nakhon Si Ayutthaya, Thailand, april 10, 2010, april 12, 2010, 2.0, 14.3532128, 100.56895989999998
jeffmadduxcpa, Bangkok, Thailand, 2010-04-08, 2010-04-10, 2.0, 13.7563309, 100.50176510000006
jeffmadduxcpa, San Francisco, United States, 2010-04-07, 2010-04-07, 1.0, 37.7749295, -122.41941550000001
jeffmadduxcpa, Denver, United States, 2008-03-09, 2008-03-16, 7.0, 39.7392358, -104.990251
from typing import List
from src.handler.csv_handler import CsvHandler
from src.handler.json_handler import JsonHandler
from src.scraper.scraper import Scraper
class NomadCityCrawler:
attribute_to_find = "data-key"
attribute_to_find_values = [
"hospital_score",
"english_speaking",
"nightlife",
"female_friendly",
"racial_tolerance",
"peace_score"]
@staticmethod
def get_cities_info(cities_to_browse: List[str]) -> dict:
cities_stats = {}
cities_stats["cities"] = {}
for city in cities_to_browse:
print("working on " + city)
try:
cities_stats["cities"][city] = {}
context = Scraper.at("https://nomadlist.com/" + city)
for key, element in context.with_attribute_having_values(NomadCityCrawler.attribute_to_find, NomadCityCrawler.attribute_to_find_values):
score = element.find().get_attribute_value("data-value")
cities_stats["cities"][city].update({key: score})
except Exception as e:
print("For city " + city)
print(e)
return cities_stats
def main():
    cities = []
    for line in CsvHandler.load("./../../data/trips.csv")[1]:
        cities.append(line["city"].lower().replace(" ", "-"))
    cities = list(dict.fromkeys(cities))  # deduplicate, preserving order
    cities_info = NomadCityCrawler.get_cities_info(cities)
    JsonHandler.save("./../../data/cities_stats.json", cities_info)


if __name__ == "__main__":
    main()
\ No newline at end of file
import csv
from typing import List, Tuple
class CsvHandler:
    @staticmethod
    def _remove_leading_space(word: str) -> str:
        """
        Removes a single leading space if present
        """
        return word[1:] if word.startswith(" ") else word
@staticmethod
    def load(filepath: str) -> Tuple[List[str], List[dict]]:
"""
Loads csv file
:param filepath: path to csv file
:return: tuple of (list of header params, list of dict in format: {header1: value1, header2: value2,...})
"""
        item_list = []
        with open(filepath, 'r') as csvFile:
            # strip the trailing newline so the last header name stays clean
            header = csvFile.readline().strip().split(",")
            for i in range(1, len(header)):
                header[i] = CsvHandler._remove_leading_space(header[i])
            reader = csv.reader(csvFile)
            for row in reader:
                for i in range(1, len(row)):
                    row[i] = CsvHandler._remove_leading_space(row[i])
                # start at 0 so the first column (user) is kept as well
                dic = {}
                for i in range(len(header)):
                    dic[header[i]] = row[i]
                item_list.append(dic)
        return header, item_list
    @staticmethod
    def save(filepath: str, header: list, csv_as_list_dic: List[dict]) -> None:
        """
        Saves a csv file to disk
        :param filepath: path to save the csv file (with filename)
        :param header: header to save into the file
        :param csv_as_list_dic: list of dicts in format {header1: value1, header2: value2,...}
        :return: None
        """
        with open(filepath, 'w') as csvFile:
            csvFile.write(",".join(header) + "\n")
            for row in csv_as_list_dic:
                csvFile.write(",".join(row[headerItem] for headerItem in header) + "\n")
import json


class JsonHandler:
    @staticmethod
    def load(filepath: str) -> dict:
        """
        Loads json from disk
        :param filepath: path to the json file (with filename)
        :return: json as dict
        """
        with open(filepath, 'r') as jsonFile:
            # the context manager closes the file; no explicit close() needed
            return json.load(jsonFile)
    @staticmethod
    def save(filepath: str, json_as_dict: dict) -> None:
        """
        Saves json to disk
        :param filepath: path to save the json file (with filename)
        :param json_as_dict: dict to serialize
        :return: None
        """
        with open(filepath, "w+") as jsonFile:
            jsonFile.write(json.dumps(json_as_dict, indent=4))
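For reference, a minimal usage sketch of the two handlers (paths and printed values are illustrative):

```
# Hypothetical round trip through the handlers.
header, rows = CsvHandler.load("data/trips.csv")
print(header[0], rows[0]["city"])  # e.g. prints: user London

JsonHandler.save("data/example.json", {"cities": {}})
assert JsonHandler.load("data/example.json") == {"cities": {}}
```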
from bs4 import BeautifulSoup
import requests
BASE_URL = "https://nomadlist.com/"
DEFAULT_USER = "@levelsio"
def get_users(soup, n=1000):
"""Gets users from "crossed paths with" section on a user's profile.
Args:
soup: BeautifulSoup object from loaded page
n: number of users to get
Returns:
List of users
"""
users = []
user_index = 0
while len(users) < n:
o = soup.find("div", {"id": "most-overlaps"})
if o:
for a in o.find_all('a', href=True):
users.append(a['href'])
        users = list(dict.fromkeys(users))  # remove duplicates, keep insertion order
        page = requests.get(f"{BASE_URL}{users[user_index]}")
        user_index += 1
        soup = BeautifulSoup(page.content, 'html.parser')
return users
def get_most_visited_cities(soup):
"""Gets the most visited cities by a user from 'Most visited section'.
Args:
soup: BeautifulSoup object from loaded page
Returns:
Dict of city: number of visits
"""
trips_to_cities = soup.find("div", class_="most_trips_to_cities")
trips_to_cities_name = trips_to_cities.div.find_all("li")
trips_to_cities_count = trips_to_cities.div.find_all("span", class_="extra-people-counter")
city_visits = dict()
for city, counter in zip(trips_to_cities_name, trips_to_cities_count):
city_visits[city["data-slug"]] = counter.text.strip("x")
return city_visits
def get_most_time_spent(soup):
"""Gets the most time spent in cities by a user from 'Most time spent section'.
Args:
soup: BeautifulSoup object from loaded page
Returns:
Dict of city: time spent
"""
most_visited_cities = soup.find_all("div", class_="most_visited_cities")
city_time = dict()
if len(most_visited_cities) > 1:
        most_time_spent = most_visited_cities[1]  # two divs share this class name; the second one holds time spent
most_time_spent_name = most_time_spent.div.find_all("li")
most_time_spent_count = most_time_spent.div.find_all("span", class_="extra-people-counter")
for city, time in zip(most_time_spent_name, most_time_spent_count):
city_time[city["data-slug"]] = time.text
return city_time
def convert_time_to_days(t):
    """ Converts time information given in years, months, or days to days
    Args:
        t: string of time, e.g. '1yr', '11mo', '3d'
    Returns:
        time span in days as a float
    """
    try:
        if t[-2:] == "yr":
            return float(t[:-2]) * 365.25  # 1 year ~ 365.25 days
        elif t[-2:] == "mo":
            return float(t[:-2]) * 30.5    # 1 month ~ 30.5 days
        elif t[-1:] == "d":
            return float(t[:-1])
        else:
            return float(t[:-2])
    except ValueError:
        return t
def convert_list_time_to_days(time):
""" Converts time spans in the form: ['1yr', '11mo', ...] to days.
Args:
time: list of times
Returns:
list of times in days
"""
time_in_days = []
for t in time:
time_in_days.append(convert_time_to_days(t))
return time_in_days
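# Example with hypothetical values: nomadlist shows durations such as '1yr',
# '11mo', or '3d'; the helpers above normalize them to day counts:
#   convert_list_time_to_days(['1yr', '11mo', '3d'])  ->  [365.25, 335.5, 3.0]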
def parse_trip_information(trip, user, table_row):
""" Gets relevant info from trip table row
Args:
trip: Specific trip to be parsed
user: User associated with the trip
table_row: csv string of rows to which a new trip info will be added
Returns:
One comma separated row corresponding to a single trip
"""
delimiter = ", "
table_row += user.strip("/@") + delimiter
city_name = trip.find("td", class_="name").h2.text.split(",")[0]
table_row += city_name + delimiter
table_row += trip.find("td", class_="country").text + delimiter
table_row += trip['data-date-start'] + delimiter
table_row += trip['data-date-end'] + delimiter
table_row += str(convert_time_to_days(trip.find("td", class_="trip_start").find_next('td').text)) + delimiter
table_row += trip['data-latitude'] + delimiter + trip['data-longitude'] + "\n"
return table_row
def create_dataset(n_users=3700):
    """ Creates the user-trip dataset by scraping user web pages from nomadlist.com.
    Dumps the output to a 'trips.csv' file.
    Args:
        n_users: Number of users to search for
    """
    page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
    soup = BeautifulSoup(page.content, 'html.parser')
    users = get_users(soup, n_users)
    print(f"Found {len(users)} users.")
    with open('trips.csv', 'w+', encoding="utf-8") as f:
        # the first chunk written includes the csv header line
        table_row = "user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n"
        for user in users:
            page = requests.get(f"{BASE_URL}{user}")
            soup = BeautifulSoup(page.content, 'html.parser')
            trips = soup.find_all("tr", class_="trip")
            print(f"Found {len(trips)} trips for {user}.")
            for trip in trips:
                table_row = parse_trip_information(trip, user, table_row)
            f.write(table_row)
            table_row = ""
if __name__ == "__main__":
create_dataset()
\ No newline at end of file
from __future__ import annotations
from typing import List, Tuple
from bs4 import BeautifulSoup
import requests
class ScraperBodyGet:
def __init__(self, soup: BeautifulSoup):
self.soup = soup
def get_attribute_value(self, attribute: str):
return self.soup[attribute]
def get_text(self):
return self.soup.get_text()
def also(self):
return ScraperBodyInit(soup=self.soup)
class ScraperBodyInit:
soup: BeautifulSoup
element_name: str = None
    element_attribute: Tuple[str, str] = None
def __init__(self, soup: BeautifulSoup):
self.soup = soup
def at_element(self, name_of_element: str) -> ScraperBodyInit:
self.element_name = name_of_element
return self
def with_attribute_having_value(self, attribute: str, value: str) -> ScraperBodyInit:
self.element_attribute = (attribute, value)
return self
    def with_attribute_having_values(self, attribute: str, values: List[str]) -> List[Tuple[str, ScraperBodyInit]]:
results = []
for value in values:
s = ScraperBodyInit(self.soup)
s.at_element(self.element_name)
s.with_attribute_having_value(attribute, value)
results.append((value, s))
return results
def find(self):
attrs = {self.element_attribute[0]: self.element_attribute[1]}
new_soup = self.soup.find(self.element_name, attrs)
return ScraperBodyGet(new_soup)
def reset(self) -> None:
self.element_name = None
self.element_attribute = None
class Scraper:
    @staticmethod
    def at(web: str) -> ScraperBodyInit:
        page = requests.get(web)
        if page.status_code == 200:
            return ScraperBodyInit(BeautifulSoup(page.content, "html.parser"))
        # non-200 responses return None; callers are expected to guard with try/except
        return None
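For context, a minimal usage sketch of this fluent API, mirroring how `NomadCityCrawler` drives it (the URL and attribute values are illustrative):

```
# Illustrative only: fetch one city page and read a single scraped score.
context = Scraper.at("https://nomadlist.com/lisbon")
element = context.with_attribute_having_value("data-key", "peace_score").find()
print(element.get_attribute_value("data-value"))
```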