diff --git a/src/scraper.py b/src/scraper.py
index 775c5b40a4ea1588df1229891665e700260ad899..f8aeee79be016ed1327025f7ba016e847c0df147 100644
--- a/src/scraper.py
+++ b/src/scraper.py
@@ -1,8 +1,138 @@
-# PV254-city-recommender
+from bs4 import BeautifulSoup
+import requests
 
-Project for PV254 (Fall 2019). Recommends a city based on your travel history.
+BASE_URL = "https://nomadlist.com/"
+DEFAULT_USER = "@levelsio"
 
-## Dataset
+def get_users(soup, n=1000):
+    """Gets users from the "crossed paths with" section on a user's profile.
+    Args:
+        soup: BeautifulSoup object of the loaded profile page
+        n: number of users to get
+    Returns:
+        List of user profile paths
+    """
+    users = []
+    user_index = 0
+    while len(users) < n:
+        overlaps = soup.find("div", {"id": "most-overlaps"})
+        if overlaps:
+            for a in overlaps.find_all('a', href=True):
+                users.append(a['href'])
+            users = list(dict.fromkeys(users))  # remove duplicates, keep order
+        if user_index >= len(users):
+            break  # no unvisited profiles left to crawl
+        page = requests.get(f"{BASE_URL}{users[user_index]}")
+        user_index += 1
+        soup = BeautifulSoup(page.content, 'html.parser')
+    return users[:n]
 
-The dataset is scraped from [nomadlist.com](www.nomadlist.com)
-The dataset is found in `data/trips.csv` contains
\ No newline at end of file
+def get_most_visited_cities(soup):
+    """Gets the cities most visited by a user from the 'Most visited' section.
+    Args:
+        soup: BeautifulSoup object of the loaded profile page
+    Returns:
+        Dict of city: number of visits
+    """
+    trips_to_cities = soup.find("div", class_="most_trips_to_cities")
+    city_visits = dict()
+    if trips_to_cities is None:
+        return city_visits
+    names = trips_to_cities.div.find_all("li")
+    counts = trips_to_cities.div.find_all("span", class_="extra-people-counter")
+    for city, counter in zip(names, counts):
+        city_visits[city["data-slug"]] = int(counter.text.strip("x"))
+    return city_visits
+
+def get_most_time_spent(soup):
+    """Gets the cities a user spent the most time in from the 'Most time spent' section.
+    Args:
+        soup: BeautifulSoup object of the loaded profile page
+    Returns:
+        Dict of city: time spent
+    """
+    most_visited_cities = soup.find_all("div", class_="most_visited_cities")
+    city_time = dict()
+    if len(most_visited_cities) > 1:
+        # Two divs share this class name; the second one holds time spent.
+        most_time_spent = most_visited_cities[1]
+        names = most_time_spent.div.find_all("li")
+        counts = most_time_spent.div.find_all("span", class_="extra-people-counter")
+        for city, time in zip(names, counts):
+            city_time[city["data-slug"]] = time.text
+    return city_time
+
+def convert_time_to_days(t):
+    """Converts a time span given in years, months, or days to days.
+    Args:
+        t: time string such as '1yr', '11mo', or '3d'
+    Returns:
+        Time span in days as a float
+    """
+    try:
+        if t[-2:] == "yr":
+            return float(t[:-2]) * 365.25
+        elif t[-2:] == "mo":
+            return float(t[:-2]) * 30.5
+        elif t[-1:] == "d":
+            return float(t[:-1])
+        else:
+            return float(t[:-2])
+    except ValueError:
+        return t  # unparseable values are passed through unchanged
+
+def convert_list_time_to_days(time):
+    """Converts time spans in the form ['1yr', '11mo', ...] to days.
+    Args:
+        time: list of time strings
+    Returns:
+        List of time spans in days
+    """
+    return [convert_time_to_days(t) for t in time]
+
+def parse_trip_information(trip, user, table_row):
+    """Gets the relevant info from a trip table row.
+    Args:
+        trip: specific trip to be parsed
+        user: user associated with the trip
+        table_row: csv string of rows to which the new trip info is appended
+    Returns:
+        The csv string with one comma-separated row per trip appended
+    """
+    delimiter = ", "
+    table_row += user.strip("/@") + delimiter
+    city_name = trip.find("td", class_="name").h2.text.split(",")[0]
+    table_row += city_name + delimiter
+    table_row += trip.find("td", class_="country").text + delimiter
+    table_row += trip['data-date-start'] + delimiter
+    table_row += trip['data-date-end'] + delimiter
+    duration = trip.find("td", class_="trip_start").find_next('td').text
+    table_row += str(convert_time_to_days(duration)) + delimiter
+    table_row += trip['data-latitude'] + delimiter + trip['data-longitude'] + "\n"
+    return table_row
+
+def create_dataset(n_users=3700):
+    """Creates the user-trip dataset by scraping user pages from nomadlist.com.
+    Dumps the output to a 'trips.csv' file.
+    Args:
+        n_users: number of users to search for
+    """
+    page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
+    soup = BeautifulSoup(page.content, 'html.parser')
+    users = get_users(soup, n_users)
+    print(f"Found {len(users)} users.")
+
+    with open('trips.csv', 'w', encoding="utf-8") as f:
+        f.write("user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n")
+        for user in users:
+            page = requests.get(f"{BASE_URL}{user}")
+            soup = BeautifulSoup(page.content, 'html.parser')
+            trips = soup.find_all("tr", class_="trip")
+            print(f"Found {len(trips)} trips for {user}.")
+            table_row = ""
+            for trip in trips:
+                table_row = parse_trip_information(trip, user, table_row)
+            f.write(table_row)
+
+if __name__ == "__main__":
+    create_dataset()
\ No newline at end of file
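
A quick sanity check of the time-conversion helpers above. This is a minimal sketch, assuming the module is importable as `scraper` (e.g. when run from src/); the expected values follow the 365.25 days-per-year and 30.5 days-per-month conventions used in convert_time_to_days.

from scraper import convert_time_to_days, convert_list_time_to_days

# Year and month strings parse to days using the conventions above;
# plain day counts like '3d' pass through as floats.
assert convert_time_to_days("1yr") == 365.25
assert convert_time_to_days("11mo") == 11 * 30.5
assert convert_time_to_days("3d") == 3.0
assert convert_list_time_to_days(["1yr", "11mo", "3d"]) == [365.25, 335.5, 3.0]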
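
And a usage sketch for the scraper end to end. The invocation is hypothetical: n_users=50 is chosen only to keep the run short, and the stdlib csv module's skipinitialspace flag matches the ", " delimiter written by parse_trip_information.

import csv
import scraper

# Network-bound: fetches profile pages from nomadlist.com, then writes
# trips.csv into the current working directory.
scraper.create_dataset(n_users=50)

with open("trips.csv", newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f, skipinitialspace=True)
    for row in list(reader)[:5]:
        print(row["user"], row["city"], row["trip_duration"])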