"""Scrape user trip data from nomadlist.com profile pages into 'trips.csv'."""

from bs4 import BeautifulSoup
import requests

BASE_URL = "https://nomadlist.com/"
DEFAULT_USER = "@levelsio"

# Seconds to wait for a response; without a timeout requests.get can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _fetch_soup(path):
    """Downloads BASE_URL + path and returns the parsed page.

    Args:
        path: profile path appended verbatim to BASE_URL

    Returns:
        BeautifulSoup object built from the response body

    Raises:
        requests.RequestException: on connection failure or timeout
    """
    page = requests.get(f"{BASE_URL}{path}", timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(page.content, "html.parser")


def _slug_counter_pairs(section):
    """Pairs each <li> 'data-slug' with the text of its counter <span>.

    Args:
        section: container tag (or None) whose first child div holds the list

    Returns:
        List of (slug, counter_text) tuples; empty if the section is missing
    """
    if section is None or section.div is None:
        return []
    cities = section.div.find_all("li")
    counters = section.div.find_all("span", class_="extra-people-counter")
    return [(city["data-slug"], counter.text)
            for city, counter in zip(cities, counters)]


def get_users(soup, n=1000):
    """Gets users from "crossed paths with" section on a user's profile.

    Starting from the given page, collects the profile links found in the
    'most-overlaps' div, then visits the collected profiles in order and
    repeats until n users are known or there are no more profiles to visit.

    Args:
        soup: BeautifulSoup object from loaded page
        n: number of users to get

    Returns:
        List of at most n unique user hrefs, in discovery order

    Raises:
        requests.RequestException: if a profile page cannot be downloaded
    """
    users = []
    seen = set()
    user_index = 0  # next already-discovered user whose page will be crawled

    while len(users) < n:
        overlaps = soup.find("div", {"id": "most-overlaps"})
        if overlaps:
            for anchor in overlaps.find_all("a", href=True):
                href = anchor["href"]
                if href not in seen:  # keep first occurrence only
                    seen.add(href)
                    users.append(href)

        # Stop before fetching when we already have enough users, or when
        # every discovered profile has been visited (nothing left to crawl).
        if len(users) >= n or user_index >= len(users):
            break

        soup = _fetch_soup(users[user_index])
        user_index += 1

    # The last page may have pushed the list past n.
    return users[:n]


def get_most_visited_cities(soup):
    """Gets the most visited cities by a user from 'Most visited section'.

    Args:
        soup: BeautifulSoup object from loaded page

    Returns:
        Dict of city slug: number of visits (as a string with any leading
        or trailing 'x' removed); empty dict if the section is not present
    """
    section = soup.find("div", class_="most_trips_to_cities")
    return {slug: count.strip("x")
            for slug, count in _slug_counter_pairs(section)}


def get_most_time_spent(soup):
    """Gets the most time spent in cities by a user from 'Most time spent section'.

    Args:
        soup: BeautifulSoup object from loaded page

    Returns:
        Dict of city slug: time spent (raw counter text); empty dict if the
        section is not present
    """
    sections = soup.find_all("div", class_="most_visited_cities")
    if len(sections) < 2:
        return {}
    # Two divs share this class name; the second one is the time-spent list.
    return dict(_slug_counter_pairs(sections[1]))


def convert_time_to_days(t):
    """Converts time information in years, months or days to days.

    Recognised suffixes are 'yr' (365.25 days), 'mo' (30.5 days) and 'd'.
    Surrounding whitespace is ignored.

    Args:
        t: string of time, e.g. '1yr', '11mo', '5d'

    Returns:
        Time span in days as a float, or t unchanged if its numeric part
        cannot be parsed
    """
    text = t.strip()
    try:
        if text.endswith("yr"):
            return float(text[:-2]) * 365.25
        if text.endswith("mo"):
            return float(text[:-2]) * 30.5
        if text.endswith("d"):
            return float(text[:-1])
        # NOTE(review): unknown suffixes are treated as two characters wide
        # and the remainder is returned as-is; confirm which unit this
        # fallback is meant to cover.
        return float(text[:-2])
    except ValueError:
        return t


def convert_list_time_to_days(time):
    """Converts time spans in the form: ['1yr', '11mo', ...] to days.

    Args:
        time: list of times

    Returns:
        List of times in days (see convert_time_to_days)
    """
    return [convert_time_to_days(t) for t in time]


def parse_trip_information(trip, user, table_row):
    """Gets relevant info from trip table row.

    Args:
        trip: Specific trip (table row tag) to be parsed
        user: User associated with the trip
        table_row: csv string of rows to which a new trip info will be added

    Returns:
        table_row with one more ", "-separated line appended, holding:
        user, city, country, trip start, trip end, duration, latitude,
        longitude
    """
    delimiter = ", "
    # Only the part of the heading before the first comma is the city name.
    city_name = trip.find("td", class_="name").h2.text.split(",")[0]
    # The duration is in the cell that follows the 'trip_start' cell.
    duration_text = trip.find("td", class_="trip_start").find_next("td").text

    fields = [
        user.strip("/@"),
        city_name,
        trip.find("td", class_="country").text,
        trip["data-date-start"],
        trip["data-date-end"],
        str(convert_time_to_days(duration_text)),
        trip["data-latitude"],
        trip["data-longitude"],
    ]
    return table_row + delimiter.join(fields) + "\n"


def create_dataset(n_users=3700):
    """Creates the user-trip dataset by scraping user web pages from nomadlist.com.

    Dumps the output to 'trips.csv' file.

    Args:
        n_users: Number of users to search for

    Raises:
        requests.RequestException: if a page cannot be downloaded
    """
    soup = _fetch_soup(DEFAULT_USER)
    users = get_users(soup, n_users)
    print(f"Found {len(users)} users.")

    header = "user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n"
    # The context manager closes the file even if a request or parse fails.
    with open("trips.csv", "w", encoding="utf-8") as f:
        f.write(header)
        for user in users:
            soup = _fetch_soup(user)
            trips = soup.find_all("tr", class_="trip")
            print(f"Found {len(trips)} trips for {user}.")
            for trip in trips:
                f.write(parse_trip_information(trip, user, ""))


if __name__ == "__main__":
    create_dataset()