from bs4 import BeautifulSoup
import requests
BASE_URL = "https://nomadlist.com/"
DEFAULT_USER = "@levelsio"

def get_users(soup, n=1000):
  """Collects users from the "crossed paths with" section of a user's profile,
  following each collected user's profile in turn until n users are found.

  Args:
    soup: BeautifulSoup object of a loaded profile page
    n: number of users to collect

  Returns:
    List of user profile paths
  """
  users = []
  user_index = 0
  while len(users) < n:
    overlaps = soup.find("div", {"id": "most-overlaps"})
    if overlaps:
      for a in overlaps.find_all('a', href=True):
        users.append(a['href'])
      users = list(dict.fromkeys(users))  # remove duplicates, preserving order
    if user_index >= len(users):  # no more profiles left to crawl
      break
    page = requests.get(f"{BASE_URL}{users[user_index]}")
    user_index += 1
    soup = BeautifulSoup(page.content, 'html.parser')
  return users
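
# A quick usage sketch (the "/@username" href format is an assumption about
# the site's markup, inferred from how the paths are reused below):
#   soup = BeautifulSoup(requests.get(f"{BASE_URL}{DEFAULT_USER}").content, 'html.parser')
#   get_users(soup, n=10)  # e.g. ['/@alice', '/@bob', ...]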

def get_most_visited_cities(soup):
  """Gets the cities most visited by a user from the 'Most visited' section.

  Args:
    soup: BeautifulSoup object of a loaded profile page

  Returns:
    Dict mapping city slug to number of visits
  """
  trips_to_cities = soup.find("div", class_="most_trips_to_cities")
  city_visits = dict()
  if trips_to_cities is None:  # section missing from this profile
    return city_visits
  city_names = trips_to_cities.div.find_all("li")
  visit_counts = trips_to_cities.div.find_all("span", class_="extra-people-counter")
  for city, counter in zip(city_names, visit_counts):
    city_visits[city["data-slug"]] = int(counter.text.strip("x"))  # e.g. "3x" -> 3
  return city_visits
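
# Example return shape (city slugs and counts are illustrative, not real data):
#   get_most_visited_cities(soup)  # e.g. {'lisbon': 3, 'bangkok': 5}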

def get_most_time_spent(soup):
  """Gets the cities a user spent the most time in, from the 'Most time spent' section.

  Args:
    soup: BeautifulSoup object of a loaded profile page

  Returns:
    Dict mapping city slug to time spent (e.g. '2mo')
  """
  most_visited_cities = soup.find_all("div", class_="most_visited_cities")
  city_time = dict()
  if len(most_visited_cities) > 1:
    # two divs share this class name; the second one is 'Most time spent'
    most_time_spent = most_visited_cities[1]
    city_names = most_time_spent.div.find_all("li")
    time_counts = most_time_spent.div.find_all("span", class_="extra-people-counter")
    for city, time in zip(city_names, time_counts):
      city_time[city["data-slug"]] = time.text
  return city_time
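
# Example return shape (slugs and time spans are illustrative, not real data):
#   get_most_time_spent(soup)  # e.g. {'lisbon': '2mo', 'bangkok': '1yr'}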

def convert_time_to_days(t):
  """Converts a time span given in years, months, or days to days.

  Args:
    t: time span string, e.g. '1yr', '11mo', or '20d'

  Returns:
    Time span in days as a float, or the original string if it cannot be parsed
  """
  try:
    if t[-2:] == "yr":
      return float(t[:-2]) * 365.25
    elif t[-2:] == "mo":
      return float(t[:-2]) * 30.5
    elif t[-1:] == "d":
      return float(t[:-1])
    else:
      return float(t[:-2])
  except ValueError:
    return t  # unparseable spans are returned unchanged
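
# Worked examples (values follow directly from the branches above):
#   convert_time_to_days('2yr')  -> 730.5  (2 * 365.25)
#   convert_time_to_days('11mo') -> 335.5  (11 * 30.5)
#   convert_time_to_days('20d')  -> 20.0
#   convert_time_to_days('n/a')  -> 'n/a'  (unparseable input is passed through)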

def convert_list_time_to_days(time):
  """Converts time spans of the form ['1yr', '11mo', ...] to days.

  Args:
    time: list of time span strings

  Returns:
    List of time spans in days
  """
  return [convert_time_to_days(t) for t in time]
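
# Example:
#   convert_list_time_to_days(['1yr', '11mo', '20d'])  -> [365.25, 335.5, 20.0]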

def parse_trip_information(trip, user, table_row):
  """Extracts the relevant fields from one trip table row.

  Args:
    trip: BeautifulSoup tag of the trip row to be parsed
    user: User associated with the trip
    table_row: CSV string to which the new trip row is appended

  Returns:
    The input string with one comma-separated trip row appended
  """
  delimiter = ", "
  table_row += user.strip("/@") + delimiter
  city_name = trip.find("td", class_="name").h2.text.split(",")[0]
  table_row += city_name + delimiter
  table_row += trip.find("td", class_="country").text + delimiter
  table_row += trip['data-date-start'] + delimiter
  table_row += trip['data-date-end'] + delimiter
  # the cell after "trip_start" holds the trip duration, e.g. "2mo"
  table_row += str(convert_time_to_days(trip.find("td", class_="trip_start").find_next('td').text)) + delimiter
  table_row += trip['data-latitude'] + delimiter + trip['data-longitude'] + "\n"
  return table_row
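
# A sketch of one appended row (all field values here are illustrative):
#   parse_trip_information(trip, '/@alice', '')
#   -> 'alice, Lisbon, Portugal, 2019-01-01, 2019-02-01, 31.0, 38.72, -9.14\n'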

def create_dataset(n_users=3700):
  """Creates the user-trip dataset by scraping user profile pages from nomadlist.com.
  Dumps the output to a 'trips.csv' file.

  Args:
    n_users: Number of users to search for
  """
  page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
  soup = BeautifulSoup(page.content, 'html.parser')
  users = get_users(soup, n_users)
  print(f"Found {len(users)} users.")

  with open('trips.csv', 'w', encoding="utf-8") as f:
    f.write("user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n")

    for user in users:
      page = requests.get(f"{BASE_URL}{user}")
      soup = BeautifulSoup(page.content, 'html.parser')
      trips = soup.find_all("tr", class_="trip")
      print(f"Found {len(trips)} trips for {user}.")

      for trip in trips:
        f.write(parse_trip_information(trip, user, ""))

if __name__ == "__main__":
  create_dataset()