from bs4 import BeautifulSoup
import requests
BASE_URL = "https://nomadlist.com/"
DEFAULT_USER = "@levelsio"
def get_users(soup, n=1000):
    """Gets users from the "crossed paths with" section on a user's profile.

    Args:
        soup: BeautifulSoup object of the loaded profile page
        n: number of users to collect

    Returns:
        List of user profile paths (e.g. "/@levelsio")
    """
    users = []
    user_index = 0
    while len(users) < n:
        overlaps = soup.find("div", {"id": "most-overlaps"})
        if overlaps:
            for a in overlaps.find_all("a", href=True):
                users.append(a["href"])
            users = list(dict.fromkeys(users))  # remove duplicates, preserve order
        if user_index >= len(users):
            break  # no unvisited profiles left to crawl
        page = requests.get(f"{BASE_URL}{users[user_index]}")
        user_index += 1
        soup = BeautifulSoup(page.content, "html.parser")
    return users
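
# Illustrative usage (network access assumed; the profile layout on
# nomadlist.com must still expose the "most-overlaps" div for this to work;
# the slugs shown are hypothetical):
#
#   seed_page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
#   seed_soup = BeautifulSoup(seed_page.content, "html.parser")
#   get_users(seed_soup, n=5)  # -> e.g. ["/@user-a", "/@user-b", ...]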
def get_most_visited_cities(soup):
    """Gets the cities a user has visited most, from the 'Most visited' section.

    Args:
        soup: BeautifulSoup object of the loaded profile page

    Returns:
        Dict mapping city slug to number of visits
    """
    trips_to_cities = soup.find("div", class_="most_trips_to_cities")
    city_visits = dict()
    if trips_to_cities:
        names = trips_to_cities.div.find_all("li")
        counts = trips_to_cities.div.find_all("span", class_="extra-people-counter")
        for city, counter in zip(names, counts):
            # counter text looks like e.g. "12x"; strip the "x" to get the number
            city_visits[city["data-slug"]] = int(counter.text.strip("x"))
    return city_visits
def get_most_time_spent(soup):
    """Gets the cities a user has spent the most time in, from the 'Most time spent' section.

    Args:
        soup: BeautifulSoup object of the loaded profile page

    Returns:
        Dict mapping city slug to time spent (e.g. "3mo")
    """
    most_visited_cities = soup.find_all("div", class_="most_visited_cities")
    city_time = dict()
    if len(most_visited_cities) > 1:
        # Two divs share this class name; the second one holds "most time spent"
        most_time_spent = most_visited_cities[1]
        names = most_time_spent.div.find_all("li")
        times = most_time_spent.div.find_all("span", class_="extra-people-counter")
        for city, time in zip(names, times):
            city_time[city["data-slug"]] = time.text
    return city_time
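
# Neither helper above is called by create_dataset(); a minimal sketch of
# using them on a single profile (output values are illustrative, not real
# data):
#
#   page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
#   soup = BeautifulSoup(page.content, "html.parser")
#   get_most_visited_cities(soup)  # -> e.g. {"bangkok": 12, "lisbon": 7}
#   get_most_time_spent(soup)      # -> e.g. {"bangkok": "3mo", "lisbon": "1yr"}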
def convert_time_to_days(t):
    """Converts time information in years and months to days.

    Args:
        t: string of time, e.g. "2yr", "3mo", or "12d"

    Returns:
        Time span in days as a float (the input unchanged if it can't be parsed)
    """
    try:
        if t[-2:] == "yr":
            return float(t[:-2]) * 365.25
        elif t[-2:] == "mo":
            return float(t[:-2]) * 30.5
        elif t[-1:] == "d":
            return float(t[:-1])
        else:
            return float(t[:-2])
    except ValueError:
        return t
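
# A few illustrative conversions, using the factors above
# (365.25 days per year, 30.5 days per month):
#
#   convert_time_to_days("2yr")  # -> 730.5
#   convert_time_to_days("3mo")  # -> 91.5
#   convert_time_to_days("12d")  # -> 12.0
#   convert_time_to_days("??")   # -> "??" (unparseable input is returned as-is)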
def convert_list_time_to_days(times):
    """Converts time spans in the form ['1yr', '11mo', ...] to days.

    Args:
        times: list of time strings

    Returns:
        List of times in days
    """
    return [convert_time_to_days(t) for t in times]
def parse_trip_information(trip, user, table_row):
    """Extracts the relevant fields from one trip table row.

    Args:
        trip: BeautifulSoup tag of the trip to parse
        user: user associated with the trip
        table_row: CSV string to which the new trip's fields are appended

    Returns:
        The CSV string with one comma-separated row added for this trip
    """
    delimiter = ", "
    table_row += user.strip("/@") + delimiter
    city_name = trip.find("td", class_="name").h2.text.split(",")[0]
    table_row += city_name + delimiter
    table_row += trip.find("td", class_="country").text + delimiter
    table_row += trip["data-date-start"] + delimiter
    table_row += trip["data-date-end"] + delimiter
    table_row += str(convert_time_to_days(trip.find("td", class_="trip_start").find_next("td").text)) + delimiter
    table_row += trip["data-latitude"] + delimiter + trip["data-longitude"] + "\n"
    return table_row
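
# The bare ", " delimiter above breaks if a country name itself contains a
# comma. A more robust sketch using the stdlib csv module; write_trip_row is
# a hypothetical alternative, not part of the original script:
import csv

def write_trip_row(writer, trip, user):
    """Sketch: write one trip as a properly quoted row via csv.writer."""
    writer.writerow([
        user.strip("/@"),
        trip.find("td", class_="name").h2.text.split(",")[0],
        trip.find("td", class_="country").text,
        trip["data-date-start"],
        trip["data-date-end"],
        convert_time_to_days(trip.find("td", class_="trip_start").find_next("td").text),
        trip["data-latitude"],
        trip["data-longitude"],
    ])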
def create_dataset(n_users=3700):
    """Creates the user-trip dataset by scraping user pages from nomadlist.com.

    Dumps the output to a 'trips.csv' file.

    Args:
        n_users: number of users to search for
    """
    page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
    soup = BeautifulSoup(page.content, "html.parser")
    users = get_users(soup, n_users)
    print(f"Found {len(users)} users.")
    with open("trips.csv", "w+", encoding="utf-8") as f:
        table_row = "user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n"
        for user in users:
            page = requests.get(f"{BASE_URL}{user}")
            soup = BeautifulSoup(page.content, "html.parser")
            trips = soup.find_all("tr", class_="trip")
            print(f"Found {len(trips)} trips for {user}.")
            for trip in trips:
                table_row = parse_trip_information(trip, user, table_row)
            f.write(table_row)
            table_row = ""
if __name__ == "__main__":
    create_dataset()