Skip to content
Snippets Groups Projects
Commit fc662d8f authored by Terézia Slanináková's avatar Terézia Slanináková
Browse files

[scraping] added src for scraper

parent b00d38ad
No related branches found
No related tags found
1 merge request!2[scraping] added src for scraper
# PV254-city-recommender
from bs4 import BeautifulSoup
import requests
Project for PV254 (Fall 2019). Recommends a city based on your travel history.
# Prefix prepended to every profile path requested by the scraper.
BASE_URL = "https://nomadlist.com/"
# Profile fetched first; its "crossed paths with" links seed the user crawl.
DEFAULT_USER = "@levelsio"
## Dataset
def get_users(soup, n=1000):
    """Gets users from "crossed paths with" section on a user's profile.

    Starts from the given page, collects the links found in its
    ``most-overlaps`` div, then visits the collected profiles one by one
    (in discovery order) to collect more, until at least ``n`` unique users
    are known or there is no unvisited profile left.

    Args:
        soup: BeautifulSoup object from loaded page
        n: number of users to collect before the crawl stops
    Returns:
        List of unique user hrefs in discovery order. It can hold more than
        ``n`` entries (all links of the last visited page are kept) or fewer
        when the crawl runs out of profiles to visit.
    """
    users = []
    user_index = 0
    while len(users) < n:
        overlaps = soup.find("div", {"id": "most-overlaps"})
        if overlaps:
            users.extend(a['href'] for a in overlaps.find_all('a', href=True))
            # dict.fromkeys drops duplicates while keeping first-seen order.
            users = list(dict.fromkeys(users))
        # Stop before fetching anything else: either the target is reached
        # (no point in one more request) or every known profile has already
        # been visited (indexing past the end would raise IndexError).
        if len(users) >= n or user_index >= len(users):
            break
        page = requests.get(f"{BASE_URL}{users[user_index]}")
        user_index += 1
        soup = BeautifulSoup(page.content, 'html.parser')
    return users
The dataset is scraped from [nomadlist.com](https://nomadlist.com).
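The dataset is stored in `data/trips.csv` and contains one row per trip with the columns: user, city, country, trip_start, trip_end, trip_duration, latitude, longitude.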
\ No newline at end of file
def get_most_visited_cities(soup):
    """Gets the most visited cities by a user from 'Most visited section'.

    Args:
        soup: BeautifulSoup object from loaded page
    Returns:
        Dict of city: number of visits. The city key is the ``data-slug``
        attribute of each list item and the value is the counter text with
        any leading/trailing "x" removed (still a string). An empty dict is
        returned when the page has no such section.
    """
    trips_to_cities = soup.find("div", class_="most_trips_to_cities")
    # A page without the section (or with an empty one) yields no data
    # instead of failing with AttributeError on None.
    if trips_to_cities is None or trips_to_cities.div is None:
        return {}
    container = trips_to_cities.div
    names = container.find_all("li")
    counters = container.find_all("span", class_="extra-people-counter")
    # Names and counters are paired by position.
    return {
        city["data-slug"]: counter.text.strip("x")
        for city, counter in zip(names, counters)
    }
def get_most_time_spent(soup):
    """Reads the 'Most time spent' section of a user's profile page.

    Args:
        soup: BeautifulSoup object from loaded page
    Returns:
        Dict mapping each city's ``data-slug`` to the time-spent text shown
        next to it; empty when the page has fewer than two
        ``most_visited_cities`` divs.
    """
    sections = soup.find_all("div", class_="most_visited_cities")
    if len(sections) <= 1:
        return dict()
    # Two divs share this class name; the second one holds the time spent.
    section = sections[1]
    names = section.div.find_all("li")
    durations = section.div.find_all("span", class_="extra-people-counter")
    return {
        item["data-slug"]: span.text
        for item, span in zip(names, durations)
    }
def convert_time_to_days(t):
    """Converts time information in years, months or days to days.

    Recognised suffixes are "yr" (365.25 days each), "mo" (30.5 days each)
    and "d". Any other input has its last two characters dropped and the
    remainder parsed as a number of days.

    Args:
        t: string of time, e.g. "1yr", "11mo", "5d"
    Returns:
        time span in days as a float, or ``t`` unchanged when its numeric
        part cannot be parsed
    """
    try:
        if t[-2:] == "yr":
            # A year is 365.25 days; it must not also be scaled by the
            # month length.
            return float(t[:-2]) * 365.25
        if t[-2:] == "mo":
            return float(t[:-2]) * 30.5
        if t[-1:] == "d":
            return float(t[:-1])
        return float(t[:-2])
    except ValueError:
        # Unparseable values are passed through so the caller can still
        # record them.
        return t
def convert_list_time_to_days(time):
    """Applies convert_time_to_days to every entry of a list of time spans.

    Args:
        time: list of times, e.g. ['1yr', '11mo', ...]
    Returns:
        new list with each entry converted, in the same order
    """
    return [convert_time_to_days(span) for span in time]
def parse_trip_information(trip, user, table_row):
    """Appends one trip's fields to a csv string.

    Args:
        trip: table row element of the trip to be parsed
        user: user href associated with the trip; leading/trailing "/" and
            "@" characters are stripped
        table_row: csv text collected so far
    Returns:
        ``table_row`` followed by one ", "-separated line holding user,
        city, country, start date, end date, duration in days, latitude
        and longitude, terminated by a newline
    """
    fields = [
        user.strip("/@"),
        # Only the part of the heading before the first comma is the city.
        trip.find("td", class_="name").h2.text.split(",")[0],
        trip.find("td", class_="country").text,
        trip['data-date-start'],
        trip['data-date-end'],
        # The duration is read from the cell following the trip_start cell.
        str(convert_time_to_days(
            trip.find("td", class_="trip_start").find_next('td').text
        )),
        trip['data-latitude'],
        trip['data-longitude'],
    ]
    return table_row + ", ".join(fields) + "\n"
def create_dataset(n_users=3700):
    """Creates the user-trip dataset by scraping user web pages from nomadlist.com.

    Fetches the default user's profile, collects users via get_users, then
    requests every collected profile and writes one csv line per trip found
    on it. Dumps the output to 'trips.csv' file in the working directory;
    the header line is always written, even when no trip is found.

    Args:
        n_users: Number of users to search for
    """
    page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
    soup = BeautifulSoup(page.content, 'html.parser')
    users = get_users(soup, n_users)
    print(f"Found {len(users)} users.")
    header = "user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n"
    # The context manager closes the file even if a request or parse fails
    # part-way through the crawl.
    with open('trips.csv', 'w+', encoding="utf-8") as f:
        # Written up front so the file has a header even with zero trips.
        f.write(header)
        for user in users:
            page = requests.get(f"{BASE_URL}{user}")
            soup = BeautifulSoup(page.content, 'html.parser')
            trips = soup.find_all("tr", class_="trip")
            print(f"Found {len(trips)} trips for {user}.")
            for trip in trips:
                f.write(parse_trip_information(trip, user, ""))
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    create_dataset()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment