PV254-city-recommender · Commit fc662d8f
Authored 5 years ago by Terézia Slanináková

[scraping] added src for scraper

Parent: b00d38ad
Merge request: !2 ([scraping] added src for scraper)

Showing 1 changed file with 137 additions and 5 deletions:
src/scraper.py (+137, −5)
Deleted (leftover README text that previously filled src/scraper.py):

-# PV254-city-recommender
-Project for PV254 (Fall 2019). Recommends a city based on your travel history.
-## Dataset
-The dataset is scraped from [nomadlist.com](www.nomadlist.com)
-The dataset is found in `data/trips.csv` contains

Added (the new scraper module):

from bs4 import BeautifulSoup
import requests

BASE_URL = "https://nomadlist.com/"
DEFAULT_USER = "@levelsio"
def get_users(soup, n=1000):
    """
    Gets users from the "crossed paths with" section on a user's profile.
    Args:
        soup: BeautifulSoup object from loaded page
        n: number of users to get
    Returns:
        List of users
    """
    users = []
    user_index = 0
    while len(users) < n:
        # The "crossed paths with" box is rendered as <div id="most-overlaps">.
        o = soup.find("div", {"id": "most-overlaps"})
        if o:
            for a in o.find_all('a', href=True):
                users.append(a['href'])
            # removing duplicates while preserving order
            users = list(dict.fromkeys(users))
        if user_index >= len(users):
            # no (more) profiles to follow; return what was collected so far
            break
        # follow the next collected profile and harvest its overlaps too
        page = requests.get(f"{BASE_URL}{users[user_index]}")
        user_index += 1
        soup = BeautifulSoup(page.content, 'html.parser')
    return users
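# Usage sketch (hypothetical handles; assumes nomadlist.com still renders
# profile links as <a href="/@username"> inside the overlaps box):
#   page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
#   soup = BeautifulSoup(page.content, 'html.parser')
#   users = get_users(soup, n=50)  # -> ['/@alice', '/@bob', ...]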
def get_most_visited_cities(soup):
    """
    Gets the most visited cities by a user from the 'Most visited' section.
    Args:
        soup: BeautifulSoup object from loaded page
    Returns:
        Dict of city: number of visits
    """
    trips_to_cities = soup.find("div", class_="most_trips_to_cities")
    trips_to_cities_name = trips_to_cities.div.find_all("li")
    trips_to_cities_count = trips_to_cities.div.find_all("span", class_="extra-people-counter")
    city_visits = dict()
    for city, counter in zip(trips_to_cities_name, trips_to_cities_count):
        # counter labels look like "x12"; strip the "x" and keep the number
        city_visits[city["data-slug"]] = counter.text.strip("x")
    return city_visits
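# Return shape (slugs and counts are hypothetical examples):
#   get_most_visited_cities(soup) -> {'bangkok': '12', 'berlin': '7'}
# Values stay strings exactly as scraped.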
def get_most_time_spent(soup):
    """
    Gets the most time spent in cities by a user from the 'Most time spent' section.
    Args:
        soup: BeautifulSoup object from loaded page
    Returns:
        Dict of city: time spent
    """
    most_visited_cities = soup.find_all("div", class_="most_visited_cities")
    city_time = dict()
    if len(most_visited_cities) > 1:
        # [1] because two divs share this class name; the second one is 'Most time spent'
        most_time_spent = most_visited_cities[1]
        most_time_spent_name = most_time_spent.div.find_all("li")
        most_time_spent_count = most_time_spent.div.find_all("span", class_="extra-people-counter")
        for city, time in zip(most_time_spent_name, most_time_spent_count):
            city_time[city["data-slug"]] = time.text
    return city_time
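# Return shape (hypothetical): {'lisbon': '1yr', 'prague': '11mo'}.
# Values are raw label text; convert_list_time_to_days() below turns them into days.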
def convert_time_to_days(t):
    """
    Converts time information in years, months, or days to days.
    Args:
        t: string of time, e.g. '2yr', '11mo', '12d'
    Returns:
        time span in days (float), or the input unchanged if it cannot be parsed
    """
    try:
        if t[-2:] == "yr":
            return float(t[:-2]) * 365.25
        elif t[-2:] == "mo":
            return float(t[:-2]) * 30.5
        elif t[-1:] == "d":
            return float(t[:-1])
        else:
            # no recognised suffix; assume a two-character unit and take the number
            return float(t[:-2])
    except ValueError:
        # not a time span at all; hand it back untouched
        return t
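# Sanity checks (doctest-style; based on the 365.25 days/yr and 30.5 days/mo
# approximations above):
#   convert_time_to_days('2yr')  -> 730.5
#   convert_time_to_days('11mo') -> 335.5
#   convert_time_to_days('12d')  -> 12.0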
def convert_list_time_to_days(time):
    """
    Converts time spans in the form: ['1yr', '11mo', ...] to days.
    Args:
        time: list of times
    Returns:
        list of times in days
    """
    time_in_days = []
    for t in time:
        time_in_days.append(convert_time_to_days(t))
    return time_in_days
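# Example with the values from the docstring:
#   convert_list_time_to_days(['1yr', '11mo']) -> [365.25, 335.5]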
def parse_trip_information(trip, user, table_row):
    """
    Gets relevant info from a trip table row.
    Args:
        trip: Specific trip to be parsed
        user: User associated with the trip
        table_row: csv string of rows to which a new trip info will be added
    Returns:
        One comma separated row corresponding to a single trip
    """
    delimiter = ","
    # user hrefs look like '/@username'; strip the leading '/@'
    table_row += user.strip("/@") + delimiter
    # the name cell's <h2> text is split on ',' to keep only the city part
    city_name = trip.find("td", class_="name").h2.text.split(",")[0]
    table_row += city_name + delimiter
    table_row += trip.find("td", class_="country").text + delimiter
    table_row += trip['data-date-start'] + delimiter
    table_row += trip['data-date-end'] + delimiter
    # the cell after 'trip_start' holds the human-readable duration; normalise to days
    table_row += str(convert_time_to_days(trip.find("td", class_="trip_start").find_next('td').text)) + delimiter
    table_row += trip['data-latitude'] + delimiter + trip['data-longitude'] + "\n"
    return table_row
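# One emitted row might look like (all values hypothetical):
#   levelsio,Bangkok,Thailand,2019-01-01,2019-02-01,30.5,13.7563,100.5018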
def create_dataset(n_users=3700):
    """
    Creates the user-trip dataset by scraping user web pages from nomadlist.com.
    Dumps the output to 'trips.csv' file.
    Args:
        n_users: Number of users to search for
    """
    page = requests.get(f"{BASE_URL}{DEFAULT_USER}")
    soup = BeautifulSoup(page.content, 'html.parser')
    users = get_users(soup, n_users)
    print(f"Found {len(users)} users.")
    with open('trips.csv', 'w+', encoding="utf-8") as f:
        # the header string goes out with the first trip row; later rows are written one by one
        table_row = "user, city, country, trip_start, trip_end, trip_duration, latitude, longitude\n"
        for user in users:
            page = requests.get(f"{BASE_URL}{user}")
            soup = BeautifulSoup(page.content, 'html.parser')
            trips = soup.find_all("tr", class_="trip")
            print(f"Found {len(trips)} trips for {user}.")
            for trip in trips:
                table_row = parse_trip_information(trip, user, table_row)
                f.write(table_row)
                table_row = ""
if __name__ == "__main__":
    create_dataset()
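# Run as a script (writes trips.csv to the current working directory):
#   python src/scraper.py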