Commit acf2a79a authored by Jirka

Final corrections

parent bbec51a0
1 merge request: !5 City knowledge base nomad
import re
from typing import List, Dict

import requests
from bs4 import BeautifulSoup

from src.handler.csv_handler import CsvHandler
from src.handler.json_handler import JsonHandler
from src.scraper.scraper import Scraper, Given

base_url = "https://nomadlist.com/"


class NomadCityCrawler:
    attribute_to_find = "data-key"
    # Metric names used as keys in the scraped results and as the CSV header.
    attribute_to_find_values = [
        "hospital_availability",
        "nightlife",
        "female_friendly",
        "racial_tolerance",
        "walkability",
        "quality_of_life",
        "places_to_work_from",
        "friendliness_to_foreigners",
        "freedom_of_speech",
        "startup_score",
        "traffic_safety",
        "ac_availability",
        "lgbt_friendly",
        "fun",
        "wifi_availability",
        "peace",
        "nomad_score",
        "internet_mbs",
        "cost_dollars_per_month"]

    @staticmethod
    def get_cities_info(cities_to_browse: List[str]) -> dict:
        # Older variant: looks metrics up by raw data-key name, so it only
        # works when attribute_to_find_values holds those raw keys
        # (see data_value_stats in get_city_statistics for the mapping).
        cities_stats = {}
        cities_stats["cities"] = {}
        for city in cities_to_browse:
            print("working on " + city)
            try:
                cities_stats["cities"][city] = {}
                context = Scraper.at(base_url + city)
                for key, element in context.with_attribute_having_values(
                        NomadCityCrawler.attribute_to_find, NomadCityCrawler.attribute_to_find_values):
                    try:
                        score = element.find().get_attribute_value("data-value")
                        cities_stats["cities"][city].update({key: score})
                    except TypeError:
                        cities_stats["cities"][city].update({key: "-1"})
            except Exception as e:
                print("For city " + city)
                print(e)
        return cities_stats

    @staticmethod
    def get_number(string: str) -> int:
        # Concatenate every digit in the string into one integer; 0 when none.
        digits = "".join(c for c in string if c.isdigit()) if string else ""
        return int(digits) if digits else 0
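    # For example, get_number("25Mbps") == 25 and get_number("$1,700 / mo") == 1700:
    # digits are concatenated, so thousands separators simply vanish.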

    @staticmethod
    def get_cities_info2(cities_to_browse: List[str]) -> dict:
        # Experimental variant, hardwired to a single city.
        cities_stats = {}
        cities_stats["cities"] = {}
        city = "cody"
        cities_stats["cities"][city] = {}
        page = Given().url(base_url + city).get_page()
        if page.status_code() == 200:
            for attr in NomadCityCrawler.attribute_to_find_values:
                try:
                    score = page.find().with_attribute("data-key", attr).get_attribute_value("data-value")
                    cities_stats["cities"][city].update({attr: score})
                    print(attr + ": " + score)
                except TypeError:
                    cities_stats["cities"][city].update({attr: "-1"})
                    print(attr + ": -1")
        return cities_stats

    @staticmethod
    def get_city_statistics(city: str) -> Dict[str, str]:
        full_url = base_url + city
        try:
            page = Scraper.get(full_url)
        except Exception as e:
            print("Web error for " + full_url)
            print(e)
            return {}
        values = {}
        if page.status_code() != 200:
            print("Unable to locate page for " + full_url)
            return values
        print("Working on " + full_url)
        # (raw data-key used on the page, name used in the output dict)
        data_value_stats = [
            ("hospital_score", "hospital_availability"),
            ("nightlife", "nightlife"),
            ("female_friendly", "female_friendly"),
            ("racial_tolerance", "racial_tolerance"),
            ("walkScore_score", "walkability"),
            ("life_score", "quality_of_life"),
            ("places_to_work_score", "places_to_work_from"),
            ("friendliness_to_foreigners", "friendliness_to_foreigners"),
            ("press_freedom_index_score", "freedom_of_speech"),
            ("startup_score", "startup_score"),
            ("road_traffic_score", "traffic_safety"),
            ("ac_availability", "ac_availability"),
            ("lgbt_friendly", "lgbt_friendly"),
            ("leisure_quality", "fun"),
            ("wifi_availability", "wifi_availability"),
            ("peace_score", "peace")]
        for key, mapping in data_value_stats:
            score = page.find().with_attribute("data-key", key).get_attribute_value("data-value")
            values[mapping] = score if score is not None else -1
        # The overall nomad score is exposed as schema.org microdata.
        element = page.find().with_attribute("itemprop", "ratingValue")
        values["nomad_score"] = element.get_text() if not element.is_none() else -1
        # Internet speed and monthly cost only appear as free text on the page.
        text = "Mbps"
        score = page.find().get_soup().find(text=re.compile('.*' + text + '.*'))
        score = NomadCityCrawler.get_number(score)
        values["internet_mbs"] = score if score else -1
        text = " / mo"
        score = page.find().get_soup().find(text=re.compile('.*' + text + '.*'))
        score = NomadCityCrawler.get_number(score)
        values["cost_dollars_per_month"] = score if score else -1
        return values
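    # Hypothetical result for illustration only:
    # get_city_statistics("prague") -> {"hospital_availability": "4", ...,
    # "nomad_score": "4.5", "internet_mbs": 25, "cost_dollars_per_month": 1700}
    # with -1 standing in for any metric missing from the page.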

    @staticmethod
    def get_cities_info3(cities_to_browse: List[str]) -> dict:
        # Older bulk variant: like get_cities_info, it reads raw data-key
        # names directly from attribute_to_find_values.
        cities_stats = {}
        cities_stats["cities"] = {}
        for city in cities_to_browse:
            try:
                print("working on " + city)
                page = Scraper.get(base_url + city)
                cities_stats["cities"][city] = {}
                if page.status_code() == 200:
                    for attr in NomadCityCrawler.attribute_to_find_values:
                        try:
                            score = page.find().with_attribute("data-key", attr).get_attribute_value("data-value")
                            cities_stats["cities"][city].update({attr: score})
                            print(attr + ": " + score)
                        except TypeError:
                            cities_stats["cities"][city].update({attr: "-1"})
                            print(attr + ": -1")
                else:
                    print("Page not found for: " + city)
            except Exception as ex:
                print("Exception for: " + city)
                print(ex)
        return cities_stats

    @staticmethod
    def get_cities_statistics(cities: List[str]) -> dict:
        # Preferred entry point: delegates per-city work to get_city_statistics().
        cities_result = {}
        cities_result["cities"] = {}
        for city in cities:
            try:
                result = NomadCityCrawler.get_city_statistics(city)
                if result:
                    cities_result["cities"][city] = result
            except Exception as e:
                print("Web error for " + city)
                print(e)
        return cities_result


def main():
    cities = []
    for line in CsvHandler.load("./../../data/trips.csv")[1]:
        cities.append(line["city"].lower().replace(" ", "-"))
    cities = list(dict.fromkeys(cities))
    print(cities)
    cities_info = NomadCityCrawler.get_cities_statistics(cities)
    JsonHandler.save("./../../data/cities_stats_full.json", cities_info)
    cities = JsonHandler.load("./../../data/cities_stats_full.json")
    CsvHandler.saveFromDict("./../../data/cities_stats_full.csv", "city", NomadCityCrawler.attribute_to_find_values,
                            cities["cities"], "-1")
    # CsvHandler.save("./../../data/cities_stats_full.csv", cities_info)


main()
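# Assumed shape of cities_stats_full.json after a run (names and values illustrative):
# {"cities": {"prague": {"hospital_availability": "4", ..., "cost_dollars_per_month": 1700}}}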
@@ -85,7 +85,7 @@ class CsvHandler:
         line = key + ","
         for headerItem in header:
             if headerItem in csv_as_dic[key]:
-                line = line + csv_as_dic[key][headerItem] + ","
+                line = line + str(csv_as_dic[key][headerItem]) + ","
             else:
                 line = line + default_value + ","
         line = line[0:len(line) - 1]
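# Note on the change above: get_city_statistics() stores int -1 defaults next to
# string scores, and concatenating an int into `line` would raise TypeError, so
# the value is passed through str() first.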
from __future__ import annotations

import re
from typing import List, Dict

import requests
from bs4 import BeautifulSoup
@@ -67,12 +68,27 @@ class Element:
     def with_attribute(self, attribute: str, value: str = None) -> Element:
         return Element(self._soup.find(attrs={attribute: value}))
 
+    def with_text(self, text: str):
+        return Element(self._soup.find(text=re.compile('.*' + text + '.*')))
+
+    def text_contains(self, text: str):
+        return Element(self._soup.select_one(':contains("' + text + '")'))
+
+    def parent(self):
+        return Element(self._soup.parent)
+
     def get_attribute_value(self, attribute: str):
-        return self._soup[attribute]
+        return self._soup[attribute] if self._soup is not None else None
 
     def get_text(self):
         return self._soup.get_text()
 
+    def is_none(self):
+        return self._soup is None
+
     def get_soup(self):
         return self._soup
 
 
 class Page:
     _page: requests.api
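
# A minimal usage sketch of the fluent API above, assuming Scraper.get() returns
# a Page whose find() yields an Element rooted at the page soup (URL and data-key
# below are illustrative):
#
#   page = Scraper.get("https://nomadlist.com/prague")
#   if page.status_code() == 200:
#       element = page.find().with_attribute("data-key", "nightlife")
#       score = element.get_attribute_value("data-value")  # None when missing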