Skip to content
Snippets Groups Projects
Commit 3a360832 authored by Jiří Vrbka's avatar Jiří Vrbka Committed by Terézia Slanináková
Browse files

City knowledge base nomad

parent 54d7f129
No related branches found
No related tags found
No related merge requests found
.idea/
src/handler/__pycache__/
src/scraper/__pycache__/
This diff is collapsed.
from typing import List, Dict
from src.handler.csv_handler import CsvHandler
from src.handler.json_handler import JsonHandler
from src.scraper.scraper import Scraper
class NomadCityCrawler:
    """Scrapes per-city quality-of-life scores from nomadlist.com pages."""

    # HTML attribute that identifies a score element on a city page.
    attribute_to_find = "data-key"
    # Score categories extracted for every city.
    attribute_to_find_values = [
        "hospital_score",
        "english_speaking",
        "nightlife",
        "female_friendly",
        "racial_tolerance",
        "peace_score"]

    @staticmethod
    def get_cities_info(cities_to_browse: List[str]) -> dict:
        """
        Scrapes the score attributes for each of the given cities.

        :param cities_to_browse: city slugs as used in nomadlist.com URLs
        :return: dict of shape {"cities": {city: {score_key: score_value}}};
                 a city that fails to scrape keeps a partial or empty dict
        """
        cities_stats = {"cities": {}}
        for city in cities_to_browse:
            print("working on " + city)
            try:
                cities_stats["cities"][city] = {}
                context = Scraper.at(f"https://nomadlist.com/{city}")
                pairs = context.with_attribute_having_values(
                    NomadCityCrawler.attribute_to_find,
                    NomadCityCrawler.attribute_to_find_values)
                for key, element in pairs:
                    score = element.find().get_attribute_value("data-value")
                    cities_stats["cities"][city][key] = score
            except Exception as e:
                # Best-effort scraping: report the failure and continue with
                # the remaining cities instead of aborting the whole run.
                print("For city " + city)
                print(e)
        return cities_stats
def main():
    """Builds cities_stats.json from the cities listed in trips.csv."""
    cities = []
    for line in CsvHandler.load("./../../data/trips.csv")[1]:
        # Normalize the city name into a nomadlist URL slug.
        cities.append(line["city"].lower().replace(" ", "-"))
    # Deduplicate while preserving first-seen order.
    cities = list(dict.fromkeys(cities))
    cities_info = NomadCityCrawler.get_cities_info(cities)
    JsonHandler.save("./../../data/cities_stats.json", cities_info)


# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == "__main__":
    main()
\ No newline at end of file
import csv
from typing import List
class CsvHandler:
    """Loads and saves simple comma-separated files.

    NOTE: on load, the first column of each row is treated as an index
    column and is skipped when building the row dicts.
    """

    @staticmethod
    def __removeInitSpace__(word: str) -> str:
        """
        Removes a single space at index 0 if present.
        """
        if word.startswith(" "):
            return word[1:]
        return word

    @staticmethod
    def load(filepath: str) -> (List[str], List[dict]):
        """
        Loads csv file
        :param filepath: path to csv file
        :return: tuple of (list of header params, list of dict in format:
                 {header1: value1, header2: value2,...})
        """
        item_list = []
        with open(filepath, 'r') as csvFile:
            # rstrip("\n") fixes a bug: readline() leaves the trailing
            # newline on the last header name, which then ended up inside
            # the row-dict keys (e.g. "score\n" instead of "score").
            header = csvFile.readline().rstrip("\n").split(",")
            for i in range(1, len(header)):
                header[i] = CsvHandler.__removeInitSpace__(header[i])
            reader = csv.reader(csvFile)
            for row in reader:
                dic = {}
                # Column 0 is the index column and is intentionally skipped.
                for i in range(1, len(header)):
                    dic[header[i]] = CsvHandler.__removeInitSpace__(row[i])
                item_list.append(dic)
        return header, item_list

    @staticmethod
    def save(filepath: str, header: list, csv_as_list_dic: List[dict]) -> None:
        """
        Saves csv file into disk
        :param filepath: path to save csv file (with filename)
        :param header: header to save into file
        :param csv_as_list_dic: list of dic in format {header1: value1, header2: value2,...}
        :return: None
        """
        with open(filepath, 'w') as csvFile:
            # join() replaces the old build-then-trim string concatenation.
            csvFile.write(",".join(header) + "\n")
            for row in csv_as_list_dic:
                csvFile.write(",".join(row[h] for h in header) + "\n")
import json
class JsonHandler:
    """Loads and saves JSON files on disk."""

    @staticmethod
    def load(filepath: str) -> dict:
        """
        Loads json from disk
        :param filepath: path to json file (with filename)
        :return: json as dict
        """
        # json.load reads straight from the file object; the context
        # manager closes it (the old explicit close() was redundant).
        with open(filepath, 'r') as jsonFile:
            return json.load(jsonFile)

    @staticmethod
    def save(filepath: str, json_as_dict: dict) -> None:
        """
        Saves json into disk
        :param filepath: path to save json file (with filename)
        :param json_as_dict: dict
        :return: None
        """
        with open(filepath, "w+") as jsonFile:
            json.dump(json_as_dict, jsonFile, indent=4)
from __future__ import annotations
from typing import List, Dict
from bs4 import BeautifulSoup
import requests
class ScraperBodyGet:
    """Read-side view over a located element: extracts attributes or text."""

    def __init__(self, soup: BeautifulSoup):
        self.soup = soup

    def get_attribute_value(self, attribute: str):
        """Returns the value of *attribute* on the wrapped element."""
        return self.soup[attribute]

    def get_text(self):
        """Returns the text content of the wrapped element."""
        return self.soup.get_text()

    def also(self):
        """Switches back to query-building mode on the same element."""
        return ScraperBodyInit(self.soup)
class ScraperBodyInit:
    """Builder for an element query: selects tag name and attribute filter.

    Methods return ``self`` so calls can be chained fluently.
    """

    soup: BeautifulSoup
    # Tag name to search for; None means "any tag".
    element_name: str = None
    # (attribute, value) pair the element must carry; set before find().
    # Fixed annotation: the original "(str, str)" was a tuple literal,
    # not a type.
    element_attribute: tuple[str, str] = None

    def __init__(self, soup: BeautifulSoup):
        self.soup = soup

    def at_element(self, name_of_element: str) -> ScraperBodyInit:
        """Restricts the search to the given tag name."""
        self.element_name = name_of_element
        return self

    def with_attribute_having_value(self, attribute: str, value: str) -> ScraperBodyInit:
        """Requires the element to carry attribute == value."""
        self.element_attribute = (attribute, value)
        return self

    def with_attribute_having_values(self, attribute: str, values: List[str]) -> List[tuple[str, ScraperBodyInit]]:
        """
        Creates one independent query per value.

        :param attribute: attribute name shared by all queries
        :param values: one query is built for each value
        :return: list of (value, query) pairs; each query shares this
                 query's soup and element name
        """
        results = []
        for value in values:
            query = ScraperBodyInit(self.soup)
            query.at_element(self.element_name)
            query.with_attribute_having_value(attribute, value)
            results.append((value, query))
        return results

    def find(self):
        """Runs the query; returns a read-side view of the first match."""
        attrs = {self.element_attribute[0]: self.element_attribute[1]}
        new_soup = self.soup.find(self.element_name, attrs)
        return ScraperBodyGet(new_soup)

    def reset(self) -> None:
        """Clears the configured tag name and attribute filter."""
        self.element_name = None
        self.element_attribute = None
class Scraper:
    """Entry point: fetches a web page and starts an element query."""

    @staticmethod
    def at(web: str) -> ScraperBodyInit:
        """
        Downloads the page and wraps it for querying.

        :param web: full URL to fetch
        :return: query builder over the parsed page
        :raises RuntimeError: when the server does not answer HTTP 200.
            The original code silently returned None here, which surfaced
            later as a confusing AttributeError in the caller; callers
            already catch Exception, so raising keeps them working.
        """
        page = requests.get(web)
        if page.status_code != 200:
            raise RuntimeError(f"GET {web} returned HTTP {page.status_code}")
        return ScraperBodyInit(BeautifulSoup(page.content, "html.parser"))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment