Skip to content
Snippets Groups Projects
Commit 3a360832 authored by Jiří Vrbka's avatar Jiří Vrbka Committed by Terézia Slanináková
Browse files

City knowledge base nomad

parent 54d7f129
No related branches found
No related tags found
No related merge requests found
.idea/
src/handler/__pycache__/
src/scraper/__pycache__/
This diff is collapsed.
from typing import List, Dict
from src.handler.csv_handler import CsvHandler
from src.handler.json_handler import JsonHandler
from src.scraper.scraper import Scraper
class NomadCityCrawler:
    """Scrapes per-city quality-of-life scores from nomadlist.com pages."""

    # HTML attribute that identifies a score element on a city page.
    attribute_to_find = "data-key"
    # Score categories extracted for every city.
    attribute_to_find_values = [
        "hospital_score",
        "english_speaking",
        "nightlife",
        "female_friendly",
        "racial_tolerance",
        "peace_score"]

    @staticmethod
    def get_cities_info(cities_to_browse: List[str]) -> dict:
        """
        Scrapes the score attributes for each of the given cities.

        :param cities_to_browse: city slugs as used in nomadlist.com URLs
        :return: dict of shape {"cities": {city: {score_key: score_value}}};
                 a city that fails to scrape keeps a partial or empty dict
        """
        cities_stats = {"cities": {}}
        for city in cities_to_browse:
            print("working on " + city)
            try:
                cities_stats["cities"][city] = {}
                context = Scraper.at(f"https://nomadlist.com/{city}")
                pairs = context.with_attribute_having_values(
                    NomadCityCrawler.attribute_to_find,
                    NomadCityCrawler.attribute_to_find_values)
                for key, element in pairs:
                    score = element.find().get_attribute_value("data-value")
                    cities_stats["cities"][city][key] = score
            except Exception as e:
                # Best-effort scraping: report the failure and continue with
                # the remaining cities instead of aborting the whole run.
                print("For city " + city)
                print(e)
        return cities_stats
def main():
    """Builds cities_stats.json from the cities listed in trips.csv."""
    cities = []
    for line in CsvHandler.load("./../../data/trips.csv")[1]:
        # Normalize the city name into a nomadlist URL slug.
        cities.append(line["city"].lower().replace(" ", "-"))
    # Deduplicate while preserving first-seen order.
    cities = list(dict.fromkeys(cities))
    cities_info = NomadCityCrawler.get_cities_info(cities)
    JsonHandler.save("./../../data/cities_stats.json", cities_info)


# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == "__main__":
    main()
\ No newline at end of file
import csv
from typing import List
class CsvHandler:
    """Loads and saves simple comma-separated files.

    NOTE: on load, the first column of each row is treated as an index
    column and is skipped when building the row dicts.
    """

    @staticmethod
    def __removeInitSpace__(word: str) -> str:
        """
        Removes a single space at index 0 if present.
        """
        if word.startswith(" "):
            return word[1:]
        return word

    @staticmethod
    def load(filepath: str) -> (List[str], List[dict]):
        """
        Loads csv file
        :param filepath: path to csv file
        :return: tuple of (list of header params, list of dict in format:
                 {header1: value1, header2: value2,...})
        """
        item_list = []
        with open(filepath, 'r') as csvFile:
            # rstrip("\n") fixes a bug: readline() leaves the trailing
            # newline on the last header name, which then ended up inside
            # the row-dict keys (e.g. "score\n" instead of "score").
            header = csvFile.readline().rstrip("\n").split(",")
            for i in range(1, len(header)):
                header[i] = CsvHandler.__removeInitSpace__(header[i])
            reader = csv.reader(csvFile)
            for row in reader:
                dic = {}
                # Column 0 is the index column and is intentionally skipped.
                for i in range(1, len(header)):
                    dic[header[i]] = CsvHandler.__removeInitSpace__(row[i])
                item_list.append(dic)
        return header, item_list

    @staticmethod
    def save(filepath: str, header: list, csv_as_list_dic: List[dict]) -> None:
        """
        Saves csv file into disk
        :param filepath: path to save csv file (with filename)
        :param header: header to save into file
        :param csv_as_list_dic: list of dic in format {header1: value1, header2: value2,...}
        :return: None
        """
        with open(filepath, 'w') as csvFile:
            # join() replaces the old build-then-trim string concatenation.
            csvFile.write(",".join(header) + "\n")
            for row in csv_as_list_dic:
                csvFile.write(",".join(row[h] for h in header) + "\n")
import json
class JsonHandler:
    """Loads and saves JSON files on disk."""

    @staticmethod
    def load(filepath: str) -> dict:
        """
        Loads json from disk
        :param filepath: path to json file (with filename)
        :return: json as dict
        """
        # json.load reads straight from the file object; the context
        # manager closes it (the old explicit close() was redundant).
        with open(filepath, 'r') as jsonFile:
            return json.load(jsonFile)

    @staticmethod
    def save(filepath: str, json_as_dict: dict) -> None:
        """
        Saves json into disk
        :param filepath: path to save json file (with filename)
        :param json_as_dict: dict
        :return: None
        """
        with open(filepath, "w+") as jsonFile:
            json.dump(json_as_dict, jsonFile, indent=4)
from __future__ import annotations
from typing import List, Dict
from bs4 import BeautifulSoup
import requests
class ScraperBodyGet:
    """Read-side view over a located element: extracts attributes or text."""

    def __init__(self, soup: BeautifulSoup):
        self.soup = soup

    def get_attribute_value(self, attribute: str):
        """Returns the value of *attribute* on the wrapped element."""
        return self.soup[attribute]

    def get_text(self):
        """Returns the text content of the wrapped element."""
        return self.soup.get_text()

    def also(self):
        """Switches back to query-building mode on the same element."""
        return ScraperBodyInit(self.soup)
class ScraperBodyInit:
    """Builder for an element query: selects tag name and attribute filter.

    Methods return ``self`` so calls can be chained fluently.
    """

    soup: BeautifulSoup
    # Tag name to search for; None means "any tag".
    element_name: str = None
    # (attribute, value) pair the element must carry; set before find().
    # Fixed annotation: the original "(str, str)" was a tuple literal,
    # not a type.
    element_attribute: tuple[str, str] = None

    def __init__(self, soup: BeautifulSoup):
        self.soup = soup

    def at_element(self, name_of_element: str) -> ScraperBodyInit:
        """Restricts the search to the given tag name."""
        self.element_name = name_of_element
        return self

    def with_attribute_having_value(self, attribute: str, value: str) -> ScraperBodyInit:
        """Requires the element to carry attribute == value."""
        self.element_attribute = (attribute, value)
        return self

    def with_attribute_having_values(self, attribute: str, values: List[str]) -> List[tuple[str, ScraperBodyInit]]:
        """
        Creates one independent query per value.

        :param attribute: attribute name shared by all queries
        :param values: one query is built for each value
        :return: list of (value, query) pairs; each query shares this
                 query's soup and element name
        """
        results = []
        for value in values:
            query = ScraperBodyInit(self.soup)
            query.at_element(self.element_name)
            query.with_attribute_having_value(attribute, value)
            results.append((value, query))
        return results

    def find(self):
        """Runs the query; returns a read-side view of the first match."""
        attrs = {self.element_attribute[0]: self.element_attribute[1]}
        new_soup = self.soup.find(self.element_name, attrs)
        return ScraperBodyGet(new_soup)

    def reset(self) -> None:
        """Clears the configured tag name and attribute filter."""
        self.element_name = None
        self.element_attribute = None
class Scraper:
    """Entry point: fetches a web page and starts an element query."""

    @staticmethod
    def at(web: str) -> ScraperBodyInit:
        """
        Downloads the page and wraps it for querying.

        :param web: full URL to fetch
        :return: query builder over the parsed page
        :raises RuntimeError: when the server does not answer HTTP 200.
            The original code silently returned None here, which surfaced
            later as a confusing AttributeError in the caller; callers
            already catch Exception, so raising keeps them working.
        """
        page = requests.get(web)
        if page.status_code != 200:
            raise RuntimeError(f"GET {web} returned HTTP {page.status_code}")
        return ScraperBodyInit(BeautifulSoup(page.content, "html.parser"))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment