From 81f34ff44f6957277444952b986afa080e9036ef Mon Sep 17 00:00:00 2001
From: Jirka <vrbka.jirka@gmail.com>
Date: Sat, 12 Oct 2019 00:23:39 +0200
Subject: [PATCH] crawler for city knowledge base via nomadlist

---
 src/crawler/nomad_city_crawler.py | 50 +++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/src/crawler/nomad_city_crawler.py b/src/crawler/nomad_city_crawler.py
index e69de29..3e8ad85 100644
--- a/src/crawler/nomad_city_crawler.py
+++ b/src/crawler/nomad_city_crawler.py
@@ -0,0 +1,50 @@
+from typing import List, Dict
+
+from src.handler.csv_handler import CsvHandler
+from src.handler.json_handler import JsonHandler
+from src.scraper.scraper import Scraper
+
+
+class NomadCityCrawler:
+    """Collect the ``data-value`` of selected elements from nomadlist.com pages."""
+    # Attribute name and attribute values passed to with_attribute_having_values().
+    attribute_to_find = "data-key"
+    attribute_to_find_values = [
+        "hospital_score", "english_speaking", "nightlife",
+        "female_friendly", "racial_tolerance", "peace_score"]
+
+    @staticmethod
+    def get_cities_info(cities_to_browse: List[str]) -> dict:
+        """Return ``{"cities": {city: {key: value}}}`` for the given city names.
+
+        Each name is appended to ``https://nomadlist.com/``. An exception for one
+        city is printed and skipped; that city keeps what was stored before it.
+        """
+        per_city = {}
+        for name in cities_to_browse:
+            print("working on " + name)
+            try:
+                found = per_city[name] = {}
+                page = Scraper.at("https://nomadlist.com/" + name)
+                matches = page.with_attribute_having_values(
+                    NomadCityCrawler.attribute_to_find,
+                    NomadCityCrawler.attribute_to_find_values)
+                for key, element in matches:
+                    found[key] = element.find().get_attribute_value("data-value")
+            except Exception as e:
+                print("For city " + name)
+                print(e)
+        return {"cities": per_city}
+
+
+def main():
+    """Crawl each distinct city named in trips.csv and save the stats as JSON."""
+    rows = CsvHandler.load("./../../data/trips.csv")[1]  # presumably the data rows; confirm
+    # dict.fromkeys drops repeated names while keeping first-seen order.
+    cities = list(dict.fromkeys(r["city"].lower().replace(" ", "-") for r in rows))
+    cities_info = NomadCityCrawler.get_cities_info(cities)
+    JsonHandler.save("./../../data/cities_stats2.json", cities_info)
+
+
+if __name__ == "__main__":  # run the crawl only as a script, never on import
+    main()
\ No newline at end of file
-- 
GitLab