diff --git a/src/crawler/nomad_city_crawler.py b/src/crawler/nomad_city_crawler.py index ed7d8065efed75a256fd0c411309e86c926ab280..9d8636ba9d369217cfe7ccb8b9ba34b31f314d68 100644 --- a/src/crawler/nomad_city_crawler.py +++ b/src/crawler/nomad_city_crawler.py @@ -85,12 +85,12 @@ class NomadCityCrawler: values["nomad_score"] = score if score else -1 text = "Mbps" - score = page.find().get_soup().find(text=re.compile('.*' + text + '.*')) + score = page.find().contains_text(text).get_text() #page.find().get_soup().find(text=re.compile('.*' + text + '.*')) score = NomadCityCrawler.get_number(score) values["internet_mbs"] = score if score else -1 text = " / mo" - score = page.find().get_soup().find(text=re.compile('.*' + text + '.*')) + score = page.find().contains_text(text).get_text() #page.find().get_soup().find(text=re.compile('.*' + text + '.*')) score = NomadCityCrawler.get_number(score) values["cost_dollars_per_month"] = score if score else -1 diff --git a/src/scraper/scraper.py b/src/scraper/scraper.py index 61a6aa4a4858bde54cd546544f7db8f9466039cc..0b5bd4d2a86dfcd8121d656afe2f338073e75f0a 100644 --- a/src/scraper/scraper.py +++ b/src/scraper/scraper.py @@ -3,7 +3,7 @@ from __future__ import annotations import re from typing import List, Dict -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString import requests @@ -63,24 +63,27 @@ class Element: self._soup = soup def with_id(self, id: str) -> Element: - return Element(self._soup.find(id=id)) + return Element(self._soup.find(id=id)) if self._soup is not None else None def with_attribute(self, attribute: str, value: str = None) -> Element: - return Element(self._soup.find(attrs={attribute: value})) + return Element(self._soup.find(attrs={attribute: value})) if self._soup is not None else None - def with_text(self, text: str): - return Element(self._soup.find(text=re.compile('.*' + text + '.*'))) - - def text_contains(self, text: str): - return Element(self._soup.select_one(':contains("' + text + '")')) + def contains_text(self, text: str): + return Element(self._soup.find(text=re.compile('.*' + text + '.*'))) if self._soup is not None else None def parent(self): - return Element(self._soup.parent) + return Element(self._soup.parent) if self._soup is not None else None def get_attribute_value(self, attribute: str): return self._soup[attribute] if self._soup is not None else None def get_text(self): + if self._soup is None: + return None + + if isinstance(self._soup, NavigableString): + return self._soup + return self._soup.get_text() def is_none(self):