# scraper.py - fluent helpers for fetching a page with requests and querying it with BeautifulSoup.
from __future__ import annotations

import re
from typing import Dict, List, Optional, Tuple

import requests
from bs4 import BeautifulSoup, NavigableString


class ScraperBodyGet:
    """Accessor for the node produced by a ``ScraperBodyInit.find`` search.

    The wrapped ``soup`` may be ``None`` when the search matched nothing; in
    that case the getters return ``None`` instead of failing on the missing
    node.
    """

    def __init__(self, soup: BeautifulSoup):
        self.soup = soup

    def get_attribute_value(self, attribute: str):
        """Return ``soup[attribute]``, or ``None`` when no node is wrapped.

        The lookup is a plain subscript on the node, so any error the node
        raises for an unknown attribute propagates to the caller.
        """
        if self.soup is None:
            return None
        return self.soup[attribute]

    def get_text(self):
        """Return the node's ``get_text()``, or ``None`` when no node is wrapped."""
        if self.soup is None:
            return None
        return self.soup.get_text()

    def also(self):
        """Start a new ``ScraperBodyInit`` search rooted at the wrapped node."""
        return ScraperBodyInit(soup=self.soup)


class ScraperBodyInit:
    """Fluent builder describing which element to look up in ``soup``.

    Configure the search with ``at_element`` and, optionally,
    ``with_attribute_having_value``; then call ``find``.
    """

    soup: BeautifulSoup
    # Element name handed to ``soup.find``; ``None`` until ``at_element`` runs.
    element_name: Optional[str] = None
    # ``(attribute, value)`` filter; ``None`` until one is configured.
    element_attribute: Optional[Tuple[str, str]] = None

    def __init__(self, soup: BeautifulSoup):
        self.soup = soup

    def at_element(self, name_of_element: str) -> ScraperBodyInit:
        """Set the element name to search for and return ``self`` for chaining."""
        self.element_name = name_of_element
        return self

    def with_attribute_having_value(self, attribute: str, value: str) -> ScraperBodyInit:
        """Set the ``(attribute, value)`` filter and return ``self`` for chaining."""
        self.element_attribute = (attribute, value)
        return self

    def with_attribute_having_values(
        self, attribute: str, values: List[str]
    ) -> List[Tuple[str, ScraperBodyInit]]:
        """Build one independent search per entry of ``values``.

        Each returned builder shares this builder's ``soup`` and element
        name; this builder's own attribute filter is left untouched.

        Returns:
            A list of ``(value, builder)`` pairs in the order of ``values``.
        """
        return [
            (
                value,
                ScraperBodyInit(self.soup)
                .at_element(self.element_name)
                .with_attribute_having_value(attribute, value),
            )
            for value in values
        ]

    def find(self):
        """Run the configured search and wrap the outcome in ``ScraperBodyGet``.

        When no attribute filter has been configured, an empty ``attrs``
        dict is passed to ``soup.find`` rather than failing on the unset
        filter.
        """
        if self.element_attribute is None:
            attrs = {}
        else:
            attribute, value = self.element_attribute
            attrs = {attribute: value}
        match = self.soup.find(self.element_name, attrs)
        return ScraperBodyGet(match)

    def reset(self) -> None:
        """Clear the element name and the attribute filter."""
        self.element_name = None
        self.element_attribute = None


class Element:
    """Chainable wrapper around a parsed node that tolerates missing nodes.

    A look-up that finds nothing (or that starts from an empty wrapper)
    yields an ``Element`` wrapping ``None``, so chains can always be
    continued and inspected with ``is_none()``.
    """

    _soup: BeautifulSoup

    def __init__(self, soup: BeautifulSoup):
        self._soup = soup

    def with_id(self, id: str) -> Element:
        """Return the result of ``find(id=id)`` below this node."""
        if self._soup is None:
            return Element(None)
        return Element(self._soup.find(id=id))

    def with_attribute(self, attribute: str, value: str = None) -> Element:
        """Return the result of ``find(attrs={attribute: value})`` below this node.

        ``value`` is passed through to the underlying ``find`` unchanged,
        including the default ``None``.
        """
        if self._soup is None:
            return Element(None)
        return Element(self._soup.find(attrs={attribute: value}))

    def contains_text(self, text: str) -> Element:
        """Return the first string below this node matched by ``text``.

        ``text`` is treated literally: regex metacharacters are escaped
        before the pattern is handed to ``find(string=...)``.
        """
        if self._soup is None:
            return Element(None)
        pattern = re.compile(re.escape(text))
        return Element(self._soup.find(string=pattern))

    def parent(self) -> Element:
        """Return the wrapped node's ``parent`` as an ``Element``."""
        if self._soup is None:
            return Element(None)
        return Element(self._soup.parent)

    def get_attribute_value(self, attribute: str):
        """Return ``node[attribute]``, or ``None`` when no node is wrapped.

        The lookup is a plain subscript on the node, so any error the node
        raises for an unknown attribute propagates to the caller.
        """
        if self._soup is None:
            return None
        return self._soup[attribute]

    def get_text(self):
        """Return the node's text, or ``None`` when no node is wrapped.

        A wrapped ``NavigableString`` is returned as-is; any other node is
        asked for its ``get_text()``.
        """
        if self._soup is None:
            return None

        if isinstance(self._soup, NavigableString):
            return self._soup

        return self._soup.get_text()

    def is_none(self):
        """Return ``True`` when this wrapper holds no node."""
        return self._soup is None

    def get_soup(self):
        """Return the wrapped node (possibly ``None``)."""
        return self._soup


class Page:
    """Wrapper around a fetched HTTP response.

    ``page`` is the object returned by ``requests.get``; only its
    ``status_code`` and ``content`` attributes are read here.
    """

    _page: requests.Response

    def __init__(self, page: requests.Response):
        self._page = page

    def status_code(self) -> int:
        """Return the response's ``status_code``."""
        return self._page.status_code

    def find(self) -> Element:
        """Parse the response body and return the document as an ``Element``.

        The body is parsed with ``"html.parser"`` on every call; the parsed
        tree is not cached.
        """
        return Element(BeautifulSoup(self._page.content, "html.parser"))


class Then:
    """Final step of the Given/When/Then chain: read results off the located node.

    ``soup`` may be ``None`` (nothing was located); the node getters then
    return ``None``.
    """

    _soup: BeautifulSoup
    _status_code: int

    def __init__(self, soup: BeautifulSoup, status_code: int):
        self._status_code = status_code
        self._soup = soup

    def get_status_code(self):
        """Return the status code this step was created with."""
        return self._status_code

    def get_attribute(self, attribute: str):
        """Return ``soup[attribute]``, or ``None`` when there is no node."""
        if self._soup is not None:
            return self._soup[attribute]
        return None

    def get_text(self):
        """Return the node's ``get_text()``, or ``None`` when there is no node."""
        if self._soup is not None:
            return self._soup.get_text()
        return None


class When:
    """Middle step of the Given/When/Then chain: narrow down to one node.

    Each selector replaces the current node with the result of a ``find``
    below it. Once a selector has found nothing the current node is
    ``None``; further selectors are then skipped so the miss reaches
    ``Then`` instead of raising here.
    """

    _soup: BeautifulSoup
    _status_code: int

    def __init__(self, soup: BeautifulSoup, status_code: int):
        self._soup = soup
        self._status_code = status_code

    def id(self, id: str) -> When:
        """Narrow to the result of ``find(id=id)`` and return ``self``."""
        if self._soup is not None:
            self._soup = self._soup.find(id=id)
        return self

    def attribute(self, attribute: str, value: str = None) -> When:
        """Narrow to the result of ``find(attrs={attribute: value})`` and return ``self``.

        ``value`` is passed through to the underlying ``find`` unchanged,
        including the default ``None``.
        """
        if self._soup is not None:
            self._soup = self._soup.find(attrs={attribute: value})
        return self

    def Then(self):
        """Hand the current node and the status code over to a ``Then`` step."""
        return Then(self._soup, self._status_code)


class Given:
    """First step of the Given/When/Then chain: fetch a URL.

    The request is issued in the constructor via ``requests.get(url=url)``;
    no timeout or other options are passed.
    """

    _request: requests.Response

    def __init__(self, url: str):
        self._request = requests.get(url=url)

    def status_code(self) -> int:
        """Return the ``status_code`` of the fetched response."""
        return self._request.status_code

    def When(self):
        """Parse the response body and start a ``When`` step on it.

        The body is parsed with ``"html.parser"``. The response's status
        code is forwarded because ``When`` requires it and later exposes it
        through ``Then``.
        """
        soup = BeautifulSoup(self._request.content, "html.parser")
        return When(soup, self._request.status_code)


class Scraper:
    """Entry points that fetch a URL with ``requests.get``."""

    @staticmethod
    def at(web: str) -> Optional[ScraperBodyInit]:
        """Fetch ``web`` and return a search builder over the parsed body.

        Returns:
            A ``ScraperBodyInit`` over the body parsed with
            ``"html.parser"`` when the response status is 200; ``None`` for
            any other status, so callers must check the result before
            chaining.
        """
        page = requests.get(web)
        if page.status_code != 200:
            return None
        return ScraperBodyInit(BeautifulSoup(page.content, "html.parser"))

    @staticmethod
    def get(web: str) -> Page:
        """Fetch ``web`` and return the response wrapped in a ``Page``.

        The response is wrapped whatever its status; use
        ``Page.status_code()`` to inspect it.
        """
        return Page(requests.get(web))