# scraper.py - small wrappers around requests and BeautifulSoup for fetching and querying web pages.
from __future__ import annotations

import re

from bs4 import BeautifulSoup, NavigableString
import requests


class Element:
    """Wrapper around a BeautifulSoup node that tolerates a missing node.

    The wrapped object is whatever was handed to the constructor, including
    ``None``.  The lookup methods (``with_id``, ``with_attribute``,
    ``contains_text``, ``parent``) behave as follows:

    * if this element wraps ``None`` they return ``None`` (not an
      ``Element``);
    * otherwise they return a new ``Element`` wrapping the result of the
      lookup, which may itself be ``None`` -- check with ``is_none()``.

    NOTE(review): because an empty wrapper yields a bare ``None``, a chain of
    three or more lookups can still fail with ``AttributeError`` once one link
    finds nothing.  Kept as-is so existing ``is None`` checks keep working.
    """

    # Wrapped node; ``None`` when the lookup that produced this element
    # returned nothing.
    _soup: BeautifulSoup | None

    def __init__(self, soup: BeautifulSoup | None):
        self._soup = soup

    def with_id(self, id: str) -> Element | None:
        """Look up the first descendant whose ``id`` attribute equals *id*."""
        if self._soup is None:
            return None
        return Element(self._soup.find(id=id))

    def with_attribute(self, attribute: str, value: str = None) -> Element | None:
        """Look up the first descendant matching ``{attribute: value}``.

        NOTE(review): when *value* is left at its default, ``None`` is passed
        to BeautifulSoup as the required attribute value.  Confirm against the
        bs4 documentation whether that selects tags *having* the attribute or
        tags *lacking* it before relying on the default.
        """
        if self._soup is None:
            return None
        return Element(self._soup.find(attrs={attribute: value}))

    def contains_text(self, text: str) -> Element | None:
        """Look up the first string that contains *text* literally.

        *text* is escaped before being compiled, so characters that are
        special in regular expressions (``+``, ``(``, ``.`` ...) are matched
        as written instead of raising ``re.error`` or matching something else.
        """
        if self._soup is None:
            return None
        # BeautifulSoup applies a compiled pattern with ``search()``, so a
        # plain escaped pattern already means "contains"; no ``.*`` padding
        # is required.
        pattern = re.compile(re.escape(text))
        return Element(self._soup.find(string=pattern))

    def parent(self) -> Element | None:
        """Wrap the ``parent`` attribute of the wrapped node."""
        if self._soup is None:
            return None
        return Element(self._soup.parent)

    def get_attribute_value(self, attribute: str):
        """Return ``node[attribute]``, or ``None`` when no node is wrapped.

        The subscript is applied directly, so any exception raised by the
        wrapped object for an unknown attribute propagates to the caller.
        """
        if self._soup is None:
            return None
        return self._soup[attribute]

    def get_text(self):
        """Return the text of the wrapped node, or ``None`` when empty.

        A ``NavigableString`` is returned unchanged; for anything else the
        node's own ``get_text()`` is called.
        """
        if self._soup is None:
            return None
        if isinstance(self._soup, NavigableString):
            return self._soup
        return self._soup.get_text()

    def is_none(self) -> bool:
        """Return ``True`` when no node is wrapped."""
        return self._soup is None

    def get_soup(self) -> BeautifulSoup | None:
        """Return the wrapped object as-is (possibly ``None``)."""
        return self._soup


class Page:
    """Wrapper around an HTTP response that exposes its status and HTML.

    Only the ``status_code`` and ``content`` attributes of the wrapped object
    are read.
    """

    # Response being wrapped (``Scraper.get`` passes the result of
    # ``requests.get``).
    _page: requests.Response

    def __init__(self, page: requests.Response):
        self._page = page

    def status_code(self) -> int:
        """Return the ``status_code`` attribute of the wrapped response."""
        return self._page.status_code

    def find(self) -> Element:
        """Parse the response body and return it wrapped in an ``Element``.

        The body is parsed with the ``"html.parser"`` backend on every call;
        nothing is cached.
        """
        document = BeautifulSoup(self._page.content, "html.parser")
        return Element(document)


class Scraper:
    """Static entry points that download a page with ``requests.get``."""

    @staticmethod
    def at(
        web: str,
        timeout: float | tuple[float, float] | None = None,
    ) -> ScraperBodyInit | None:
        """Fetch *web* and wrap the parsed body when the status is 200.

        Args:
            web: URL passed to ``requests.get``.
            timeout: Forwarded to ``requests.get``.  The default ``None``
                keeps the previous behaviour of not setting a timeout.

        Returns:
            A ``ScraperBodyInit`` built from the parsed response body when the
            response status is 200, otherwise ``None``.
        """
        # NOTE(review): ``ScraperBodyInit`` is neither defined nor imported in
        # the visible part of this file -- confirm it exists, otherwise this
        # call raises ``NameError`` on every successful fetch.
        page = requests.get(web, timeout=timeout)
        if page.status_code != 200:
            return None
        return ScraperBodyInit(BeautifulSoup(page.content, "html.parser"))

    @staticmethod
    def get(
        web: str,
        timeout: float | tuple[float, float] | None = None,
    ) -> Page:
        """Fetch *web* and return the response wrapped in a ``Page``.

        Args:
            web: URL passed to ``requests.get``.
            timeout: Forwarded to ``requests.get``.  The default ``None``
                keeps the previous behaviour of not setting a timeout.

        The status code is not inspected here; use ``Page.status_code()``.
        """
        return Page(requests.get(web, timeout=timeout))