"""The Injection Middleware needs a standard way to build the Page Inputs dependencies
that the Page Objects use to get external data (e.g. the HTML). That's why we
have created a collection of Page Object Input Providers.
The current module implements a Page Input Provider for
:class:`web_poet.HttpResponse <web_poet.page_inputs.http.HttpResponse>`, which
is in charge of providing the response HTML from Scrapy. You could also implement
different providers in order to acquire data from multiple external sources,
for example, from scrapy-playwright or from scrapy-zyte-api.
"""
from typing import Any, Callable, ClassVar, Set
from scrapy import Request
from scrapy.crawler import Crawler
from scrapy.http import Response
from scrapy.utils.defer import maybe_deferred_to_future
from web_poet import (
HttpClient,
HttpRequest,
HttpRequestHeaders,
HttpResponse,
HttpResponseHeaders,
PageParams,
RequestUrl,
ResponseUrl,
Stats,
)
from web_poet.page_inputs.stats import StatCollector, StatNum
from scrapy_poet.downloader import _create_scrapy_downloader
from scrapy_poet.injection_errors import MalformedProvidedClassesError
[docs]
class PageObjectInputProvider:
"""
This is the base class for creating Page Object Input Providers.
A Page Object Input Provider (POIP) takes responsibility for providing
instances of some types to Scrapy callbacks. The types a POIP provides must
be declared in the class attribute ``provided_classes``.
POIPs are initialized when the spider starts by invoking the ``__init__``
method, which receives the ``scrapy_poet.injection.Injector`` instance as
argument.
The ``__call__`` method must be overridden, and it is inside this method
where the actual instances must be build. The default ``__call__``
signature is as follows:
.. code-block:: python
def __call__(self, to_provide: Set[Callable]) -> Sequence[Any]: ...
Therefore, it receives a list of types to be provided and return a list
with the instances created (don't get confused by the ``Callable``
annotation. Think on it as a synonym of ``Type``).
Additional dependencies can be declared in the ``__call__`` signature that
will be automatically injected. Currently, scrapy-poet is able to inject
instances of the following classes:
- :class:`~scrapy.http.Request`
- :class:`~scrapy.http.Response`
- :class:`~scrapy.crawler.Crawler`
- :class:`~scrapy.settings.Settings`
- :class:`~scrapy.statscollectors.StatsCollector`
Finally, ``__call__`` function can execute asynchronous code. Just prepend
the declaration with ``async``.
The available POIPs should be declared in the spider setting using the key
``SCRAPY_POET_PROVIDERS``. It must be a dictionary that follows same
structure than the :ref:`Scrapy Middlewares
<scrapy:topics-downloader-middleware-ref>` configuration dictionaries.
A simple example of a provider:
.. code-block:: python
class BodyHtml(str):
pass
class BodyHtmlProvider(PageObjectInputProvider):
provided_classes = {BodyHtml}
def __call__(self, to_provide, response: Response):
return [BodyHtml(response.css("html body").get())]
The **provided_classes** class attribute is the ``set`` of classes that
this provider provides. Alternatively, it can be a function with type
``Callable[[Callable], bool]`` that returns ``True`` if and only if the
given type, which must be callable, is provided by this provider.
"""
provided_classes: set[Callable] | Callable[[Callable], bool]
name: ClassVar[str] = "" # It must be a unique name. Used by the cache mechanism
[docs]
def is_provided(self, type_: Callable) -> bool:
"""
Return ``True`` if the given type is provided by this provider based
on the value of the attribute ``provided_classes``
"""
if isinstance(self.provided_classes, (set, frozenset)):
return type_ in self.provided_classes
if callable(self.provided_classes):
return self.provided_classes(type_)
raise MalformedProvidedClassesError(
f"Unexpected type {type_!r} for 'provided_classes' attribute of"
f"{self!r}. Expected either 'set' or 'callable'"
)
# FIXME: Can't import the Injector as class annotation due to circular dep.
[docs]
def __init__(self, injector):
"""Initializes the provider. Invoked only at spider start up."""
self.injector = injector
# Remember that is expected for all children to implement the ``__call__``
# method. The simplest signature for it is:
#
# def __call__(self, to_provide: Set[Callable]) -> Sequence[Any]:
#
# But some adding some other injectable attributes are possible
# (see the class docstring)
#
# The technical reason why this method was not declared abstract is that
# injection breaks the method overriding rules and mypy then complains.
[docs]
class HttpRequestProvider(PageObjectInputProvider):
    """Provider of :class:`web_poet.HttpRequest
    <web_poet.page_inputs.http.HttpRequest>` instances.
    """

    provided_classes = {HttpRequest}
    name = "request_data"

    def __call__(self, to_provide: Set[Callable], request: Request):
        """Return a one-element list holding a :class:`web_poet.HttpRequest
        <web_poet.page_inputs.http.HttpRequest>` assembled from the URL,
        method, headers and body of the given :class:`scrapy.http.Request`.
        """
        http_request = HttpRequest(
            url=RequestUrl(request.url),
            method=request.method,
            headers=HttpRequestHeaders.from_bytes_dict(request.headers),
            body=request.body,
        )
        return [http_request]
[docs]
class HttpResponseProvider(PageObjectInputProvider):
    """Provider of :class:`web_poet.HttpResponse
    <web_poet.page_inputs.http.HttpResponse>` instances.
    """

    provided_classes = {HttpResponse}
    name = "response_data"

    def __call__(self, to_provide: Set[Callable], response: Response):
        """Return a one-element list holding a :class:`web_poet.HttpResponse
        <web_poet.page_inputs.http.HttpResponse>` assembled from the URL,
        body, status and headers of the given :class:`scrapy.http.Response`.
        """
        http_response = HttpResponse(
            url=response.url,
            body=response.body,
            status=response.status,
            headers=HttpResponseHeaders.from_bytes_dict(response.headers),
        )
        return [http_response]
[docs]
class HttpClientProvider(PageObjectInputProvider):
    """Provider of :class:`web_poet.HttpClient
    <web_poet.page_inputs.client.HttpClient>` instances.
    """

    provided_classes = {HttpClient}

    def __call__(self, to_provide: Set[Callable], crawler: Crawler):
        """Return a one-element list holding a :class:`web_poet.HttpClient
        <web_poet.page_inputs.client.HttpClient>` whose requests are
        downloaded through the engine of the given ``crawler``.

        The client's ``save_responses`` flag is taken from the boolean
        ``_SCRAPY_POET_SAVEFIXTURE`` setting.
        """
        if not hasattr(crawler.engine, "download_async"):
            # Fallback for engines without ``download_async``: wrap the
            # result of ``engine.download()`` so that it can be awaited.
            async def download_func(request: Request):
                assert crawler.engine
                return await maybe_deferred_to_future(crawler.engine.download(request))

        else:  # Scrapy 2.14+
            assert crawler.engine
            download_func = crawler.engine.download_async

        request_downloader = _create_scrapy_downloader(download_func)
        keep_responses = crawler.settings.getbool("_SCRAPY_POET_SAVEFIXTURE")
        client = HttpClient(
            request_downloader=request_downloader,
            save_responses=keep_responses,
        )
        return [client]
[docs]
class PageParamsProvider(PageObjectInputProvider):
    """This class provides :class:`web_poet.PageParams
    <web_poet.page_inputs.page_params.PageParams>` instances.
    """

    provided_classes = {PageParams}

    def __call__(self, to_provide: Set[Callable], request: Request):
        """Creates a :class:`web_poet.PageParams
        <web_poet.page_inputs.page_params.PageParams>` instance based on the
        data found in the ``meta["page_params"]`` field of a
        :class:`scrapy.http.Request` instance.

        When the request meta has no ``"page_params"`` key, the instance is
        built from an empty dict.
        """
        return [PageParams(request.meta.get("page_params", {}))]
[docs]
class RequestUrlProvider(PageObjectInputProvider):
    """Provider of :class:`web_poet.RequestUrl
    <web_poet.page_inputs.http.RequestUrl>` instances.
    """

    provided_classes = {RequestUrl}
    name = "request_url"

    def __call__(self, to_provide: Set[Callable], request: Request):
        """Return a one-element list holding a :class:`web_poet.RequestUrl
        <web_poet.page_inputs.http.RequestUrl>` created from the URL of the
        given :class:`scrapy.Request <scrapy.http.Request>`.
        """
        request_url = RequestUrl(url=request.url)
        return [request_url]
[docs]
class ResponseUrlProvider(PageObjectInputProvider):
    """This class provides :class:`web_poet.ResponseUrl` instances."""

    provided_classes = {ResponseUrl}
    name = "response_url"

    def __call__(self, to_provide: Set[Callable], response: Response):
        """Builds a :class:`web_poet.ResponseUrl` instance using the URL of a
        :class:`scrapy.http.Response` instance.
        """
        return [ResponseUrl(url=response.url)]
[docs]
class ScrapyPoetStatCollector(StatCollector):
    """Stat collector that forwards every stat to the wrapped ``stats``
    object, storing it under a key prefixed with ``poet/stats/``.
    """

    def __init__(self, stats):
        """Store the key prefix and the ``stats`` object that receives the
        ``set_value`` / ``inc_value`` calls.
        """
        self._prefix = "poet/stats/"
        self._stats = stats

    def _prefixed(self, key: str) -> str:
        """Return ``key`` with the collector prefix prepended."""
        return f"{self._prefix}{key}"

    def set(self, key: str, value: Any) -> None:
        """Set the prefixed stat ``key`` to ``value``."""
        full_key = self._prefixed(key)
        self._stats.set_value(full_key, value)

    def inc(self, key: str, value: StatNum = 1) -> None:
        """Increment the prefixed stat ``key`` by ``value`` (1 by default)."""
        full_key = self._prefixed(key)
        self._stats.inc_value(full_key, value)
[docs]
class StatsProvider(PageObjectInputProvider):
    """This class provides :class:`web_poet.Stats
    <web_poet.page_inputs.stats.Stats>` instances.
    """

    provided_classes = {Stats}

    def __call__(self, to_provide: Set[Callable], crawler: Crawler):
        """Creates a :class:`web_poet.Stats
        <web_poet.page_inputs.stats.Stats>` instance backed by a
        :class:`ScrapyPoetStatCollector` that wraps ``crawler.stats``.
        """
        return [Stats(stat_collector=ScrapyPoetStatCollector(crawler.stats))]