"""An important part of scrapy-poet is the Injection Middleware. It's
responsible for injecting Page Input dependencies before the request callbacks
are executed.
"""
from __future__ import annotations
import inspect
import logging
import warnings
from typing import TYPE_CHECKING
from scrapy.downloadermiddlewares.stats import DownloaderStats
from web_poet import RulesRegistry
from web_poet.exceptions import Retry
from .api import DummyResponse
from .injection import Injector
from .page_input_providers import (
HttpClientProvider,
HttpRequestProvider,
HttpResponseProvider,
PageParamsProvider,
RequestUrlProvider,
ResponseUrlProvider,
StatsProvider,
)
from .utils import (
_get_retry_request_from_exception,
create_registry_instance,
is_min_scrapy_version,
)
if TYPE_CHECKING:
from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.http import Request, Response
# typing.Self requires Python 3.11
from typing_extensions import Self
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
[docs]
class DownloaderStatsMiddleware(DownloaderStats):
    """``DownloaderStats`` subclass that leaves :class:`~.DummyResponse`
    instances alone.

    Any other response is handed over to the parent implementation.
    """

    def process_response(
        self, request: Request, response: Response, spider: Spider | None = None
    ) -> Request | Response:
        """Return ``response`` as-is when it is a :class:`~.DummyResponse`;
        otherwise delegate to ``DownloaderStats.process_response``.

        ``spider`` is forwarded to the parent (as a keyword argument) only
        when it is not ``None``.
        """
        if not isinstance(response, DummyResponse):
            # NOTE(review): ``spider`` is presumably omitted when None so that
            # parent signatures without a ``spider`` parameter keep working
            # -- confirm against the supported Scrapy versions.
            if spider is None:
                return super().process_response(request, response)
            return super().process_response(request, response, spider=spider)
        return response
# Provider classes that ``InjectionMiddleware.__init__`` hands to the
# ``Injector`` as ``default_providers``, each mapped to an integer.
# NOTE(review): the integers look like ordering/priority values in the style
# of Scrapy's component-priority dicts -- confirm against ``Injector``.
DEFAULT_PROVIDERS = {
HttpRequestProvider: 400,
HttpResponseProvider: 500,
HttpClientProvider: 600,
PageParamsProvider: 700,
RequestUrlProvider: 800,
ResponseUrlProvider: 900,
StatsProvider: 1000,
}
[docs]
class InjectionMiddleware:
    """This is a Downloader Middleware that's supposed to:

    * check if request downloads could be skipped
    * inject dependencies before request callbacks are executed
    """

    def __init__(self, crawler: Crawler) -> None:
        """Initialize the middleware.

        Keeps a reference to ``crawler``, creates the rules registry through
        ``create_registry_instance`` and builds the :class:`Injector` that the
        other methods rely on, seeded with ``DEFAULT_PROVIDERS``.
        """
        self.crawler = crawler
        self.registry = create_registry_instance(RulesRegistry, crawler)
        self.injector = Injector(
            crawler,
            default_providers=DEFAULT_PROVIDERS,
            registry=self.registry,
        )

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Alternate constructor: build the middleware from ``crawler``."""
        return cls(crawler)

    def process_request(
        self, request: Request, spider: Spider | None = None
    ) -> DummyResponse | None:
        """This method checks if the request is really needed and if its
        download could be skipped by trying to infer if a :class:`scrapy.http.Response`
        is going to be used by the callback or a Page Input.

        If the :class:`scrapy.http.Response` can be ignored, a
        :class:`~.DummyResponse` instance is returned in its place. This
        :class:`~.DummyResponse` is linked to the original :class:`scrapy.Request
        <scrapy.http.Request>` instance.

        With this behavior, we're able to optimize spider executions avoiding
        unnecessary downloads. That could be the case when the callback is
        actually using another source like external APIs such as Zyte API.

        Returns ``None`` when the injector reports that a real response is
        required; otherwise returns the :class:`~.DummyResponse` and increments
        the ``scrapy_poet/dummy_response_count`` stat. ``spider`` is accepted
        but not used by this method.
        """
        if self.injector.is_scrapy_response_required(request):
            return None

        # Lazy %-style arguments: the message is only rendered when DEBUG
        # logging is enabled, and the rendered text is unchanged.
        logger.debug("Using DummyResponse instead of downloading %s", request)
        assert self.crawler.stats
        self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
        return DummyResponse(url=request.url, request=request)

    def _parse_extra_param_names(self) -> set[str]:
        """Return the names of the parameters of the spider's ``parse()``
        method that come after the first one (the response).
        """
        assert self.crawler.spider
        param_names = list(inspect.signature(self.crawler.spider.parse).parameters)
        # Slicing instead of a bare ``next()`` yields an empty set when
        # parse() declares no parameters at all; a StopIteration escaping
        # into the async process_response() would surface as a RuntimeError.
        return set(param_names[1:])

    def _skip_dependency_creation(self, request: Request) -> bool:
        """Tell whether dependency building must be skipped for ``request``.

        See:

        * https://github.com/scrapinghub/scrapy-poet/issues/48 — scrapy < 2.8
        * https://github.com/scrapinghub/scrapy-poet/issues/118 — scrapy >= 2.8

        Returns ``True`` only when all of the following hold: the Scrapy
        version check for 2.8.0 fails, the request has ``callback=None``, the
        request's ``cb_kwargs`` do not already cover every extra ``parse()``
        parameter, and the injector discovers providers for the callback.
        """
        if is_min_scrapy_version("2.8.0"):
            return False

        # No need to skip if the callback doesn't default to the parse() method
        if request.callback is not None:
            return False

        # If the Request.cb_kwargs possess all of the cb dependencies, then no
        # warning message should be issued.
        cb_param_names = self._parse_extra_param_names()
        if cb_param_names and cb_param_names == request.cb_kwargs.keys():
            return False

        # Skip if providers are needed.
        return bool(self.injector.discover_callback_providers(request))

    async def process_response(
        self, request: Request, response: Response, spider: Spider | None = None
    ) -> Response | Request:
        """This method fills :attr:`scrapy.Request.cb_kwargs
        <scrapy.http.Request.cb_kwargs>` with instances for the required Page
        Objects found in the callback signature.

        In other words, this method instantiates all :class:`web_poet.Injectable
        <web_poet.pages.Injectable>` subclasses declared as request callback
        arguments and any other parameter with a :class:`~.PageObjectInputProvider`
        configured for its type.

        Returns ``response`` in the normal case. When building the
        dependencies raises :class:`web_poet.exceptions.Retry`, the value
        produced by ``_get_retry_request_from_exception`` is returned instead
        if it is truthy; otherwise ``response`` is returned. ``spider`` is
        accepted but not used by this method.
        """
        if self._skip_dependency_creation(request):
            warnings.warn(
                "A request has been encountered with callback=None which "
                "defaults to the parse() method. On such cases, annotated "
                "dependencies in the parse() method won't be built by "
                "scrapy-poet. However, if the request has callback=parse, "
                "the annotated dependencies will be built.\n\n"
                "See the Pitfalls doc for more info.",
                stacklevel=2,
            )
            return response

        # Find out the dependencies
        try:
            final_kwargs = await self.injector.build_callback_dependencies(
                request,
                response,
            )
        except Retry as exception:
            retry_request = _get_retry_request_from_exception(
                request, exception, self.crawler
            )
            # Fall back to the current response when no retry request is
            # produced.
            return retry_request or response

        # Fill the callback arguments with the created instances
        for arg, value in final_kwargs.items():
            # If scrapy-poet can't provide the dependency, allow the user to
            # give it.
            if value is None and arg in request.cb_kwargs:
                continue
            request.cb_kwargs[arg] = value
            # TODO: check if all arguments are fulfilled somehow?

        return response