# Source code for scrapy_poet.downloadermiddlewares

"""An important part of scrapy-poet is the Injection Middleware. It's
responsible for injecting Page Input dependencies before the request callbacks
are executed.
"""

from __future__ import annotations

import inspect
import logging
import warnings
from typing import TYPE_CHECKING

from scrapy.downloadermiddlewares.stats import DownloaderStats
from web_poet import RulesRegistry
from web_poet.exceptions import Retry

from .api import DummyResponse
from .injection import Injector
from .page_input_providers import (
    HttpClientProvider,
    HttpRequestProvider,
    HttpResponseProvider,
    PageParamsProvider,
    RequestUrlProvider,
    ResponseUrlProvider,
    StatsProvider,
)
from .utils import (
    _get_retry_request_from_exception,
    create_registry_instance,
    is_min_scrapy_version,
)

if TYPE_CHECKING:
    from scrapy import Spider
    from scrapy.crawler import Crawler
    from scrapy.http import Request, Response

    # typing.Self requires Python 3.11
    from typing_extensions import Self

logger = logging.getLogger(__name__)


class DownloaderStatsMiddleware(DownloaderStats):
    """Variant of :class:`DownloaderStats` that passes
    :class:`~.DummyResponse` objects through untouched.
    """

    def process_response(
        self, request: Request, response: Response, spider: Spider | None = None
    ) -> Request | Response:
        """Return ``response``, delegating to the parent class unless it is a
        :class:`~.DummyResponse`.

        A :class:`~.DummyResponse` is returned as-is, without calling the
        parent implementation.
        """
        if isinstance(response, DummyResponse):
            return response

        # ``spider`` is only forwarded when the caller supplied it.
        # NOTE(review): presumably this keeps compatibility with Scrapy
        # versions whose process_response() does not take ``spider`` — confirm.
        if spider is None:
            return super().process_response(request, response)
        return super().process_response(request, response, spider=spider)
# Providers handed to the Injector as ``default_providers`` by
# InjectionMiddleware.__init__.
# NOTE(review): the integer values look like ordering priorities — confirm
# against the Injector implementation.
DEFAULT_PROVIDERS = {
    HttpRequestProvider: 400,
    HttpResponseProvider: 500,
    HttpClientProvider: 600,
    PageParamsProvider: 700,
    RequestUrlProvider: 800,
    ResponseUrlProvider: 900,
    StatsProvider: 1000,
}
class InjectionMiddleware:
    """This is a Downloader Middleware that's supposed to:

    * check if request downloads could be skipped
    * inject dependencies before request callbacks are executed
    """

    def __init__(self, crawler: Crawler) -> None:
        """Initialize the middleware.

        Stores ``crawler``, creates the rules registry for it and builds the
        :class:`Injector` configured with ``DEFAULT_PROVIDERS`` and that
        registry.
        """
        self.crawler = crawler
        self.registry = create_registry_instance(RulesRegistry, crawler)
        self.injector = Injector(
            crawler,
            default_providers=DEFAULT_PROVIDERS,
            registry=self.registry,
        )

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Build the middleware from ``crawler``."""
        return cls(crawler)

    def process_request(
        self, request: Request, spider: Spider | None = None
    ) -> DummyResponse | None:
        """This method checks if the request is really needed and if its
        download could be skipped by trying to infer if a
        :class:`scrapy.http.Response` is going to be used by the callback or a
        Page Input.

        If the :class:`scrapy.http.Response` can be ignored, a
        :class:`~.DummyResponse` instance is returned on its place. This
        :class:`~.DummyResponse` is linked to the original
        :class:`scrapy.Request <scrapy.http.Request>` instance.

        With this behavior, we're able to optimize spider executions avoiding
        unnecessary downloads. That could be the case when the callback is
        actually using another source like external APIs such as Zyte API.

        Returns ``None`` when the injector reports that the response is
        required; otherwise the ``scrapy_poet/dummy_response_count`` stat is
        incremented and a :class:`~.DummyResponse` is returned.
        """
        if self.injector.is_scrapy_response_required(request):
            return None

        # Lazy %-style arguments: the message is only rendered when the DEBUG
        # level is actually enabled for this logger.
        logger.debug("Using DummyResponse instead of downloading %s", request)
        assert self.crawler.stats
        self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
        return DummyResponse(url=request.url, request=request)

    def _skip_dependency_creation(self, request: Request) -> bool:
        """Return whether dependency building must be skipped for ``request``.

        See:

        * https://github.com/scrapinghub/scrapy-poet/issues/48 — scrapy < 2.8
        * https://github.com/scrapinghub/scrapy-poet/issues/118 — scrapy >= 2.8
        """
        if is_min_scrapy_version("2.8.0"):
            return False

        # No need to skip if the callback doesn't default to the parse() method
        if request.callback is not None:
            return False

        # If the Request.cb_kwargs possess all of the cb dependencies, then no
        # warning message should be issued.
        assert self.crawler.spider
        parse_params = inspect.signature(self.crawler.spider.parse).parameters
        # Drop the first arg (response). Slicing, unlike a bare next() on the
        # iterator, does not raise StopIteration when parse() declares no
        # parameters at all.
        cb_param_names = set(list(parse_params)[1:])
        if cb_param_names and cb_param_names == request.cb_kwargs.keys():
            return False

        # Skip if providers are needed.
        return bool(self.injector.discover_callback_providers(request))

    async def process_response(
        self, request: Request, response: Response, spider: Spider | None = None
    ) -> Response | Request:
        """This method fills
        :attr:`scrapy.Request.cb_kwargs <scrapy.http.Request.cb_kwargs>`
        with instances for the required Page Objects found in the callback
        signature.

        In other words, this method instantiates all
        :class:`web_poet.Injectable <web_poet.pages.Injectable>`
        subclasses declared as request callback arguments and any other
        parameter with a :class:`~.PageObjectInputProvider` configured for
        its type.

        Returns ``response`` in the normal case. If building the dependencies
        raises :class:`web_poet.exceptions.Retry`, the retry request is
        returned instead when one could be created.
        """
        if self._skip_dependency_creation(request):
            warnings.warn(
                "A request has been encountered with callback=None which "
                "defaults to the parse() method. On such cases, annotated "
                "dependencies in the parse() method won't be built by "
                "scrapy-poet. However, if the request has callback=parse, "
                "the annotated dependencies will be built.\n\n"
                "See the Pitfalls doc for more info.",
                stacklevel=2,
            )
            return response

        # Find out the dependencies
        try:
            final_kwargs = await self.injector.build_callback_dependencies(
                request,
                response,
            )
        except Retry as exception:
            new_request_or_none = _get_retry_request_from_exception(
                request, exception, self.crawler
            )
            # No retry request could be produced: hand the response through.
            if not new_request_or_none:
                return response
            return new_request_or_none

        # Fill the callback arguments with the created instances
        for arg, value in final_kwargs.items():
            # If scrapy-poet can't provide the dependency, allow the user to
            # give it.
            if value is None and arg in request.cb_kwargs:
                continue
            request.cb_kwargs[arg] = value
            # TODO: check if all arguments are fulfilled somehow?

        return response