Source code for scrapy_poet.downloadermiddlewares

"""An important part of scrapy-poet is the Injection Middleware. It's
responsible for injecting Page Input dependencies before the request callbacks
are executed.
"""
import inspect
import logging
import warnings
from typing import Generator, Optional, Type, TypeVar, Union

from scrapy import Spider
from scrapy.crawler import Crawler
from scrapy.downloadermiddlewares.stats import DownloaderStats
from scrapy.http import Request, Response
from twisted.internet.defer import Deferred, inlineCallbacks
from web_poet import RulesRegistry

from .api import DummyResponse
from .injection import Injector
from .page_input_providers import (
    HttpClientProvider,
    HttpRequestProvider,
    HttpResponseProvider,
    PageParamsProvider,
    RequestUrlProvider,
    ResponseUrlProvider,
    StatsProvider,
)
from .utils import create_registry_instance, is_min_scrapy_version

logger = logging.getLogger(__name__)


class DownloaderStatsMiddleware(DownloaderStats):
    def process_response(
        self, request: Request, response: Response, spider: Spider
    ) -> Union[Request, Response]:
        if isinstance(response, DummyResponse):
            return response
        return super().process_response(request, response, spider)
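
# A sketch (not part of this module) of how these middlewares are typically
# enabled in a project's settings, swapping this subclass in for the stock
# stats middleware so that DummyResponse objects don't skew download
# statistics; treat the priority values as illustrative:
#
#     # settings.py
#     DOWNLOADER_MIDDLEWARES = {
#         "scrapy_poet.InjectionMiddleware": 543,
#         "scrapy.downloadermiddlewares.stats.DownloaderStats": None,
#         "scrapy_poet.DownloaderStatsMiddleware": 850,
#     }
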


DEFAULT_PROVIDERS = {
    HttpRequestProvider: 400,
    HttpResponseProvider: 500,
    HttpClientProvider: 600,
    PageParamsProvider: 700,
    RequestUrlProvider: 800,
    ResponseUrlProvider: 900,
    StatsProvider: 1000,
}
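
# These defaults can be extended or overridden per project through the
# SCRAPY_POET_PROVIDERS setting; a minimal sketch, assuming a user-defined
# ``MyProvider`` page input provider (hypothetical):
#
#     # settings.py
#     from myproject.providers import MyProvider
#
#     SCRAPY_POET_PROVIDERS = {
#         MyProvider: 1100,
#     }
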

InjectionMiddlewareTV = TypeVar("InjectionMiddlewareTV", bound="InjectionMiddleware")


class InjectionMiddleware:
    """This is a Downloader Middleware that's supposed to:

    * check if request downloads could be skipped
    * inject dependencies before request callbacks are executed
    """

    def __init__(self, crawler: Crawler) -> None:
        """Initialize the middleware."""
        self.crawler = crawler
        self.registry = create_registry_instance(RulesRegistry, crawler)
        self.injector = Injector(
            crawler,
            default_providers=DEFAULT_PROVIDERS,
            registry=self.registry,
        )

    @classmethod
    def from_crawler(
        cls: Type[InjectionMiddlewareTV], crawler: Crawler
    ) -> InjectionMiddlewareTV:
        return cls(crawler)

    def process_request(
        self, request: Request, spider: Spider
    ) -> Optional[DummyResponse]:
        """This method checks if the request is really needed and if its
        download could be skipped by trying to infer if a
        :class:`scrapy.http.Response` is going to be used by the callback or
        a Page Input.

        If the :class:`scrapy.http.Response` can be ignored, a
        :class:`~.DummyResponse` instance is returned in its place. This
        :class:`~.DummyResponse` is linked to the original
        :class:`scrapy.Request <scrapy.http.Request>` instance.

        With this behavior, we're able to optimize spider executions by
        avoiding unnecessary downloads. That could be the case when the
        callback actually uses another source, such as an external API like
        Zyte's AutoExtract.
        """
        if self.injector.is_scrapy_response_required(request):
            return None

        logger.debug(f"Using DummyResponse instead of downloading {request}")
        self.crawler.stats.inc_value("scrapy_poet/dummy_response_count")
        return DummyResponse(url=request.url, request=request)

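    # For illustration (a sketch, not part of this module): annotating the
    # callback's ``response`` parameter as DummyResponse signals that the raw
    # download isn't needed, e.g. when a hypothetical ``ApiPage`` page object
    # fetches its data from elsewhere:
    #
    #     def parse(self, response: DummyResponse, page: ApiPage):
    #         yield page.to_item()
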
    def _skip_dependency_creation(self, request: Request, spider: Spider) -> bool:
        """See:

        * https://github.com/scrapinghub/scrapy-poet/issues/48 (scrapy < 2.8)
        * https://github.com/scrapinghub/scrapy-poet/issues/118 (scrapy >= 2.8)
        """
        if is_min_scrapy_version("2.8.0"):
            return False

        # No need to skip if the callback doesn't default to the parse() method.
        if request.callback is not None:
            return False

        # If Request.cb_kwargs already possesses all of the callback
        # dependencies, no warning message should be issued.
        signature_iter = iter(inspect.signature(spider.parse).parameters)
        next(signature_iter)  # skip the first arg: response
        cb_param_names = set(signature_iter)
        if cb_param_names and cb_param_names == request.cb_kwargs.keys():
            return False

        # Skip if providers are needed.
        if self.injector.discover_callback_providers(request):
            return True

        return False

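    # The pitfall this guards against, sketched (``url`` is a placeholder):
    # on Scrapy < 2.8, a request that falls back to the default parse()
    # callback won't get its annotated dependencies built:
    #
    #     yield scrapy.Request(url)                       # warns; deps skipped
    #     yield scrapy.Request(url, callback=self.parse)  # deps are built
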
    @inlineCallbacks
    def process_response(
        self, request: Request, response: Response, spider: Spider
    ) -> Generator[Deferred, object, Response]:
        """This method fills :attr:`scrapy.Request.cb_kwargs
        <scrapy.http.Request.cb_kwargs>` with instances for the required Page
        Objects found in the callback signature.

        In other words, this method instantiates all
        :class:`web_poet.Injectable <web_poet.pages.Injectable>` subclasses
        declared as request callback arguments and any other parameter with a
        :class:`~.PageObjectInputProvider` configured for its type.
        """
        if self._skip_dependency_creation(request, spider):
            warnings.warn(
                "A request has been encountered with callback=None which "
                "defaults to the parse() method. In such cases, annotated "
                "dependencies in the parse() method won't be built by "
                "scrapy-poet. However, if the request has callback=parse, "
                "the annotated dependencies will be built.\n\n"
                "See the Pitfalls doc for more info.",
                stacklevel=2,
            )
            return response

        # Find out the dependencies.
        final_kwargs = yield from self.injector.build_callback_dependencies(
            request,
            response,
        )
        # Fill the callback arguments with the created instances.
        for arg, value in final_kwargs.items():
            # If scrapy-poet can't provide the dependency, allow the user to
            # supply it via cb_kwargs.
            if value is None and arg in request.cb_kwargs:
                continue
            request.cb_kwargs[arg] = value
        # TODO: check if all arguments are fulfilled somehow?
        return response
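
# For illustration (a sketch, not part of this module): because a value the
# user already placed in cb_kwargs wins over a None built by scrapy-poet, a
# spider can supply an optional dependency itself; ``my_prebuilt_page`` is a
# hypothetical, already-built page object:
#
#     yield scrapy.Request(
#         url,
#         callback=self.parse_page,
#         cb_kwargs={"page": my_prebuilt_page},
#     )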