Source code for scrapy_poet._request_fingerprinter

from __future__ import annotations

from typing import TYPE_CHECKING

try:
    from scrapy.utils.request import RequestFingerprinter  # noqa: F401
except ImportError:
    if not TYPE_CHECKING:
        ScrapyPoetRequestFingerprinter = None
else:
    import hashlib
    import json
    from functools import cached_property
    from logging import getLogger
    from typing import Annotated, get_args, get_origin
    from weakref import WeakKeyDictionary

    from andi import CustomBuilder
    from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
    from scrapy.utils.misc import load_object

    if TYPE_CHECKING:
        from collections.abc import Callable

        from scrapy import Request
        from scrapy.crawler import Crawler

    try:
        from scrapy.utils.misc import build_from_crawler
    except ImportError:  # Scrapy < 2.12
        from typing import Any, TypeVar

        from scrapy.utils.misc import create_instance  # type: ignore[attr-defined]

        T = TypeVar("T")

        def build_from_crawler(  # type: ignore[no-redef]
            objcls: type[T], crawler: Crawler, /, *args: Any, **kwargs: Any
        ) -> T:
            """Fallback used when ``scrapy.utils.misc.build_from_crawler``
            cannot be imported (Scrapy < 2.12).

            Delegates to ``create_instance``, passing ``None`` as its second
            positional argument and forwarding *crawler* and any extra
            positional and keyword arguments unchanged.
            """
            # NOTE(review): the ``None`` presumably fills the ``settings``
            # slot of ``create_instance`` -- confirm against Scrapy's docs.
            return create_instance(objcls, None, crawler, *args, **kwargs)

    from web_poet import (
        HttpClient,
        HttpRequest,
        HttpRequestBody,
        HttpRequestHeaders,
        PageParams,
        RequestUrl,
        Stats,
    )
    from web_poet.utils import get_fq_class_name

    from scrapy_poet import InjectionMiddleware
    from scrapy_poet.injection import get_callback

    logger = getLogger(__name__)

    def _serialize_dep(cls):
        """Return a string that identifies the dependency *cls*.

        * A ``CustomBuilder`` instance is identified by passing its
          ``result_class_or_fn`` to ``get_fq_class_name``.
        * An ``Annotated[...]`` type is identified by the serialization of its
          first argument followed by the ``repr()`` of the list holding its
          remaining arguments.
        * Anything else is passed straight to ``get_fq_class_name``.
        """
        if isinstance(cls, CustomBuilder):
            return get_fq_class_name(cls.result_class_or_fn)
        if get_origin(cls) is not Annotated:
            return get_fq_class_name(cls)
        # Annotated[base, *metadata]: recurse on the base type only.
        base, *metadata = get_args(cls)
        return f"{_serialize_dep(base)}{metadata!r}"


[docs]
    class ScrapyPoetRequestFingerprinter:
        # Dependency types that ``_get_deps`` subtracts from the set of
        # dependencies found in the injection plan, so that they never
        # contribute to the dependency key of a request.
        IGNORED_UNANNOTATED_DEPS = {
            # These dependencies are tools for page objects that should have no
            # bearing on the request itself.
            HttpClient,
            Stats,
            # These dependencies do not impact the fingerprint as dependencies,
            # it is their value on the request itself that should have an
            # impact on the request fingerprint.
            HttpRequest,
            HttpRequestBody,
            HttpRequestHeaders,
            PageParams,
            RequestUrl,
        }

        @classmethod
        def from_crawler(cls, crawler: Crawler):
            """Create an instance of this class for *crawler*.

            The crawler is forwarded unchanged to the constructor.
            """
            return cls(crawler)


[docs]
        def __init__(self, crawler: Crawler) -> None:
            """Set up the wrapped base fingerprinter and the internal caches.

            The base fingerprinter class is read from the
            ``SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS`` setting, falling
            back to ``REQUEST_FINGERPRINTER_CLASS`` when the setting is absent.
            It is resolved with ``load_object`` and instantiated with
            ``build_from_crawler``.
            """
            base_class_path = crawler.settings.get(
                "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS",
                REQUEST_FINGERPRINTER_CLASS,
            )
            base_class = load_object(base_class_path)
            self._base_request_fingerprinter = build_from_crawler(base_class, crawler)
            # Dependency key (or None) computed for each callback seen so far.
            self._callback_cache: dict[Callable, bytes | None] = {}
            # Final fingerprints, held only for as long as the request lives.
            self._request_cache: WeakKeyDictionary[Request, bytes] = WeakKeyDictionary()
            self._crawler: Crawler = crawler
            # Flipped on the first unserializable-page-params warning so that
            # the warning is logged only once.
            self._saw_unserializable_page_params = False


        @cached_property
        def _injector(self):
            """Return the ``injector`` of the active ``InjectionMiddleware``.

            The crawler's downloader middlewares are scanned in order and the
            first ``InjectionMiddleware`` instance wins.

            Raises:
                RuntimeError: if no ``InjectionMiddleware`` instance is found.
            """
            active = self._crawler.engine.downloader.middleware.middlewares
            injection_mw = next(
                (mw for mw in active if isinstance(mw, InjectionMiddleware)),
                None,
            )
            if injection_mw is None:
                raise RuntimeError(
                    "scrapy_poet.InjectionMiddleware not found at run time, has it "
                    "been configured in the DOWNLOADER_MIDDLEWARES setting?"
                )
            return injection_mw.injector

        def _get_deps(self, request: Request) -> list[str] | None:
            """Return the sorted, serialized dependencies of *request*.

            The dependencies are taken from every step of the injection plan
            except the last one, minus ``IGNORED_UNANNOTATED_DEPS``. Each
            remaining dependency is turned into a string with
            ``_serialize_dep``. Returns ``None`` when no dependency remains.
            """
            plan = self._injector.build_plan(request)
            # NOTE(review): the last plan step is skipped; presumably it is
            # the callback itself rather than a dependency -- confirm.
            requested = {dep for dep, _ in plan[:-1]}
            relevant = requested - self.IGNORED_UNANNOTATED_DEPS
            if relevant:
                return sorted([_serialize_dep(dep) for dep in relevant])
            return None


[docs]
        def get_deps_key(self, request: Request) -> bytes | None:
            """Return the dependency key of *request* as JSON-encoded bytes.

            The key is a JSON array built from the output of ``_get_deps``, or
            ``None`` when ``_get_deps`` reports no dependencies. Results are
            memoized per callback, as resolved by ``get_callback`` for the
            request and the crawler's spider.
            """
            callback = get_callback(request, self._crawler.spider)
            try:
                return self._callback_cache[callback]
            except KeyError:
                pass

            deps = self._get_deps(request)
            key = json.dumps(deps, sort_keys=True).encode() if deps else None
            self._callback_cache[callback] = key
            return key



[docs]
        def serialize_page_params(self, request: Request) -> bytes | None:
            """Return a JSON object as bytes that represents the page params,
            or None if there are no page params or they are not
            JSON-serializable."""
            page_params = request.meta.get("page_params", None)
            if not page_params:
                return None

            try:
                return json.dumps(page_params, sort_keys=True).encode()
            except TypeError:
                if not self._saw_unserializable_page_params:
                    self._saw_unserializable_page_params = True
                    logger.warning(
                        f"Cannot serialize page params {page_params!r} of "
                        f"request {request} as JSON. This can be an issue if "
                        f"you have requests that are identical except for "
                        f"their page params, because unserializable page "
                        f"params are treated the same as missing or empty "
                        f"page params for purposes of request fingerprinting "
                        f"(see "
                        f"https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprints). "
                        f"This will be the only warning about this issue, "
                        f"other requests might be also affected."
                    )
                return None


        def fingerprint(self, request: Request) -> bytes:
            if request in self._request_cache:
                return self._request_cache[request]

            fingerprint = self._base_request_fingerprinter.fingerprint(request)
            deps_key = self.get_deps_key(request)
            serialized_page_params = self.serialize_page_params(request)
            if deps_key is None and serialized_page_params is None:
                return fingerprint
            if deps_key is not None:
                fingerprint += deps_key
            if serialized_page_params is not None:
                fingerprint += serialized_page_params

            self._request_cache[request] = hashlib.sha1(fingerprint).digest()  # noqa: S324
            return self._request_cache[request]