from __future__ import annotations
from typing import TYPE_CHECKING
try:
from scrapy.utils.request import RequestFingerprinter # noqa: F401
except ImportError:
if not TYPE_CHECKING:
ScrapyPoetRequestFingerprinter = None
else:
import hashlib
import json
from functools import cached_property
from logging import getLogger
from typing import Annotated, get_args, get_origin
from weakref import WeakKeyDictionary
from andi import CustomBuilder
from scrapy.settings.default_settings import REQUEST_FINGERPRINTER_CLASS
from scrapy.utils.misc import load_object
if TYPE_CHECKING:
from collections.abc import Callable
from scrapy import Request
from scrapy.crawler import Crawler
try:
from scrapy.utils.misc import build_from_crawler
except ImportError: # Scrapy < 2.12
from typing import Any, TypeVar
from scrapy.utils.misc import create_instance # type: ignore[attr-defined]
T = TypeVar("T")
def build_from_crawler(  # type: ignore[no-redef]
    objcls: type[T], crawler: Crawler, /, *args: Any, **kwargs: Any
) -> T:
    """Fallback for ``scrapy.utils.misc.build_from_crawler`` on Scrapy < 2.12.

    Delegates to ``create_instance``, passing ``None`` as its second
    positional argument and forwarding *crawler* together with any extra
    positional and keyword arguments unchanged.
    """
    instance = create_instance(objcls, None, crawler, *args, **kwargs)
    return instance
from web_poet import (
HttpClient,
HttpRequest,
HttpRequestBody,
HttpRequestHeaders,
PageParams,
RequestUrl,
Stats,
)
from web_poet.utils import get_fq_class_name
from scrapy_poet import InjectionMiddleware
from scrapy_poet.injection import get_callback
logger = getLogger(__name__)
def _serialize_dep(cls):
    """Return a string that identifies the dependency *cls*.

    * A ``CustomBuilder`` is identified by the fully qualified name of its
      ``result_class_or_fn``.
    * An ``Annotated[...]`` type is identified by the serialization of its
      underlying type followed by the ``repr`` of the list of its
      annotations.
    * Anything else is identified by its fully qualified class name.
    """
    if isinstance(cls, CustomBuilder):
        return get_fq_class_name(cls.result_class_or_fn)
    if get_origin(cls) is not Annotated:
        return get_fq_class_name(cls)
    # Annotated[X, m1, m2, ...]: serialize X recursively and append the
    # metadata so that differently-annotated variants of X do not collide.
    underlying, *metadata = get_args(cls)
    return f"{_serialize_dep(underlying)}{metadata!r}"
class ScrapyPoetRequestFingerprinter:
    """Request fingerprinter that also accounts for scrapy-poet injection.

    The fingerprint produced by a base request fingerprinter (the class
    named by the ``SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS`` setting,
    falling back to Scrapy's default ``REQUEST_FINGERPRINTER_CLASS``) is
    combined with the dependencies requested by the request callback and
    with the ``page_params`` value found in the request meta.
    """

    IGNORED_UNANNOTATED_DEPS = {
        # These dependencies are tools for page objects that should have no
        # bearing on the request itself.
        HttpClient,
        Stats,
        # These dependencies do not impact the fingerprint as dependencies,
        # it is their value on the request itself that should have an
        # impact on the request fingerprint.
        HttpRequest,
        HttpRequestBody,
        HttpRequestHeaders,
        PageParams,
        RequestUrl,
    }

    @classmethod
    def from_crawler(cls, crawler):
        """Create the fingerprinter for *crawler*."""
        return cls(crawler)

    def __init__(self, crawler: Crawler) -> None:
        """Build the base fingerprinter and initialize the caches.

        :param crawler: crawler whose settings select the base request
            fingerprinter class and whose engine and spider are looked up
            later, when requests are fingerprinted.
        """
        self._base_request_fingerprinter = build_from_crawler(
            load_object(
                crawler.settings.get(
                    "SCRAPY_POET_REQUEST_FINGERPRINTER_BASE_CLASS",
                    REQUEST_FINGERPRINTER_CLASS,
                )
            ),
            crawler,
        )
        # Dependency keys are cached per callback.
        self._callback_cache: dict[Callable, bytes | None] = {}
        # Final fingerprints are cached per request; weak keys so that the
        # cache does not keep requests alive.
        self._request_cache: WeakKeyDictionary[Request, bytes] = WeakKeyDictionary()
        self._crawler: Crawler = crawler
        # Ensures that the "cannot serialize page params" warning is logged
        # at most once per fingerprinter instance.
        self._saw_unserializable_page_params = False

    @cached_property
    def _injector(self):
        """Injector of the configured ``InjectionMiddleware``.

        Resolved lazily, on first access, from the downloader middlewares
        of the crawler engine, and cached afterwards.

        :raises RuntimeError: if no ``InjectionMiddleware`` instance is
            found among the downloader middlewares.
        """
        middlewares = self._crawler.engine.downloader.middleware.middlewares
        for middleware in middlewares:
            if isinstance(middleware, InjectionMiddleware):
                return middleware.injector
        raise RuntimeError(
            "scrapy_poet.InjectionMiddleware not found at run time, has it "
            "been configured in the DOWNLOADER_MIDDLEWARES setting?"
        )

    def _get_deps(self, request: Request) -> list[str] | None:
        """Return a JSON-serializable structure that uniquely identifies the
        dependencies requested by the request, or None if dependency injection
        is not required."""
        plan = self._injector.build_plan(request)
        # The last step of the plan is skipped (presumably the callback
        # itself rather than one of its dependencies -- confirm against the
        # injector's plan format).
        deps = {dep for dep, _ in plan[:-1]} - self.IGNORED_UNANNOTATED_DEPS
        if not deps:
            return None
        # Sorted so that the result does not depend on set iteration order.
        return sorted(_serialize_dep(dep) for dep in deps)

    def get_deps_key(self, request: Request) -> bytes | None:
        """Return a JSON array as bytes that uniquely identifies the
        dependencies requested through scrapy-poet injection that could
        impact the request, or None if there are no such dependencies.

        The result is cached per callback, so the injection plan is built
        only for the first request seen for each callback.
        """
        callback = get_callback(request, self._crawler.spider)
        if callback in self._callback_cache:
            return self._callback_cache[callback]
        deps = self._get_deps(request)
        deps_key = json.dumps(deps, sort_keys=True).encode() if deps else None
        self._callback_cache[callback] = deps_key
        return deps_key

    def serialize_page_params(self, request: Request) -> bytes | None:
        """Return a JSON object as bytes that represents the page params,
        or None if there are no page params or they are not
        JSON-serializable.

        A warning is logged the first time unserializable page params are
        found; later occurrences are silently treated the same way.
        """
        page_params = request.meta.get("page_params", None)
        if not page_params:
            return None
        try:
            return json.dumps(page_params, sort_keys=True).encode()
        except (TypeError, ValueError):
            # TypeError: values or keys that JSON cannot represent or sort.
            # ValueError: circular references detected by json.dumps.
            if not self._saw_unserializable_page_params:
                self._saw_unserializable_page_params = True
                logger.warning(
                    f"Cannot serialize page params {page_params!r} of "
                    f"request {request} as JSON. This can be an issue if "
                    f"you have requests that are identical except for "
                    f"their page params, because unserializable page "
                    f"params are treated the same as missing or empty "
                    f"page params for purposes of request fingerprinting "
                    f"(see "
                    f"https://docs.scrapy.org/en/latest/topics/request-response.html#request-fingerprints). "
                    f"This will be the only warning about this issue, "
                    f"other requests might be also affected."
                )
            return None

    def fingerprint(self, request: Request) -> bytes:
        """Return the fingerprint of *request*.

        When the request has neither relevant injected dependencies nor
        serializable page params, the fingerprint of the base request
        fingerprinter is returned unchanged (and not cached here).
        Otherwise, the result is the SHA-1 digest of the base fingerprint
        followed by the dependencies key and the serialized page params
        (whichever are present, in that order), cached per request.
        """
        if request in self._request_cache:
            return self._request_cache[request]
        fingerprint = self._base_request_fingerprinter.fingerprint(request)
        deps_key = self.get_deps_key(request)
        serialized_page_params = self.serialize_page_params(request)
        if deps_key is None and serialized_page_params is None:
            return fingerprint
        if deps_key is not None:
            fingerprint += deps_key
        if serialized_page_params is not None:
            fingerprint += serialized_page_params
        digest = hashlib.sha1(fingerprint).digest()  # noqa: S324
        self._request_cache[request] = digest
        return digest