Source code for scrapy_poet.cache

from __future__ import annotations

import abc
import pickle
from pathlib import Path
from typing import TYPE_CHECKING, Any

from web_poet.serialization.api import SerializedData, SerializedDataFileStorage

if TYPE_CHECKING:
    import os


class _Cache(abc.ABC):
    """Abstract interface for a cache addressed by string fingerprints.

    Concrete subclasses must implement item lookup and item assignment;
    ``close`` is an optional hook with a do-nothing default.
    """

    @abc.abstractmethod
    def __getitem__(self, fingerprint: str) -> Any:
        """Return the entry stored under ``fingerprint``."""

    @abc.abstractmethod
    def __setitem__(self, fingerprint: str, value) -> None:
        """Store ``value`` under ``fingerprint``."""

    def close(self) -> None:  # noqa: B027
        """Release resources held by the cache; the base class does nothing."""
        return None


[docs] class SerializedDataCache(_Cache): """ Stores dependencies from Providers in a persistent local storage using `web_poet.serialization.SerializedDataFileStorage` """
[docs] def __init__(self, directory: str | os.PathLike) -> None: self.directory = Path(directory)
def __getitem__(self, fingerprint: str) -> SerializedData: storage = SerializedDataFileStorage(self._get_directory_path(fingerprint)) try: serialized_data = storage.read() except FileNotFoundError as ex: raise KeyError(f"Fingerprint '{fingerprint}' not found in cache") from ex return serialized_data def __setitem__(self, fingerprint: str, value: SerializedData | Exception) -> None: if isinstance(value, Exception): self.write_exception(fingerprint, value) else: storage_path = self._get_directory_path(fingerprint) storage_path.mkdir(parents=True, exist_ok=True) storage = SerializedDataFileStorage(storage_path) storage.write(value) def write_exception(self, fingerprint: str, exception: Exception) -> None: exception_path = self._get_exception_file_path(fingerprint) exception_path.parent.mkdir(parents=True, exist_ok=True) with exception_path.open("wb") as file: pickle.dump(exception, file) def _get_directory_path(self, fingerprint: str) -> Path: return self.directory / fingerprint def _get_exception_file_path(self, fingerprint: str) -> Path: """Save exception inside self.directory, so that `storage.read()` can read it correctly""" return self._get_directory_path(fingerprint) / "error"
# TODO: Add option for compressed cache