Source code for scrapy_poet.api

from inspect import iscoroutinefunction
from typing import Callable, Optional, Type

from scrapy.http import Request, Response
from web_poet.pages import ItemPage

# Name of the attribute that ``callback_for`` sets to ``True`` on every
# callback function it generates.
_CALLBACK_FOR_MARKER = "__scrapy_poet_callback"


class DummyResponse(Response):
    """Placeholder response used when the real download can be skipped.

    The :class:`~.InjectionMiddleware` returns an instance of this class
    when it detects that the download could be skipped. It is a
    :class:`scrapy.http.Response` that stores the URL and keeps a reference
    to the original :class:`scrapy.Request <scrapy.http.Request>`.

    To opt in to skipping downloads, annotate the ``response`` argument of
    your parse method with this class:

    .. code-block:: python

        def parse(self, response: DummyResponse):
            pass

    If no Page Input depends on a :class:`scrapy.http.Response`, the
    :class:`~.InjectionMiddleware` skips the download and hands a
    :class:`~.DummyResponse` to your parser instead.
    """

    def __init__(self, url: str, request: Optional[Request] = None):
        # Only the URL and the originating request are forwarded to the
        # parent constructor; no other Response argument is supplied.
        super().__init__(
            url=url,
            request=request,
        )
def callback_for(page_or_item_cls: Type) -> Callable:
    """Build a Scrapy callback from a Page Object class or an item class.

    ``page_or_item_cls`` is either a subclass of
    :class:`web_poet.ItemPage <web_poet.pages.ItemPage>` or an item class.
    For a Page Object class, the generated callback yields the output of its
    :meth:`to_item <web_poet.pages.ItemPage.to_item>` method, i.e. it
    extracts a single item from a web page. For an item class, the callback
    yields the injected item unchanged.

    The helper removes boilerplate when working with Page Objects. Instead
    of writing this:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            def parse_book(self, response: DummyResponse, page: BookPage):
                return page.to_item()

    you can write this:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            parse_book = callback_for(BookPage)

    When the Page Object's
    :meth:`to_item <web_poet.pages.ItemPage.to_item>` method is a coroutine
    function (``async def``), an async generator callback is produced
    instead. So the following:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            async def parse_book(self, response: DummyResponse, page: BookPage):
                yield await page.to_item()

    can be turned into:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            parse_book = callback_for(BookPage)

    The generated callback can be used as a spider instance method or passed
    as an inline/anonymous argument. Make sure to define it as a spider
    attribute (as shown in the examples above) if you're planning to use
    disk queues, because in this case Scrapy is able to serialize your
    request object.
    """
    # The callback may be bound to a spider (first positional argument is
    # 'self') or used as a plain inline function (first positional argument
    # is the response). Accepting *args and **kwargs around the injected
    # keyword-only parameter keeps both call styles from raising TypeError.
    if issubclass(page_or_item_cls, ItemPage):
        if iscoroutinefunction(page_or_item_cls.to_item):
            # Asynchronous Page Object: await to_item() in an async generator.
            async def async_parse(*args, page: page_or_item_cls, **kwargs):  # type: ignore
                yield await page.to_item()  # type: ignore

            callback = async_parse
        else:
            # Synchronous Page Object: yield the extracted item directly.
            def parse(*args, page: page_or_item_cls, **kwargs):  # type: ignore
                yield page.to_item()  # type: ignore

            callback = parse
    else:
        # Not an ItemPage subclass: treat it as an item class and yield the
        # injected item as-is.
        def parse(*args, item: page_or_item_cls, **kwargs):  # type: ignore
            yield item

        callback = parse

    # Tag the generated function with the marker attribute before returning.
    setattr(callback, _CALLBACK_FOR_MARKER, True)
    return callback