# Source code for scrapy_poet.api

from inspect import iscoroutinefunction
from typing import Callable, Optional, Type

from scrapy.http import Request, Response
from web_poet.pages import ItemPage

# Name of the attribute that ``callback_for`` sets to ``True`` on every
# callback it generates. NOTE(review): presumably other parts of scrapy-poet
# look for this attribute to recognize generated callbacks -- confirm.
_CALLBACK_FOR_MARKER = "__scrapy_poet_callback"



[docs]
class DummyResponse(Response):
    """Placeholder response handed out when no real download is needed.

    The :class:`~.InjectionMiddleware` returns an instance of this class when
    it detects that the download could be skipped. Being a
    :class:`scrapy.http.Response` subclass, it still stores the URL and keeps
    a reference to the original :class:`scrapy.Request <scrapy.http.Request>`.

    To opt in to skipping downloads, annotate the ``response`` argument of
    your parse method with this class:

    .. code-block:: python

        def parse(self, response: DummyResponse):
            pass

    When no Page Input depends on a :class:`scrapy.http.Response`, the
    :class:`~.InjectionMiddleware` skips the download and passes a
    :class:`~.DummyResponse` to your parser instead.
    """

    def __init__(self, url: str, request: Optional[Request] = None):
        # Only the URL and the originating request are forwarded; every other
        # Response argument is left at the base-class default.
        super().__init__(url=url, request=request)




[docs]
def callback_for(page_or_item_cls: Type) -> Callable:
    """Build a spider callback for an
    :class:`web_poet.ItemPage <web_poet.pages.ItemPage>` subclass or for an
    item class.

    For a Page Object class, the generated callback yields the output of its
    :meth:`to_item <web_poet.pages.ItemPage.to_item>` method, i.e. it extracts
    a single item from a web page. For an item class, the generated callback
    yields the item it receives through its ``item`` argument.

    The helper removes boilerplate when working with Page Objects. Instead of
    writing this:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            def parse_book(self, response: DummyResponse, page: BookPage):
                return page.to_item()

    you can write this:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            parse_book = callback_for(BookPage)

    If the Page Object's :meth:`to_item <web_poet.pages.ItemPage.to_item>`
    method is a coroutine function (``async def``), the generated callback is
    an async generator instead. So a spider like this:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            async def parse_book(self, response: DummyResponse, page: BookPage):
                yield await page.to_item()

    can be shortened to:

    .. code-block:: python

        class BooksSpider(scrapy.Spider):
            name = "books"
            start_urls = ["http://books.toscrape.com/"]

            def parse(self, response):
                links = response.css(".image_container a")
                yield from response.follow_all(links, self.parse_book)

            parse_book = callback_for(BookPage)

    The generated callback can be used as a spider instance method or passed
    as an inline/anonymous argument. If you plan to use disk queues, define it
    as a spider attribute (as in the examples above), because in that case
    Scrapy is able to serialize your request object.
    """
    # The callback may be bound to a spider (first positional argument is
    # 'self') or used as a plain inline function (first positional argument is
    # the response). Accepting *args / **kwargs around the keyword-only
    # injectable avoids a TypeError in both situations.
    if not issubclass(page_or_item_cls, ItemPage):
        # Item class: the item arrives through the ``item`` argument and is
        # passed through as-is.
        def parse(*args, item: page_or_item_cls, **kwargs):  # type:ignore
            yield item

        callback = parse

    elif iscoroutinefunction(page_or_item_cls.to_item):
        # Page Object with an ``async def to_item``: await it inside an async
        # generator.
        async def async_parse(*args, page: page_or_item_cls, **kwargs):  # type: ignore
            yield await page.to_item()  # type: ignore

        callback = async_parse

    else:
        # Page Object with a regular, synchronous ``to_item``.
        def parse(*args, page: page_or_item_cls, **kwargs):  # type: ignore
            yield page.to_item()  # type: ignore

        callback = parse

    # Tag the function as one produced by this helper.
    setattr(callback, _CALLBACK_FOR_MARKER, True)
    return callback