Source code for scrapy.spidermiddlewares.referer

"""
RefererMiddleware: populates Request referer field, based on the Response which
originated it.
"""

from __future__ import annotations

import warnings
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, cast
from urllib.parse import urlparse
from warnings import warn

from scrapy.exceptions import NotConfigured
from scrapy.http import Request, Response
from scrapy.spidermiddlewares.base import BaseSpiderMiddleware
from scrapy.utils.misc import load_object
from scrapy.utils.python import _looks_like_import_path, to_unicode
from scrapy.utils.url import strip_url

if TYPE_CHECKING:
    # typing.Self requires Python 3.11
    from typing_extensions import Self, TypedDict, Unpack

    from scrapy.crawler import Crawler
    from scrapy.settings import BaseSettings

    # Typing-only description of the extra keyword arguments accepted by
    # RefererMiddleware.policy(): "resp_or_url" is the deprecated spelling of
    # that method's "response" parameter.
    class _PolicyKwargs(TypedDict, total=False):
        resp_or_url: Response | str


# URL schemes treated as "local": ReferrerPolicy uses this tuple as its default
# NOREFERRER_SCHEMES, i.e. stripped_referrer()/origin_referrer() return None
# for URLs whose scheme is listed here.
LOCAL_SCHEMES: tuple[str, ...] = (
    "about",
    "blob",
    "data",
    "filesystem",
)

# Policy name tokens. Each one is the ``name`` attribute of the matching
# ReferrerPolicy subclass below and the key under which RefererMiddleware
# registers that class in its ``policies`` mapping.
POLICY_NO_REFERRER = "no-referrer"
POLICY_NO_REFERRER_WHEN_DOWNGRADE = "no-referrer-when-downgrade"
POLICY_SAME_ORIGIN = "same-origin"
POLICY_ORIGIN = "origin"
POLICY_STRICT_ORIGIN = "strict-origin"
POLICY_ORIGIN_WHEN_CROSS_ORIGIN = "origin-when-cross-origin"
POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN = "strict-origin-when-cross-origin"
POLICY_UNSAFE_URL = "unsafe-url"
# Name of DefaultReferrerPolicy (not one of the W3C policy tokens above).
POLICY_SCRAPY_DEFAULT = "scrapy-default"


class ReferrerPolicy(ABC):
    """Abstract base class for referrer policies.

    Subclasses implement :meth:`referrer`; the remaining methods are shared
    helpers for stripping URLs and classifying their schemes.
    """

    # Schemes for which stripped_referrer() / origin_referrer() return None.
    NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES
    # Policy name token; set by concrete subclasses.
    name: str

    @abstractmethod
    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the referrer value to send for a request to *request_url*
        that originates from *response_url*, or ``None`` to send nothing."""
        raise NotImplementedError

    def stripped_referrer(self, url: str) -> str | None:
        """Return *url* stripped for use as a referrer, or ``None`` when its
        scheme is listed in :attr:`NOREFERRER_SCHEMES`."""
        scheme = urlparse(url).scheme
        if scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.strip_url(url)

    def origin_referrer(self, url: str) -> str | None:
        """Return the origin of *url*, or ``None`` when its scheme is listed
        in :attr:`NOREFERRER_SCHEMES`."""
        scheme = urlparse(url).scheme
        if scheme in self.NOREFERRER_SCHEMES:
            return None
        return self.origin(url)

    def strip_url(self, url: str, origin_only: bool = False) -> str | None:
        """Strip *url* for use as a referrer.

        Modelled on https://www.w3.org/TR/referrer-policy/#strip-url:

        - an empty/falsy *url* yields ``None`` (no referrer);
        - credentials, the fragment and the default port are stripped;
        - if *origin_only* is true, the stripping helper is additionally asked
          to keep only the origin (per the spec: path and query are dropped).

        Note that this method does not look at the URL scheme: the "local
        scheme" check of the spec is performed by :meth:`stripped_referrer`
        and :meth:`origin_referrer`, not here.
        """
        if url:
            return strip_url(
                url,
                strip_credentials=True,
                strip_fragment=True,
                strip_default_port=True,
                origin_only=origin_only,
            )
        return None

    def origin(self, url: str) -> str | None:
        """Return the serialized origin of a request or response URL, i.e.
        *url* passed through :meth:`strip_url` with the origin-only flag set."""
        return self.strip_url(url, origin_only=True)

    def potentially_trustworthy(self, url: str) -> bool:
        """Return whether *url* is considered potentially trustworthy:
        ``data`` URLs never are, anything else is if it is TLS-protected."""
        # Note: this does not follow https://w3c.github.io/webappsec-secure-contexts/#is-url-trustworthy
        return urlparse(url).scheme != "data" and self.tls_protected(url)

    def tls_protected(self, url: str) -> bool:
        """Return whether the scheme of *url* is ``https`` or ``ftps``."""
        scheme = urlparse(url).scheme
        return scheme in ("https", "ftps")
class NoReferrerPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer

    "no-referrer" is the simplest policy: no referrer information is sent
    with any request, whatever its origin. The header is omitted entirely.
    """

    name: str = POLICY_NO_REFERRER

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Always return ``None``: this policy never sends a referrer."""
        return None
class NoReferrerWhenDowngradePolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-no-referrer-when-downgrade

    With "no-referrer-when-downgrade" the full (stripped) URL is sent:

    - from a TLS-protected page to a TLS-protected URL, and
    - from a page that is not TLS-protected to any URL.

    A request from a TLS-protected page to a URL that is not TLS-protected is
    a downgrade and carries no referrer information: no Referer HTTP header
    is sent.

    This is a user agent's default behavior, if no policy is otherwise specified.
    """

    name: str = POLICY_NO_REFERRER_WHEN_DOWNGRADE

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url*, or ``None`` on a TLS downgrade."""
        is_downgrade = self.tls_protected(response_url) and not self.tls_protected(
            request_url
        )
        if is_downgrade:
            return None
        return self.stripped_referrer(response_url)
class SameOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-same-origin

    With "same-origin" the full URL, stripped for use as a referrer, is sent
    only with same-origin requests. Cross-origin requests carry no referrer
    information: no Referer HTTP header is sent.
    """

    name: str = POLICY_SAME_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url* for same-origin requests and
        ``None`` for cross-origin ones."""
        if self.origin(response_url) != self.origin(request_url):
            return None
        return self.stripped_referrer(response_url)
class OriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin

    With "origin" only the ASCII serialization of the origin of the request
    client is sent as referrer information, for same-origin and cross-origin
    requests alike.
    """

    name: str = POLICY_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the origin of *response_url*, regardless of *request_url*."""
        origin_only = self.origin_referrer(response_url)
        return origin_only
class StrictOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin

    With "strict-origin" the ASCII serialization of the origin of the request
    client is sent when making requests:

    - from a TLS-protected environment settings object to a potentially
      trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected request clients to non-potentially trustworthy
    URLs, on the other hand, contain no referrer information: no Referer HTTP
    header is sent.
    """

    name: str = POLICY_STRICT_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the origin of *response_url*, or ``None`` when going from a
        TLS-protected page to a URL that is not potentially trustworthy."""
        is_downgrade = self.tls_protected(
            response_url
        ) and not self.potentially_trustworthy(request_url)
        return None if is_downgrade else self.origin_referrer(response_url)
class OriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-origin-when-cross-origin

    The "origin-when-cross-origin" policy specifies that a full URL,
    stripped for use as a referrer, is sent as referrer information when
    making same-origin requests from a particular request client, and only
    the ASCII serialization of the origin of the request client is sent as
    referrer information when making cross-origin requests from a particular
    request client.
    """

    name: str = POLICY_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the referrer for a request to *request_url* issued from
        *response_url*.

        Same-origin requests get the stripped *response_url*; cross-origin
        requests get only its origin. In both cases ``None`` is returned when
        the scheme of *response_url* is listed in ``NOREFERRER_SCHEMES``.
        """
        if self.origin(response_url) == self.origin(request_url):
            return self.stripped_referrer(response_url)
        # Go through origin_referrer() rather than returning the bare origin,
        # so that NOREFERRER_SCHEMES (e.g. "data", "about") is honoured for
        # cross-origin requests too, as the other origin-sending policies do.
        return self.origin_referrer(response_url)
class StrictOriginWhenCrossOriginPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-strict-origin-when-cross-origin

    With "strict-origin-when-cross-origin" a full URL, stripped for use as a
    referrer, is sent with same-origin requests, and only the ASCII
    serialization of the origin of the request client is sent with
    cross-origin requests:

    - from a TLS-protected environment settings object to a potentially
      trustworthy URL, and
    - from non-TLS-protected environment settings objects to any origin.

    Requests from TLS-protected clients to non-potentially trustworthy URLs,
    on the other hand, contain no referrer information: no Referer HTTP
    header is sent.
    """

    name: str = POLICY_STRICT_ORIGIN_WHEN_CROSS_ORIGIN

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url* for same-origin requests; for
        cross-origin ones return its origin, or ``None`` on a downgrade."""
        same_origin = self.origin(response_url) == self.origin(request_url)
        if same_origin:
            return self.stripped_referrer(response_url)
        is_downgrade = self.tls_protected(
            response_url
        ) and not self.potentially_trustworthy(request_url)
        if is_downgrade:
            return None
        return self.origin_referrer(response_url)
class UnsafeUrlPolicy(ReferrerPolicy):
    """
    https://www.w3.org/TR/referrer-policy/#referrer-policy-unsafe-url

    With "unsafe-url" a full URL, stripped for use as a referrer, is sent
    along with both cross-origin and same-origin requests.

    Note: The policy's name doesn't lie; it is unsafe.
    This policy will leak origins and paths from TLS-protected resources
    to insecure origins.
    Carefully consider the impact of setting such a policy for potentially
    sensitive documents.
    """

    name: str = POLICY_UNSAFE_URL

    def referrer(self, response_url: str, request_url: str) -> str | None:
        """Return the stripped *response_url*, regardless of *request_url*."""
        full_url = self.stripped_referrer(response_url)
        return full_url
class DefaultReferrerPolicy(NoReferrerWhenDowngradePolicy):
    """
    Scrapy's variant of "no-referrer-when-downgrade": it behaves the same,
    except that "Referer" is also not sent when the parent request used the
    ``file://`` or ``s3://`` scheme.
    """

    # The inherited local schemes, extended with "file" and "s3".
    NOREFERRER_SCHEMES: tuple[str, ...] = LOCAL_SCHEMES + ("file", "s3")
    name: str = POLICY_SCRAPY_DEFAULT
class RefererMiddleware(BaseSpiderMiddleware):
    """Spider middleware that sets the ``Referer`` header of requests, based
    on the response that originated them and on the applicable referrer
    policy."""

    def __init__(self, settings: BaseSettings | None = None):  # pylint: disable=super-init-not-called
        """Register the built-in policies and, if *settings* is given, apply
        the ``REFERRER_POLICIES`` and ``REFERRER_POLICY`` settings.

        Without *settings*, only the built-in policies are registered and
        :class:`DefaultReferrerPolicy` is the default policy.
        """
        self.default_policy: type[ReferrerPolicy] = DefaultReferrerPolicy
        self.policies: dict[str, type[ReferrerPolicy]] = {
            p.name: p
            for p in (
                NoReferrerPolicy,
                NoReferrerWhenDowngradePolicy,
                SameOriginPolicy,
                OriginPolicy,
                StrictOriginPolicy,
                OriginWhenCrossOriginPolicy,
                StrictOriginWhenCrossOriginPolicy,
                UnsafeUrlPolicy,
                DefaultReferrerPolicy,
            )
        }
        # Reference: https://www.w3.org/TR/referrer-policy/#referrer-policy-empty-string
        self.policies[""] = NoReferrerWhenDowngradePolicy
        if settings is None:
            return
        # REFERRER_POLICIES maps policy names to import paths; a None value
        # removes the policy registered under that name.
        setting_policies = settings.getdict("REFERRER_POLICIES")
        for policy_name, policy_class_import_path in setting_policies.items():
            if policy_class_import_path is None:
                del self.policies[policy_name]
            else:
                self.policies[policy_name] = load_object(policy_class_import_path)
        settings_policy = self._load_policy_class(
            settings.get("REFERRER_POLICY"), allow_import_path=True
        )
        # With warning_only left at False, _load_policy_class() either returns
        # a class or raises, so this only narrows the type.
        assert settings_policy
        self.default_policy = settings_policy

    @classmethod
    def from_crawler(cls, crawler: Crawler) -> Self:
        """Build the middleware from *crawler* settings.

        Raises :exc:`NotConfigured` when ``REFERER_ENABLED`` is false.
        """
        if not crawler.settings.getbool("REFERER_ENABLED"):
            raise NotConfigured
        return cls(crawler.settings)

    def policy(
        self,
        response: Response | str | None = None,
        request: Request | None = None,
        **kwargs: Unpack[_PolicyKwargs],
    ) -> ReferrerPolicy:
        """Return the referrer policy to use for *request* based on *request*
        meta, *response* and settings.

        - if a valid policy is set in Request meta, it is used.
        - if the policy is set in meta but is wrong (e.g. a typo error), the
          policy from settings is used
        - if the policy is not set in Request meta, but there is a
          Referrer-Policy header in the parent response, it is used if valid
        - otherwise, the policy from settings is used.

        Both *response* and *request* are required; :exc:`TypeError` is raised
        if either is missing. Passing the response as ``resp_or_url`` or as a
        URL string is deprecated.
        """
        if "resp_or_url" in kwargs:
            if response is not None:
                raise TypeError("Cannot pass both 'response' and 'resp_or_url'")
            response = kwargs.pop("resp_or_url")
            warn(
                "Passing 'resp_or_url' is deprecated, use 'response' instead.",
                DeprecationWarning,
                stacklevel=2,
            )
        if response is None:
            raise TypeError("Missing required argument: 'response'")
        if request is None:
            raise TypeError("Missing required argument: 'request'")
        if isinstance(response, str):
            warn(
                "Passing a response URL to RefererMiddleware.policy() instead "
                "of a Response object is deprecated.",
                DeprecationWarning,
                stacklevel=2,
            )
        # Request meta may name an import path; a value taken from the
        # response header below may not.
        allow_import_path = True
        policy_name = request.meta.get("referrer_policy")
        if policy_name is None and isinstance(response, Response):
            policy_header = response.headers.get("Referrer-Policy")
            if policy_header is not None:
                policy_name = to_unicode(policy_header.decode("latin1"))
                allow_import_path = False
        if policy_name is None:
            return self.default_policy()
        cls = self._load_policy_class(
            policy_name, warning_only=True, allow_import_path=allow_import_path
        )
        return cls() if cls else self.default_policy()

    def _load_policy_class(
        self,
        policy: str,
        warning_only: bool = False,
        *,
        allow_import_path: bool = False,
    ) -> type[ReferrerPolicy] | None:
        """Load the :class:`ReferrerPolicy` class to use for *policy*.

        *policy* may be any of the following:

        - A standard policy name, e.g. ``"no-referrer"``,
          ``"origin-when-cross-origin"``, etc.

        - The special ``"scrapy-default"`` policy.

        - The import path of a :class:`ReferrerPolicy` subclass, e.g.
          ``"scrapy.spidermiddlewares.referer.NoReferrerPolicy"`` or
          ``"myproject.policies.CustomReferrerPolicy"``.

        If *warning_only* is ``False`` (default) and *policy* cannot be turned
        into a :class:`ReferrerPolicy` subclass, a :exc:`RuntimeError` is
        raised; an import path that fails to import propagates the original
        import error. If *warning_only* is ``True``, a warning is logged and
        ``None`` is returned instead, including when an import path fails to
        import.

        If *allow_import_path* is ``False`` (default), import paths are not
        allowed, resulting in :exc:`RuntimeError` or ``None``. If ``True``,
        they are allowed. Use ``True`` only if you trust the source of the
        *policy* value.
        """
        import_error: Exception | None = None
        if allow_import_path:
            try:
                return cast("type[ReferrerPolicy]", load_object(policy))
            except ValueError:
                # Not an import path: fall through to the policy-name lookup.
                pass
            except (ImportError, NameError) as exc:
                # NOTE(review): assumes load_object() signals a missing module
                # with ImportError and a missing attribute with NameError;
                # confirm against scrapy.utils.misc.load_object.
                # In strict mode keep propagating the original error; in
                # warning-only mode honour the "warn and return None" contract
                # instead of letting the error escape.
                if not warning_only:
                    raise
                import_error = exc
        policy_names = [
            policy_name.strip() for policy_name in policy.lower().split(",")
        ]
        # https://www.w3.org/TR/referrer-policy/#parse-referrer-policy-from-header
        # The last recognised token wins, hence the reversed scan.
        for policy_name in reversed(policy_names):
            if policy_name in self.policies:
                return self.policies[policy_name]
        msg = f"Could not load referrer policy {policy!r}"
        if import_error is not None:
            msg += f" ({import_error})"
        elif not allow_import_path and _looks_like_import_path(policy):
            msg += " (import paths from the response Referrer-Policy header are not allowed)"
        if not warning_only:
            raise RuntimeError(msg)
        warnings.warn(msg, RuntimeWarning, stacklevel=2)
        return None

    def get_processed_request(
        self, request: Request, response: Response | None
    ) -> Request | None:
        """Set the ``Referer`` header of *request* from *response*, unless the
        header is already set or the policy yields no referrer.

        Requests without a parent response (start requests) are returned
        untouched.
        """
        if response is None:
            # start requests
            return request
        referrer = self.policy(response, request).referrer(response.url, request.url)
        if referrer is not None:
            request.headers.setdefault("Referer", referrer)
        return request