Mirror of https://gitlab.sectorq.eu/jaydee/omv_backup.git
Synced 2025-10-31 02:21:10 +01:00

Commit: added v3
@@ -0,0 +1,2 @@
"""Index interaction code
"""
Binary file not shown. (4 binary files changed)
@@ -0,0 +1,505 @@
"""
The main purpose of this module is to expose LinkCollector.collect_sources().
"""

import collections
import email.message
import functools
import itertools
import json
import logging
import os
import urllib.parse
import urllib.request
from html.parser import HTMLParser
from optparse import Values
from typing import (
    TYPE_CHECKING,
    Callable,
    Dict,
    Iterable,
    List,
    MutableMapping,
    NamedTuple,
    Optional,
    Sequence,
    Tuple,
    Union,
)

from pip._vendor import requests
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

from pip._internal.exceptions import NetworkConnectionError
from pip._internal.models.link import Link
from pip._internal.models.search_scope import SearchScope
from pip._internal.network.session import PipSession
from pip._internal.network.utils import raise_for_status
from pip._internal.utils.filetypes import is_archive_file
from pip._internal.utils.misc import redact_auth_from_url
from pip._internal.vcs import vcs

from .sources import CandidatesFromPage, LinkSource, build_source

if TYPE_CHECKING:
    from typing import Protocol
else:
    Protocol = object

logger = logging.getLogger(__name__)

ResponseHeaders = MutableMapping[str, str]


def _match_vcs_scheme(url: str) -> Optional[str]:
    """Look for VCS schemes in the URL.

    Returns the matched VCS scheme, or None if there's no match.
    """
    for scheme in vcs.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in "+:":
            return scheme
    return None
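
# Illustrative sketch (not part of pip's module): _match_vcs_scheme() picks out
# URLs such as "git+https://host/repo.git" whose scheme is registered with
# pip's vcs support. Assuming "git" is among vcs.schemes:
#
#     _match_vcs_scheme("git+https://example.com/repo.git")  # -> "git"
#     _match_vcs_scheme("https://example.com/simple/")       # -> None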


class _NotAPIContent(Exception):
    def __init__(self, content_type: str, request_desc: str) -> None:
        super().__init__(content_type, request_desc)
        self.content_type = content_type
        self.request_desc = request_desc


def _ensure_api_header(response: Response) -> None:
    """
    Check the Content-Type header to ensure the response contains a Simple
    API Response.

    Raises `_NotAPIContent` if the content type is not a valid content-type.
    """
    content_type = response.headers.get("Content-Type", "Unknown")

    content_type_l = content_type.lower()
    if content_type_l.startswith(
        (
            "text/html",
            "application/vnd.pypi.simple.v1+html",
            "application/vnd.pypi.simple.v1+json",
        )
    ):
        return

    raise _NotAPIContent(content_type, response.request.method)


class _NotHTTP(Exception):
    pass


def _ensure_api_response(url: str, session: PipSession) -> None:
    """
    Send a HEAD request to the URL, and ensure the response contains a simple
    API Response.

    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
    `_NotAPIContent` if the content type is not a valid content type.
    """
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    if scheme not in {"http", "https"}:
        raise _NotHTTP()

    resp = session.head(url, allow_redirects=True)
    raise_for_status(resp)

    _ensure_api_header(resp)


def _get_simple_response(url: str, session: PipSession) -> Response:
    """Access a Simple API response with GET, and return the response.

    This consists of three parts:

    1. If the URL looks suspiciously like an archive, send a HEAD first to
       check the Content-Type is HTML or Simple API, to avoid downloading a
       large file. Raise `_NotHTTP` if the content type cannot be determined, or
       `_NotAPIContent` if it is not HTML or a Simple API.
    2. Actually perform the request. Raise HTTP exceptions on network failures.
    3. Check the Content-Type header to make sure we got a Simple API response,
       and raise `_NotAPIContent` otherwise.
    """
    if is_archive_file(Link(url).filename):
        _ensure_api_response(url, session=session)

    logger.debug("Getting page %s", redact_auth_from_url(url))

    resp = session.get(
        url,
        headers={
            "Accept": ", ".join(
                [
                    "application/vnd.pypi.simple.v1+json",
                    "application/vnd.pypi.simple.v1+html; q=0.1",
                    "text/html; q=0.01",
                ]
            ),
            # We don't want to blindly return cached data for
            # /simple/, because authors generally expect that
            # twine upload && pip install will function, but if
            # they've done a pip install in the last ~10 minutes
            # it won't. Thus by setting this to zero we will not
            # blindly use any cached data. The benefit of using
            # max-age=0 instead of no-cache is that we still
            # support conditional requests, so we still minimize
            # traffic sent in cases where the page hasn't changed
            # at all; we just always incur the round trip for the
            # conditional GET now instead of only once per
            # 10 minutes.
            # For more information, please see pypa/pip#5670.
            "Cache-Control": "max-age=0",
        },
    )
    raise_for_status(resp)

    # The check for archives above only works if the url ends with
    # something that looks like an archive. However that is not a
    # requirement of a URL. Unless we issue a HEAD request on every
    # url we cannot know ahead of time for sure if something is a
    # Simple API response or not. However we can check after we've
    # downloaded it.
    _ensure_api_header(resp)

    logger.debug(
        "Fetched page %s as %s",
        redact_auth_from_url(url),
        resp.headers.get("Content-Type", "Unknown"),
    )

    return resp


def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
    """Determine if we have any encoding information in our headers."""
    if headers and "Content-Type" in headers:
        m = email.message.Message()
        m["content-type"] = headers["Content-Type"]
        charset = m.get_param("charset")
        if charset:
            return str(charset)
    return None
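
# Illustrative sketch (not part of pip's module): email.message.Message does
# the RFC 2045 parameter parsing, so a header like
# "text/html; charset=ISO-8859-1" yields its charset without hand-rolled
# string splitting:
#
#     _get_encoding_from_headers({"Content-Type": "text/html; charset=ISO-8859-1"})
#     # -> "ISO-8859-1"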


class CacheablePageContent:
    def __init__(self, page: "IndexContent") -> None:
        assert page.cache_link_parsing
        self.page = page

    def __eq__(self, other: object) -> bool:
        return isinstance(other, type(self)) and self.page.url == other.page.url

    def __hash__(self) -> int:
        return hash(self.page.url)


class ParseLinks(Protocol):
    def __call__(self, page: "IndexContent") -> Iterable[Link]:
        ...


def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
    """
    Given a function that parses an Iterable[Link] from an IndexContent, cache the
    function's result (keyed by CacheablePageContent), unless the IndexContent
    `page` has `page.cache_link_parsing == False`.
    """

    @functools.lru_cache(maxsize=None)
    def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
        return list(fn(cacheable_page.page))

    @functools.wraps(fn)
    def wrapper_wrapper(page: "IndexContent") -> List[Link]:
        if page.cache_link_parsing:
            return wrapper(CacheablePageContent(page))
        return list(fn(page))

    return wrapper_wrapper
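
# Illustrative sketch (not part of pip's module): CacheablePageContent hashes
# and compares by page URL, so two IndexContent objects fetched from the same
# URL share one cached parse (assuming cache_link_parsing is left True):
#
#     page_a = IndexContent(b"", "text/html", None, "https://idx/simple/demo/")
#     page_b = IndexContent(b"", "text/html", None, "https://idx/simple/demo/")
#     parse_links(page_a) is parse_links(page_b)  # -> True, same cached list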


@with_cached_index_content
def parse_links(page: "IndexContent") -> Iterable[Link]:
    """
    Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
    """

    content_type_l = page.content_type.lower()
    if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
        data = json.loads(page.content)
        for file in data.get("files", []):
            link = Link.from_json(file, page.url)
            if link is None:
                continue
            yield link
        return

    parser = HTMLLinkParser(page.url)
    encoding = page.encoding or "utf-8"
    parser.feed(page.content.decode(encoding))

    url = page.url
    base_url = parser.base_url or url
    for anchor in parser.anchors:
        link = Link.from_element(anchor, page_url=url, base_url=base_url)
        if link is None:
            continue
        yield link
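
# Illustrative sketch of the PEP 691 (JSON) branch above, using a hypothetical
# single-file response body; Link.from_json() resolves the relative file URL
# against the page URL:
#
#     content = json.dumps({
#         "name": "demo",
#         "files": [{"filename": "demo-1.0.tar.gz",
#                    "url": "demo-1.0.tar.gz",
#                    "hashes": {}}],
#     }).encode()
#     page = IndexContent(content, "application/vnd.pypi.simple.v1+json",
#                         encoding=None, url="https://idx/simple/demo/")
#     [link.filename for link in parse_links(page)]  # -> ["demo-1.0.tar.gz"]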


class IndexContent:
    """Represents one response (or page), along with its URL"""

    def __init__(
        self,
        content: bytes,
        content_type: str,
        encoding: Optional[str],
        url: str,
        cache_link_parsing: bool = True,
    ) -> None:
        """
        :param encoding: the encoding to decode the given content.
        :param url: the URL from which the HTML was downloaded.
        :param cache_link_parsing: whether links parsed from this page's url
                                   should be cached. PyPI index urls should
                                   have this set to False, for example.
        """
        self.content = content
        self.content_type = content_type
        self.encoding = encoding
        self.url = url
        self.cache_link_parsing = cache_link_parsing

    def __str__(self) -> str:
        return redact_auth_from_url(self.url)


class HTMLLinkParser(HTMLParser):
    """
    HTMLParser that keeps the first base HREF and a list of all anchor
    elements' attributes.
    """

    def __init__(self, url: str) -> None:
        super().__init__(convert_charrefs=True)

        self.url: str = url
        self.base_url: Optional[str] = None
        self.anchors: List[Dict[str, Optional[str]]] = []

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
        if tag == "base" and self.base_url is None:
            href = self.get_href(attrs)
            if href is not None:
                self.base_url = href
        elif tag == "a":
            self.anchors.append(dict(attrs))

    def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
        for name, value in attrs:
            if name == "href":
                return value
        return None
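
# Illustrative sketch (not part of pip's module): the parser records the first
# <base href> and every anchor's attribute dict, which parse_links() then
# resolves into Link objects:
#
#     parser = HTMLLinkParser(url="https://idx/simple/demo/")
#     parser.feed('<base href="https://files.idx/">'
#                 '<a href="demo-1.0.tar.gz">demo-1.0.tar.gz</a>')
#     parser.base_url  # -> "https://files.idx/"
#     parser.anchors   # -> [{"href": "demo-1.0.tar.gz"}]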


def _handle_get_simple_fail(
    link: Link,
    reason: Union[str, Exception],
    meth: Optional[Callable[..., None]] = None,
) -> None:
    if meth is None:
        meth = logger.debug
    meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_index_content(
    response: Response, cache_link_parsing: bool = True
) -> IndexContent:
    encoding = _get_encoding_from_headers(response.headers)
    return IndexContent(
        response.content,
        response.headers["Content-Type"],
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing,
    )


def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
    url = link.url.split("#", 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning(
            "Cannot look at %s URL %s because it does not support lookup as web pages.",
            vcs_scheme,
            link,
        )
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib.parse.urlparse(url)
    if scheme == "file" and os.path.isdir(urllib.request.url2pathname(path)):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith("/"):
            url += "/"
        # TODO: In the future, it would be nice if pip supported PEP 691
        #       style responses in the file:// URLs, however there's no
        #       standard file extension for application/vnd.pypi.simple.v1+json
        #       so we'll need to come up with something on our own.
        url = urllib.parse.urljoin(url, "index.html")
        logger.debug(" file: URL is directory, getting %s", url)

    try:
        resp = _get_simple_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            "Skipping page %s because it looks like an archive, and cannot "
            "be checked by an HTTP HEAD request.",
            link,
        )
    except _NotAPIContent as exc:
        logger.warning(
            "Skipping page %s because the %s request got Content-Type: %s. "
            "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
            "application/vnd.pypi.simple.v1+html, and text/html",
            link,
            exc.request_desc,
            exc.content_type,
        )
    except NetworkConnectionError as exc:
        _handle_get_simple_fail(link, exc)
    except RetryError as exc:
        _handle_get_simple_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_simple_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_simple_fail(link, f"connection error: {exc}")
    except requests.Timeout:
        _handle_get_simple_fail(link, "timed out")
    else:
        return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
    return None


class CollectedSources(NamedTuple):
    find_links: Sequence[Optional[LinkSource]]
    index_urls: Sequence[Optional[LinkSource]]


class LinkCollector:

    """
    Responsible for collecting Link objects from all configured locations,
    making network requests as needed.

    The class's main method is its collect_sources() method.
    """

    def __init__(
        self,
        session: PipSession,
        search_scope: SearchScope,
    ) -> None:
        self.search_scope = search_scope
        self.session = session

    @classmethod
    def create(
        cls,
        session: PipSession,
        options: Values,
        suppress_no_index: bool = False,
    ) -> "LinkCollector":
        """
        :param session: The Session to use to make requests.
        :param suppress_no_index: Whether to ignore the --no-index option
            when constructing the SearchScope object.
        """
        index_urls = [options.index_url] + options.extra_index_urls
        if options.no_index and not suppress_no_index:
            logger.debug(
                "Ignoring indexes: %s",
                ",".join(redact_auth_from_url(url) for url in index_urls),
            )
            index_urls = []

        # Make sure find_links is a list before passing to create().
        find_links = options.find_links or []

        search_scope = SearchScope.create(
            find_links=find_links,
            index_urls=index_urls,
            no_index=options.no_index,
        )
        link_collector = LinkCollector(
            session=session,
            search_scope=search_scope,
        )
        return link_collector

    @property
    def find_links(self) -> List[str]:
        return self.search_scope.find_links

    def fetch_response(self, location: Link) -> Optional[IndexContent]:
        """
        Fetch an HTML page containing package links.
        """
        return _get_index_content(location, session=self.session)

    def collect_sources(
        self,
        project_name: str,
        candidates_from_page: CandidatesFromPage,
    ) -> CollectedSources:
        # The OrderedDict calls deduplicate sources by URL.
        index_url_sources = collections.OrderedDict(
            build_source(
                loc,
                candidates_from_page=candidates_from_page,
                page_validator=self.session.is_secure_origin,
                expand_dir=False,
                cache_link_parsing=False,
            )
            for loc in self.search_scope.get_index_urls_locations(project_name)
        ).values()
        find_links_sources = collections.OrderedDict(
            build_source(
                loc,
                candidates_from_page=candidates_from_page,
                page_validator=self.session.is_secure_origin,
                expand_dir=True,
                cache_link_parsing=True,
            )
            for loc in self.find_links
        ).values()

        if logger.isEnabledFor(logging.DEBUG):
            lines = [
                f"* {s.link}"
                for s in itertools.chain(find_links_sources, index_url_sources)
                if s is not None and s.link is not None
            ]
            lines = [
                f"{len(lines)} location(s) to search "
                f"for versions of {project_name}:"
            ] + lines
            logger.debug("\n".join(lines))

        return CollectedSources(
            find_links=list(find_links_sources),
            index_urls=list(index_url_sources),
        )
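
A minimal usage sketch of the module above, assuming a default-constructed
PipSession and hand-built option Values (this is pip-internal API, not a
supported interface):

    from optparse import Values

    options = Values()
    options.index_url = "https://pypi.org/simple"
    options.extra_index_urls = []
    options.no_index = False
    options.find_links = []

    session = PipSession()
    collector = LinkCollector.create(session, options=options)
    content = collector.fetch_response(Link("https://pypi.org/simple/pip/"))
    if content is not None:
        links = list(parse_links(content))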
										
											
File diff suppressed because it is too large.

venv/lib/python3.11/site-packages/pip/_internal/index/sources.py (Normal file, 224 lines)
@@ -0,0 +1,224 @@
import logging
import mimetypes
import os
import pathlib
from typing import Callable, Iterable, Optional, Tuple

from pip._internal.models.candidate import InstallationCandidate
from pip._internal.models.link import Link
from pip._internal.utils.urls import path_to_url, url_to_path
from pip._internal.vcs import is_url

logger = logging.getLogger(__name__)

FoundCandidates = Iterable[InstallationCandidate]
FoundLinks = Iterable[Link]
CandidatesFromPage = Callable[[Link], Iterable[InstallationCandidate]]
PageValidator = Callable[[Link], bool]


class LinkSource:
    @property
    def link(self) -> Optional[Link]:
        """Returns the underlying link, if there's one."""
        raise NotImplementedError()

    def page_candidates(self) -> FoundCandidates:
        """Candidates found by parsing an archive listing HTML file."""
        raise NotImplementedError()

    def file_links(self) -> FoundLinks:
        """Links found by specifying archives directly."""
        raise NotImplementedError()


def _is_html_file(file_url: str) -> bool:
    return mimetypes.guess_type(file_url, strict=False)[0] == "text/html"
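
# Illustrative sketch (not part of pip's module): only the extension matters,
# since mimetypes.guess_type() works from the standard type table:
#
#     _is_html_file("file:///srv/wheelhouse/index.html")       # -> True
#     _is_html_file("file:///srv/wheelhouse/demo-1.0.tar.gz")  # -> False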


class _FlatDirectorySource(LinkSource):
    """Link source specified by ``--find-links=<path-to-dir>``.

    This looks at the content of the directory, and returns:

    * ``page_candidates``: Links listed on each HTML file in the directory.
    * ``file_links``: Archives in the directory.
    """

    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        path: str,
    ) -> None:
        self._candidates_from_page = candidates_from_page
        self._path = pathlib.Path(os.path.realpath(path))

    @property
    def link(self) -> Optional[Link]:
        return None

    def page_candidates(self) -> FoundCandidates:
        for path in self._path.iterdir():
            url = path_to_url(str(path))
            if not _is_html_file(url):
                continue
            yield from self._candidates_from_page(Link(url))

    def file_links(self) -> FoundLinks:
        for path in self._path.iterdir():
            url = path_to_url(str(path))
            if _is_html_file(url):
                continue
            yield Link(url)


class _LocalFileSource(LinkSource):
    """``--find-links=<path-or-url>`` or ``--[extra-]index-url=<path-or-url>``.

    If a URL is supplied, it must be a ``file:`` URL. If a path is supplied to
    the option, it is converted to a URL first. This returns:

    * ``page_candidates``: Links listed on an HTML file.
    * ``file_links``: The non-HTML file.
    """

    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        link: Link,
    ) -> None:
        self._candidates_from_page = candidates_from_page
        self._link = link

    @property
    def link(self) -> Optional[Link]:
        return self._link

    def page_candidates(self) -> FoundCandidates:
        if not _is_html_file(self._link.url):
            return
        yield from self._candidates_from_page(self._link)

    def file_links(self) -> FoundLinks:
        if _is_html_file(self._link.url):
            return
        yield self._link


class _RemoteFileSource(LinkSource):
    """``--find-links=<url>`` or ``--[extra-]index-url=<url>``.

    This returns:

    * ``page_candidates``: Links listed on an HTML file.
    * ``file_links``: The non-HTML file.
    """

    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        page_validator: PageValidator,
        link: Link,
    ) -> None:
        self._candidates_from_page = candidates_from_page
        self._page_validator = page_validator
        self._link = link

    @property
    def link(self) -> Optional[Link]:
        return self._link

    def page_candidates(self) -> FoundCandidates:
        if not self._page_validator(self._link):
            return
        yield from self._candidates_from_page(self._link)

    def file_links(self) -> FoundLinks:
        yield self._link


class _IndexDirectorySource(LinkSource):
    """``--[extra-]index-url=<path-to-directory>``.

    This is treated like a remote URL; ``candidates_from_page`` contains logic
    for this by appending ``index.html`` to the link.
    """

    def __init__(
        self,
        candidates_from_page: CandidatesFromPage,
        link: Link,
    ) -> None:
        self._candidates_from_page = candidates_from_page
        self._link = link

    @property
    def link(self) -> Optional[Link]:
        return self._link

    def page_candidates(self) -> FoundCandidates:
        yield from self._candidates_from_page(self._link)

    def file_links(self) -> FoundLinks:
        return ()


def build_source(
    location: str,
    *,
    candidates_from_page: CandidatesFromPage,
    page_validator: PageValidator,
    expand_dir: bool,
    cache_link_parsing: bool,
) -> Tuple[Optional[str], Optional[LinkSource]]:

    path: Optional[str] = None
    url: Optional[str] = None
    if os.path.exists(location):  # Is a local path.
        url = path_to_url(location)
        path = location
    elif location.startswith("file:"):  # A file: URL.
        url = location
        path = url_to_path(location)
    elif is_url(location):
        url = location

    if url is None:
        msg = (
            "Location '%s' is ignored: "
            "it is either a non-existing path or lacks a specific scheme."
        )
        logger.warning(msg, location)
        return (None, None)

    if path is None:
        source: LinkSource = _RemoteFileSource(
            candidates_from_page=candidates_from_page,
            page_validator=page_validator,
            link=Link(url, cache_link_parsing=cache_link_parsing),
        )
        return (url, source)

    if os.path.isdir(path):
        if expand_dir:
            source = _FlatDirectorySource(
                candidates_from_page=candidates_from_page,
                path=path,
            )
        else:
            source = _IndexDirectorySource(
                candidates_from_page=candidates_from_page,
                link=Link(url, cache_link_parsing=cache_link_parsing),
            )
        return (url, source)
    elif os.path.isfile(path):
        source = _LocalFileSource(
            candidates_from_page=candidates_from_page,
            link=Link(url, cache_link_parsing=cache_link_parsing),
        )
        return (url, source)
    logger.warning(
        "Location '%s' is ignored: it is neither a file nor a directory.",
        location,
    )
    return (url, None)
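
# Illustrative sketch of build_source()'s dispatch (the URL and lambdas are
# hypothetical stand-ins):
#
#     url, source = build_source(
#         "https://pypi.org/simple/",
#         candidates_from_page=lambda link: [],
#         page_validator=lambda link: True,
#         expand_dir=False,
#         cache_link_parsing=False,
#     )
#     # No local path exists and is_url() matches, so we get a
#     # _RemoteFileSource wrapping Link("https://pypi.org/simple/").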