diff --git a/python/private/pypi/extension.bzl b/python/private/pypi/extension.bzl index 7354a67e67..bd384722fc 100644 --- a/python/private/pypi/extension.bzl +++ b/python/private/pypi/extension.bzl @@ -225,6 +225,7 @@ You cannot use both the additive_build_content and additive_build_content_file a # See `hub_builder.bzl%hub_builder()` for `HubBuilder` pip_hub_map = {} simpleapi_cache = {} + facts = {} for mod in module_ctx.modules: for pip_attr in mod.tags.parse: @@ -242,6 +243,7 @@ You cannot use both the additive_build_content and additive_build_content_file a evaluate_markers_fn = kwargs.get("evaluate_markers", None), available_interpreters = kwargs.get("available_interpreters", INTERPRETER_LABELS), logger = repo_utils.logger(module_ctx, "pypi:hub:" + hub_name), + facts = facts, ) pip_hub_map[pip_attr.hub_name] = builder elif pip_hub_map[hub_name].module_name != mod.name: @@ -288,6 +290,25 @@ You cannot use both the additive_build_content and additive_build_content_file a hub_group_map[hub.name] = out.group_map hub_whl_map[hub.name] = out.whl_map + facts = { + "fact_version": facts.get("fact_version"), + } | { + index_url: { + k: _sorted_dict(f.get(k)) + for k in [ + "dist_filenames", + "dist_hashes", + "dist_yanked", + ] + if f.get(k) + } + for index_url, f in facts.items() + if index_url not in ["fact_version"] + } + if len(facts) == 1: + # only version is present, skip writing + facts = None + return struct( config = config, exposed_packages = exposed_packages, @@ -296,6 +317,7 @@ You cannot use both the additive_build_content and additive_build_content_file a hub_whl_map = hub_whl_map, whl_libraries = whl_libraries, whl_mods = whl_mods, + facts = facts, platform_config_settings = { hub_name: { platform_name: sorted([str(Label(cv)) for cv in p.config_settings]) @@ -305,6 +327,12 @@ You cannot use both the additive_build_content and additive_build_content_file a }, ) +def _sorted_dict(d): + if not d: + return {} + + return {k: v for k, v in sorted(d.items())} + def _pip_impl(module_ctx): """Implementation of a class tag that creates the pip hub and corresponding pip spoke whl repositories. @@ -393,9 +421,11 @@ def _pip_impl(module_ctx): groups = mods.hub_group_map.get(hub_name), ) - return module_ctx.extension_metadata( - reproducible = True, - ) + kwargs = {"reproducible": True} + if mods.facts: + kwargs["facts"] = mods.facts + + return module_ctx.extension_metadata(**kwargs) _default_attrs = { "arch_name": attr.string( diff --git a/python/private/pypi/hub_builder.bzl b/python/private/pypi/hub_builder.bzl index f0aa6a73bc..2bc12bf7b6 100644 --- a/python/private/pypi/hub_builder.bzl +++ b/python/private/pypi/hub_builder.bzl @@ -31,6 +31,7 @@ def hub_builder( simpleapi_download_fn, evaluate_markers_fn, logger, + facts = None, simpleapi_cache = {}): """Return a hub builder instance @@ -47,6 +48,7 @@ def hub_builder( used during the `repository_rule` and must be always compatible with the host. simpleapi_download_fn: the function used to download from SimpleAPI. simpleapi_cache: the cache for the download results. + facts: the facts if they are available. logger: the logger for this builder. """ @@ -89,6 +91,10 @@ def hub_builder( # Functions to download according to the config # dict[str python_version, callable] _get_index_urls = {}, + # Contains the dict to store the facts to be written to the lockfile that + # can be safely cached for future invocations. + # dict[str, dict[str, str]] + _facts = facts, # Tells whether to use the downloader for a package. # dict[str python_version, dict[str package_name, bool use_downloader]] _use_downloader = {}, @@ -399,11 +405,16 @@ def _set_get_index_urls(self, pip_attr): d for d in distributions if _use_downloader(self, python_version, d) - ], + ] if type(distributions) == "list" else { + d: versions + for d, versions in distributions.items() + if _use_downloader(self, python_version, d) + }, envsubst = pip_attr.envsubst, # Auth related info netrc = pip_attr.netrc, auth_patterns = pip_attr.auth_patterns, + facts = self._facts, ), cache = self._simpleapi_cache, parallel_download = pip_attr.parallel_download, diff --git a/python/private/pypi/parse_requirements.bzl b/python/private/pypi/parse_requirements.bzl index 5c05c753fd..f9a6e672bf 100644 --- a/python/private/pypi/parse_requirements.bzl +++ b/python/private/pypi/parse_requirements.bzl @@ -170,16 +170,15 @@ def parse_requirements( index_urls = {} if get_index_urls: - index_urls = get_index_urls( - ctx, - # Use list({}) as a way to have a set - list({ - req.distribution: None - for reqs in requirements_by_platform.values() - for req in reqs.values() - if not req.srcs.url - }), - ) + distributions = {} + for reqs in requirements_by_platform.values(): + for req in reqs.values(): + if req.srcs.url: + continue + + distributions.setdefault(req.distribution, []).append(req.srcs.version) + + index_urls = get_index_urls(ctx, distributions) ret = [] for name, reqs in sorted(requirements_by_platform.items()): diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl index a41f0750c4..da11a77635 100644 --- a/python/private/pypi/parse_simpleapi_html.bzl +++ b/python/private/pypi/parse_simpleapi_html.bzl @@ -16,12 +16,14 @@ Parse SimpleAPI HTML in Starlark. """ -def parse_simpleapi_html(*, url, content): +def parse_simpleapi_html(*, url, content, distribution = None, return_absolute = True): """Get the package URLs for given shas by parsing the Simple API HTML. Args: url(str): The URL that the HTML content can be downloaded from. + distribution(str): TODO content(str): The Simple API HTML content. + return_absolute: {type}`bool` TODO Returns: A list of structs with: @@ -33,6 +35,9 @@ def parse_simpleapi_html(*, url, content): present, then the 'metadata_url' is also present. Defaults to "". * metadata_url: The URL for the METADATA if we can download it. Defaults to "". """ + if not distribution: + _, _, distribution = url.strip("/").rpartition("/") + sdists = {} whls = {} lines = content.split("") maybe_metadata, _, filename = head.rpartition(">") - version = _version(filename) + version = pkg_version(filename, distribution) sha256s_by_version.setdefault(version, []).append(sha256) metadata_sha256 = "" @@ -79,13 +85,17 @@ def parse_simpleapi_html(*, url, content): break if filename.endswith(".whl"): + metadata_url = metadata_url or "" + if return_absolute and metadata_url: + metadata_url = absolute_url(index_url = url, url = metadata_url) + whls[sha256] = struct( filename = filename, version = version, url = dist_url, sha256 = sha256, metadata_sha256 = metadata_sha256, - metadata_url = _absolute_url(url, metadata_url) if metadata_url else "", + metadata_url = metadata_url, yanked = yanked, ) else: @@ -110,18 +120,36 @@ _SDIST_EXTS = [ ".zip", ] -def _version(filename): +def pkg_version(filename, distribution = None): + """pkg_version extracts the version from the filename. + + TODO: move this to a different location + + Args: + filename: TODO + distribution: TODO + + Returns: + version string + """ # See https://packaging.python.org/en/latest/specifications/binary-distribution-format/#binary-distribution-format - _, _, tail = filename.partition("-") - version, _, _ = tail.partition("-") - if version != tail: - # The format is {name}-{version}-{whl_specifiers}.whl - return version + if filename.endswith(".whl"): + _, _, tail = filename.partition("-") + version, _, _ = tail.partition("-") + if version != tail: + # The format is {name}-{version}-{whl_specifiers}.whl + return version + + if not distribution: + fail("for parsing sdists passing 'distribution' is mandatory") # NOTE @aignas 2025-03-29: most of the files are wheels, so this is not the common path # {name}-{version}.{ext} + # TODO @aignas 2026-01-20: test for handling dashes in names, can't think of any other way to + # get the version from the filename but to pass in the distribution name to this function. + version = filename[len(distribution) + 1:] for ext in _SDIST_EXTS: version, _, _ = version.partition(ext) # build or name @@ -147,21 +175,30 @@ def _is_downloadable(url): """ return url.startswith("http://") or url.startswith("https://") or url.startswith("file://") -def _absolute_url(index_url, candidate): - if candidate == "": - return candidate +def absolute_url(*, index_url, url): + """Return an absolute URL in case the url is not absolute. + + Args: + index_url: {type}`str` The index_url. + url: {type}`str` The url of the artifact. + + Returns: + `url` if it is absolute, or absolute URL based on the `index_url`. + """ + if url == "": + return url - if _is_downloadable(candidate): - return candidate + if _is_downloadable(url): + return url - if candidate.startswith("/"): + if url.startswith("/"): # absolute path root_directory = _get_root_directory(index_url) - return "{}{}".format(root_directory, candidate) + return "{}{}".format(root_directory, url) - if candidate.startswith(".."): + if url.startswith(".."): # relative path with up references - candidate_parts = candidate.split("..") + candidate_parts = url.split("..") last = candidate_parts[-1] for _ in range(len(candidate_parts) - 1): index_url, _, _ = index_url.rstrip("/").rpartition("/") @@ -169,4 +206,4 @@ def _absolute_url(index_url, candidate): return "{}/{}".format(index_url, last.strip("/")) # relative path without up-references - return "{}/{}".format(index_url.rstrip("/"), candidate) + return "{}/{}".format(index_url.rstrip("/"), url) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 52ff02a178..b1481f8981 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -21,7 +21,9 @@ load("//python/private:auth.bzl", _get_auth = "get_auth") load("//python/private:envsubst.bzl", "envsubst") load("//python/private:normalize_name.bzl", "normalize_name") load("//python/private:text_util.bzl", "render") -load(":parse_simpleapi_html.bzl", "parse_simpleapi_html") +load(":parse_simpleapi_html.bzl", "absolute_url", "parse_simpleapi_html", "pkg_version") + +_FACT_VERSION = "v1" def simpleapi_download( ctx, @@ -43,12 +45,13 @@ def simpleapi_download( separate packages. * extra_index_urls: Extra index URLs that will be looked up after the main is looked up. - * sources: list[str], the sources to download things for. Each value is - the contents of requirements files. + * sources: list[str] | dict[str, list[str]], the sources to download things for. Each + value is the contents of requirements files. * envsubst: list[str], the envsubst vars for performing substitution in index url. * netrc: The netrc parameter for ctx.download, see http_file for docs. * auth_patterns: The auth_patterns parameter for ctx.download, see http_file for docs. + * facts: The facts to write to if we support them. cache: A dictionary that can be used as a cache between calls during a single evaluation of the extension. We use a dictionary as a cache so that we can reuse calls to the simple API when evaluating the @@ -81,27 +84,42 @@ def simpleapi_download( index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi + if attr.facts: + ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") + else: + ctx.report_progress("Fetch package lists from PyPI index") + + cache = simpleapi_cache( + memory_cache = memory_cache(cache), + facts_cache = facts_cache(getattr(ctx, "facts", None), attr.facts), + ) + found_on_index = {} warn_overrides = False - ctx.report_progress("Fetch package lists from PyPI index") + + # Normalize the inputs + if type(attr.sources) == "list": + fail("TODO") + else: + input_sources = attr.sources + for i, index_url in enumerate(index_urls): if i != 0: # Warn the user about a potential fix for the overrides warn_overrides = True async_downloads = {} - sources = [pkg for pkg in attr.sources if pkg not in found_on_index] - for pkg in sources: + sources = {pkg: versions for pkg, versions in input_sources.items() if pkg not in found_on_index} + for pkg, versions in sources.items(): pkg_normalized = normalize_name(pkg) result = read_simpleapi( ctx = ctx, - url = "{}/{}/".format( - index_url_overrides.get(pkg_normalized, index_url).rstrip("/"), - pkg, - ), attr = attr, cache = cache, + index_url = index_url_overrides.get(pkg_normalized, index_url), + distribution = pkg, get_auth = get_auth, + requested_versions = {v: None for v in versions}, **download_kwargs ) if hasattr(result, "wait"): @@ -109,6 +127,7 @@ def simpleapi_download( async_downloads[pkg] = struct( pkg_normalized = pkg_normalized, wait = result.wait, + fns = result.fns, ) elif result.success: contents[pkg_normalized] = result.output @@ -164,49 +183,14 @@ If you would like to skip downloading metadata for these packages please add 'si return contents -def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs): - """Read SimpleAPI. - - Args: - ctx: The module_ctx or repository_ctx. - url: str, the url parameter that can be passed to ctx.download. - attr: The attribute that contains necessary info for downloading. The - following attributes must be present: - * envsubst: The envsubst values for performing substitutions in the URL. - * netrc: The netrc parameter for ctx.download, see http_file for docs. - * auth_patterns: The auth_patterns parameter for ctx.download, see - http_file for docs. - cache: A dict for storing the results. - get_auth: A function to get auth information. Used in tests. - **download_kwargs: Any extra params to ctx.download. - Note that output and auth will be passed for you. - - Returns: - A similar object to what `download` would return except that in result.out - will be the parsed simple api contents. - """ - # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for - # the whl location and we cannot handle multiple URLs at once by passing - # them to ctx.download if we want to correctly handle the relative URLs. - # TODO: Add a test that env subbed index urls do not leak into the lock file. - - real_url = strip_empty_path_segments(envsubst( - url, - attr.envsubst, - ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get, - )) - - cache_key = real_url - if cache_key in cache: - return struct(success = True, output = cache[cache_key]) - +def _download_simpleapi(*, ctx, url, real_url, attr_envsubst, get_auth, **kwargs): output_str = envsubst( url, - attr.envsubst, + attr_envsubst, # Use env names in the subst values - this will be unique over # the lifetime of the execution of this function and we also use # `~` as the separator to ensure that we don't get clashes. - {e: "~{}~".format(e) for e in attr.envsubst}.get, + {e: "~{}~".format(e) for e in attr_envsubst}.get, ) # Transform the URL into a valid filename @@ -217,22 +201,50 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs): get_auth = get_auth or _get_auth - # NOTE: this may have block = True or block = False in the download_kwargs + # NOTE: this may have block = True or block = False in the kwargs download = ctx.download( url = [real_url], output = output, auth = get_auth(ctx, [real_url], ctx_attr = attr), allow_fail = True, - **download_kwargs + **kwargs ) - if download_kwargs.get("block") == False: - # Simulate the same API as ctx.download has + return _await( + download, + _read, + ctx = ctx, + output = output, + ) + +def _await(download, fn, **kwargs): + if hasattr(download, "fns"): + download.fns.append( + lambda result: fn(result = result, **kwargs), + ) + return download + elif hasattr(download, "wait"): + # Have a reference type which we can iterate later when aggregating the result + fns = [lambda result: fn(result = result, **kwargs)] + + def wait(): + result = download.wait() + for fn in fns: + result = fn(result = result) + return result + return struct( - wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, cache_key), + wait = wait, + fns = fns, ) - return _read_index_result(ctx, download, output, real_url, cache, cache_key) + return fn(result = download, **kwargs) + +def _read(ctx, result, output): + if not result.success: + return result + + return struct(success = True, output = ctx.read(output)) def strip_empty_path_segments(url): """Removes empty path segments from a URL. Does nothing for urls with no scheme. @@ -255,15 +267,349 @@ def strip_empty_path_segments(url): else: return "{}://{}".format(scheme, stripped) -def _read_index_result(ctx, result, output, url, cache, cache_key): - if not result.success: - return struct(success = False) +def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_versions, get_auth = None, **download_kwargs): + """Read SimpleAPI. - content = ctx.read(output) + Args: + ctx: The module_ctx or repository_ctx. + index_url: str, the PyPI SimpleAPI index URL + distribution: str, the distribution to download + attr: The attribute that contains necessary info for downloading. The + following attributes must be present: + * envsubst: The envsubst values for performing substitutions in the URL. + * netrc: The netrc parameter for ctx.download, see http_file for docs. + * auth_patterns: The auth_patterns parameter for ctx.download, see + http_file for docs. + cache: A dict for storing the results. + get_auth: A function to get auth information. Used in tests. + requested_versions: the list of requested versions. + **download_kwargs: Any extra params to ctx.download. + Note that output and auth will be passed for you. - output = parse_simpleapi_html(url = url, content = content) - if output: - cache.setdefault(cache_key, output) - return struct(success = True, output = output, cache_key = cache_key) - else: + Returns: + A similar object to what `download` would return except that in result.out + will be the parsed simple api contents. + """ + + index_url = index_url.rstrip("/") + + # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for + # the whl location and we cannot handle multiple URLs at once by passing + # them to ctx.download if we want to correctly handle the relative URLs. + # TODO: Add a test that env subbed index urls do not leak into the lock file. + + cached = cache.get(index_url, distribution, requested_versions) + if cached: + return struct(success = True, output = cached) + + url = "{}/{}/".format(index_url, distribution) + real_url = strip_empty_path_segments(envsubst( + url, + attr.envsubst, + ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get, + )) + + download = _download_simpleapi( + ctx = ctx, + url = url, + real_url = real_url, + attr_envsubst = attr.envsubst, + get_auth = get_auth, + **download_kwargs + ) + + return _await( + download, + _read_index_result, + index_url = index_url, + distribution = distribution, + real_url = real_url, + cache = cache, + requested_versions = requested_versions, + ) + +def _read_index_result(*, result, index_url, distribution, real_url, cache, requested_versions): + if not result.success or not result.output: + return struct(success = False) + + # TODO @aignas 2026-02-08: make this the only behaviour, maybe can get rid of `real_url + output = parse_simpleapi_html( + url = real_url, + content = result.output, + return_absolute = False, + ) + if not output: return struct(success = False) + + cache.setdefault(index_url, distribution, requested_versions, output) + return struct(success = True, output = output) + +def simpleapi_cache(memory_cache, facts_cache): + """SimpleAPI cache for making fewer calls. + + Args: + memory_cache: the storage to store things in memory. + facts_cache: the storage to retrieve known facts. + + Returns: + struct with 2 methods, `get` and `setdefault`. + """ + return struct( + get = lambda index_url, distribution, versions: _cache_get( + memory_cache, + facts_cache, + index_url, + distribution, + versions, + ), + setdefault = lambda index_url, distribution, versions, value: _cache_setdefault( + memory_cache, + facts_cache, + index_url, + distribution, + versions, + value, + ), + ) + +def _cache_get(cache, facts, index_url, distribution, versions): + if not facts: + return cache.get(index_url, distribution, versions) + + if versions: + cached = facts.get(index_url, distribution, versions) + if cached: + return cached + + cached = cache.get(index_url, distribution, versions) + if not cached: + return None + + # Ensure that we write back to the facts, this happens if we request versions that + # we don't have facts for but we have in-memory cache of SimpleAPI query results + if versions: + facts.setdefault(index_url, distribution, cached) + return cached + +def _cache_setdefault(cache, facts, index_url, distribution, versions, value): + filtered = cache.setdefault(index_url, distribution, versions, value) + + if facts and versions: + facts.setdefault(index_url, distribution, filtered) + + return filtered + +def memory_cache(cache = None): + """SimpleAPI cache for making fewer calls. + + Args: + cache: the storage to store things in memory. + + Returns: + struct with 2 methods, `get` and `setdefault`. + """ + if cache == None: + cache = {} + + return struct( + get = lambda index_url, distribution, versions: _memcache_get( + cache, + index_url, + distribution, + versions, + ), + setdefault = lambda index_url, distribution, versions, value: _memcache_setdefault( + cache, + index_url, + distribution, + versions, + value, + ), + ) + +def _vkey(versions): + if not versions: + return "" + + if len(versions) == 1: + if type(versions) == "dict": + return versions.keys()[0] + else: + return versions[0] + + return ",".join(sorted(versions)) + +def _memcache_get(cache, index_url, distribution, versions): + if not versions: + return cache.get((index_url, distribution, "")) + + vkey = _vkey(versions) + filtered = cache.get((index_url, distribution, vkey)) + if filtered: + return filtered + + unfiltered = cache.get((index_url, distribution, "")) + if not unfiltered: + return None + + filtered = _filter_packages(unfiltered, versions, index_url, distribution) + cache.setdefault((index_url, distribution, vkey), filtered) + return filtered + +def _memcache_setdefault(cache, index_url, distribution, versions, value): + cache.setdefault((index_url, distribution, ""), value) + if not versions: + return value + + filtered = _filter_packages(value, versions, index_url, distribution) + + vkey = _vkey(versions) + cache.setdefault((index_url, distribution, vkey), filtered) + return filtered + +def _filter_packages(dists, requested_versions, index_url, distribution): + if dists == None: + return None + + if not requested_versions: + return dists + + sha256s_by_version = {} + whls = {} + sdists = {} + for sha256, d in dists.sdists.items(): + if d.version not in requested_versions: + continue + + sdists[sha256] = _with_absolute_url(d, index_url, distribution) + sha256s_by_version.setdefault(d.version, []).append(sha256) + + for sha256, d in dists.whls.items(): + if d.version not in requested_versions: + continue + + whls[sha256] = _with_absolute_url(d, index_url, distribution) + sha256s_by_version.setdefault(d.version, []).append(sha256) + + if not whls and not sdists: + return None + + return struct( + whls = whls, + sdists = sdists, + sha256s_by_version = sha256s_by_version, + ) + +def facts_cache(known_facts, facts, facts_version = _FACT_VERSION): + if known_facts == None: + return None + + return struct( + get = lambda index_url, distribution, versions: _get_from_facts( + facts, + known_facts, + index_url, + distribution, + versions, + facts_version, + ), + setdefault = lambda url, distribution, value: _store_facts(facts, facts_version, url, value), + known_facts = known_facts, + facts = facts, + ) + +def _get_from_facts(facts, known_facts, index_url, distribution, requested_versions, facts_version): + if known_facts.get("fact_version") != facts_version: + # cannot trust known facts, different version that we know how to parse + return None + + known_sources = {} + + known_facts = known_facts.get(index_url, {}) + + index_url_for_distro = "{}/{}/".format(index_url, distribution) + for url, sha256 in known_facts.get("dist_hashes", {}).items(): + filename = known_facts.get("dist_filenames", {}).get(sha256) + if not filename: + _, _, filename = url.rpartition("/") + + version = pkg_version(filename, distribution) + if version not in requested_versions: + # TODO @aignas 2026-01-21: do the check by requested shas at some point + # We don't have sufficient info in the lock file, need to call the API + # + continue + + if filename.endswith(".whl"): + dists = known_sources.setdefault("whls", {}) + else: + dists = known_sources.setdefault("sdists", {}) + + known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256) + + dists.setdefault(sha256, struct( + sha256 = sha256, + filename = filename, + version = version, + url = absolute_url(index_url = index_url_for_distro, url = url), + yanked = known_facts.get("dist_yanked", {}).get(sha256, False), + )) + + if not known_sources: + return None + + output = struct( + whls = known_sources.get("whls", {}), + sdists = known_sources.get("sdists", {}), + sha256s_by_version = known_sources.get("sha256s_by_version", {}), + ) + _store_facts(facts, facts_version, index_url, output) + return output + +def _with_absolute_url(d, index_url, distribution): + index_url_for_distro = "{}/{}/".format(index_url.rstrip("/"), distribution) + + # TODO @aignas 2026-02-08: think of a better way to do this + # TODO @aignas 2026-02-08: if the url is absolute, return d + kwargs = dict() + for attr in [ + "sha256", + "filename", + "version", + "metadata_sha256", + "metadata_url", + "yanked", + "url", + ]: + if hasattr(d, attr): + kwargs[attr] = getattr(d, attr) + if attr == "url": + kwargs[attr] = absolute_url(index_url = index_url_for_distro, url = kwargs[attr]) + + return struct(**kwargs) + +def _store_facts(facts, fact_version, index_url, value): + """Store values as facts in the lock file. + + The main idea is to ensure that the lock file is small and it is only storing what + we would need to fetch from the internet. Any derivative information we can + from this that can be achieved using pure Starlark functions should be done in + Starlark. + """ + if not value: + return value + + facts["fact_version"] = fact_version + + # Store the distributions by index URL that we find them on. + facts = facts.setdefault(index_url, {}) + + for sha256, d in (value.sdists | value.whls).items(): + facts.setdefault("dist_hashes", {}).setdefault(d.url, sha256) + if not d.url.endswith(d.filename): + facts.setdefault("dist_filenames", {}).setdefault(d.url, d.filename) + if d.yanked: + # TODO @aignas 2026-01-21: store yank reason + facts.setdefault("dist_yanked", {}).setdefault(sha256, True) + + return value diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl index 03cefd13c5..f73e23ba37 100644 --- a/tests/pypi/hub_builder/hub_builder_tests.bzl +++ b/tests/pypi/hub_builder/hub_builder_tests.bzl @@ -1036,7 +1036,13 @@ git_dep @ git+https://git.server/repo/project@deadbeefdeadbeef index_url = "pypi.org", index_url_overrides = {}, netrc = None, - sources = ["simple", "plat_pkg", "pip_fallback", "some_other_pkg"], + facts = None, + sources = { + "pip_fallback": ["0.0.1"], + "plat_pkg": ["0.0.4"], + "simple": ["0.0.1"], + "some_other_pkg": ["0.0.1"], + }, ), "cache": {}, "parallel_download": False, diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl index 8dc307235a..2a0b1f8811 100644 --- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -15,19 +15,28 @@ "" load("@rules_testing//lib:test_suite.bzl", "test_suite") -load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download", "strip_empty_path_segments") # buildifier: disable=bzl-visibility +load("@rules_testing//lib:truth.bzl", "subjects") +load( + "//python/private/pypi:simpleapi_download.bzl", + "memory_cache", + "simpleapi_download", + "strip_empty_path_segments", +) # buildifier: disable=bzl-visibility _tests = [] def _test_simple(env): calls = [] - def read_simpleapi(ctx, url, attr, cache, get_auth, block): + def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): _ = ctx # buildifier: disable=unused-variable + _ = distribution + _ = requested_versions _ = attr _ = cache _ = get_auth env.expect.that_bool(block).equals(False) + url = "{}/{}/".format(index_url, distribution) calls.append(url) if "foo" in url and "main" in url: return struct( @@ -49,8 +58,9 @@ def _test_simple(env): index_url_overrides = {}, index_url = "main", extra_index_urls = ["extra"], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = True, @@ -75,11 +85,14 @@ def _test_fail(env): calls = [] fails = [] - def read_simpleapi(ctx, url, attr, cache, get_auth, block): + def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): _ = ctx # buildifier: disable=unused-variable + _ = distribution + _ = requested_versions _ = attr _ = cache _ = get_auth + url = "{}/{}/".format(index_url, distribution) env.expect.that_bool(block).equals(False) calls.append(url) if "foo" in url: @@ -109,8 +122,9 @@ def _test_fail(env): }, index_url = "main", extra_index_urls = ["extra"], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = True, @@ -122,13 +136,13 @@ def _test_fail(env): """ Failed to download metadata of the following packages from urls: { - "foo": "invalid", "bar": ["main", "extra"], + "foo": "invalid", } If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[ - "foo", "bar", + "foo", ]' to your 'pip.parse' call. """, ]) @@ -162,8 +176,9 @@ def _test_download_url(env): index_url_overrides = {}, index_url = "https://example.com/main/simple/", extra_index_urls = [], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = False, @@ -198,8 +213,9 @@ def _test_download_url_parallel(env): index_url_overrides = {}, index_url = "https://example.com/main/simple/", extra_index_urls = [], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = True, @@ -234,8 +250,9 @@ def _test_download_envsubst_url(env): index_url_overrides = {}, index_url = "$INDEX_URL", extra_index_urls = [], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = ["INDEX_URL"], + facts = None, ), cache = {}, parallel_download = False, @@ -260,6 +277,68 @@ def _test_strip_empty_path_segments(env): _tests.append(_test_strip_empty_path_segments) +def _expect_cache_result(env, cache, key, sdists, whls): + got = env.expect.that_struct( + cache.get(*key), + attrs = dict( + whls = subjects.dict, + sdists = subjects.dict, + ), + ) + got.whls().contains_exactly(whls) + got.sdists().contains_exactly(sdists) + +def _test_memory_cache(env): + memory = {} + cache = memory_cache(memory) + all_packages = struct( + sdists = { + "aa": struct(version = "1.0"), + "ab": struct(version = "1.1"), + }, + whls = { + "ba": struct(version = "1.0"), + "bb": struct(version = "1.1"), + }, + ) + cache.setdefault("index", "distro", None, all_packages) + env.expect.that_dict(memory).contains_exactly({ + ("index", "distro", ""): all_packages, + }) + _expect_cache_result( + env, + cache, + ("index", "distro", ["1.0"]), + sdists = { + "aa": struct(version = "1.0"), + }, + whls = { + "ba": struct(version = "1.0"), + }, + ) + env.expect.that_dict(memory).keys().contains_exactly([ + ("index", "distro", ""), + ("index", "distro", "1.0"), + ]) + _expect_cache_result( + env, + cache, + ("index", "distro", ["1.1"]), + sdists = { + "ab": struct(version = "1.1"), + }, + whls = { + "bb": struct(version = "1.1"), + }, + ) + env.expect.that_dict(memory).keys().contains_exactly([ + ("index", "distro", ""), + ("index", "distro", "1.0"), + ("index", "distro", "1.1"), + ]) + +_tests.append(_test_memory_cache) + def simpleapi_download_test_suite(name): """Create the test suite.