From a0ff18d8624958656a03ad3eb1fd43f9adc47533 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Mon, 19 Jan 2026 22:54:41 +0900 Subject: [PATCH 01/13] exp(pypi): store necessary facts fetched from SimpleAPI This allows us to cache what we store from the SimpleAPI in between the runs. --- python/private/pypi/extension.bzl | 36 +- python/private/pypi/hub_builder.bzl | 10 +- python/private/pypi/parse_requirements.bzl | 19 +- python/private/pypi/parse_simpleapi_html.bzl | 77 ++-- python/private/pypi/simpleapi_download.bzl | 350 +++++++++++++++---- 5 files changed, 396 insertions(+), 96 deletions(-) diff --git a/python/private/pypi/extension.bzl b/python/private/pypi/extension.bzl index 3927f61c00..5b2b21829a 100644 --- a/python/private/pypi/extension.bzl +++ b/python/private/pypi/extension.bzl @@ -223,6 +223,7 @@ You cannot use both the additive_build_content and additive_build_content_file a # versions. pip_hub_map = {} simpleapi_cache = {} + facts = {} for mod in module_ctx.modules: for pip_attr in mod.tags.parse: @@ -240,6 +241,7 @@ You cannot use both the additive_build_content and additive_build_content_file a evaluate_markers_fn = kwargs.get("evaluate_markers", None), available_interpreters = kwargs.get("available_interpreters", INTERPRETER_LABELS), logger = repo_utils.logger(module_ctx, "pypi:hub:" + hub_name), + facts = facts, ) pip_hub_map[pip_attr.hub_name] = builder elif pip_hub_map[hub_name].module_name != mod.name: @@ -286,6 +288,25 @@ You cannot use both the additive_build_content and additive_build_content_file a hub_group_map[hub.name] = out.group_map hub_whl_map[hub.name] = out.whl_map + facts = { + "fact_version": facts.get("fact_version"), + } | { + index_url: { + k: _sorted_dict(f.get(k)) + for k in [ + "dist_filenames", + "dist_hashes", + "dist_yanked", + ] + if f.get(k) + } + for index_url, f in facts.items() + if index_url not in ["fact_version"] + } + if len(facts) == 1: + # only version is present, skip writing + facts = None + return struct( config = config, exposed_packages = exposed_packages, @@ -294,6 +315,7 @@ You cannot use both the additive_build_content and additive_build_content_file a hub_whl_map = hub_whl_map, whl_libraries = whl_libraries, whl_mods = whl_mods, + facts = facts, platform_config_settings = { hub_name: { platform_name: sorted([str(Label(cv)) for cv in p.config_settings]) @@ -303,6 +325,12 @@ You cannot use both the additive_build_content and additive_build_content_file a }, ) +def _sorted_dict(d): + if not d: + return {} + + return {k: v for k, v in sorted(d.items())} + def _pip_impl(module_ctx): """Implementation of a class tag that creates the pip hub and corresponding pip spoke whl repositories. @@ -391,9 +419,11 @@ def _pip_impl(module_ctx): groups = mods.hub_group_map.get(hub_name), ) - return module_ctx.extension_metadata( - reproducible = True, - ) + kwargs = {"reproducible": True} + if mods.facts: + kwargs["facts"] = mods.facts + + return module_ctx.extension_metadata(**kwargs) _default_attrs = { "arch_name": attr.string( diff --git a/python/private/pypi/hub_builder.bzl b/python/private/pypi/hub_builder.bzl index f54d02d8b0..2a75f0112e 100644 --- a/python/private/pypi/hub_builder.bzl +++ b/python/private/pypi/hub_builder.bzl @@ -31,6 +31,7 @@ def hub_builder( simpleapi_download_fn, evaluate_markers_fn, logger, + facts = None, simpleapi_cache = {}): """Return a hub builder instance @@ -47,6 +48,7 @@ def hub_builder( used during the `repository_rule` and must be always compatible with the host. simpleapi_download_fn: the function used to download from SimpleAPI. simpleapi_cache: the cache for the download results. + facts: the facts if they are available. logger: the logger for this builder. """ @@ -69,6 +71,7 @@ def hub_builder( _platforms = {}, _group_name_by_whl = {}, _get_index_urls = {}, + _facts = facts, _use_downloader = {}, _simpleapi_cache = simpleapi_cache, # instance constants @@ -335,11 +338,16 @@ def _set_get_index_urls(self, pip_attr): d for d in distributions if _use_downloader(self, python_version, d) - ], + ] if type(distributions) == "list" else { + d: versions + for d, versions in distributions.items() + if _use_downloader(self, python_version, d) + }, envsubst = pip_attr.envsubst, # Auth related info netrc = pip_attr.netrc, auth_patterns = pip_attr.auth_patterns, + facts = self._facts, ), cache = self._simpleapi_cache, parallel_download = pip_attr.parallel_download, diff --git a/python/private/pypi/parse_requirements.bzl b/python/private/pypi/parse_requirements.bzl index 5c05c753fd..f9a6e672bf 100644 --- a/python/private/pypi/parse_requirements.bzl +++ b/python/private/pypi/parse_requirements.bzl @@ -170,16 +170,15 @@ def parse_requirements( index_urls = {} if get_index_urls: - index_urls = get_index_urls( - ctx, - # Use list({}) as a way to have a set - list({ - req.distribution: None - for reqs in requirements_by_platform.values() - for req in reqs.values() - if not req.srcs.url - }), - ) + distributions = {} + for reqs in requirements_by_platform.values(): + for req in reqs.values(): + if req.srcs.url: + continue + + distributions.setdefault(req.distribution, []).append(req.srcs.version) + + index_urls = get_index_urls(ctx, distributions) ret = [] for name, reqs in sorted(requirements_by_platform.items()): diff --git a/python/private/pypi/parse_simpleapi_html.bzl b/python/private/pypi/parse_simpleapi_html.bzl index a41f0750c4..da11a77635 100644 --- a/python/private/pypi/parse_simpleapi_html.bzl +++ b/python/private/pypi/parse_simpleapi_html.bzl @@ -16,12 +16,14 @@ Parse SimpleAPI HTML in Starlark. """ -def parse_simpleapi_html(*, url, content): +def parse_simpleapi_html(*, url, content, distribution = None, return_absolute = True): """Get the package URLs for given shas by parsing the Simple API HTML. Args: url(str): The URL that the HTML content can be downloaded from. + distribution(str): TODO content(str): The Simple API HTML content. + return_absolute: {type}`bool` TODO Returns: A list of structs with: @@ -33,6 +35,9 @@ def parse_simpleapi_html(*, url, content): present, then the 'metadata_url' is also present. Defaults to "". * metadata_url: The URL for the METADATA if we can download it. Defaults to "". """ + if not distribution: + _, _, distribution = url.strip("/").rpartition("/") + sdists = {} whls = {} lines = content.split("") maybe_metadata, _, filename = head.rpartition(">") - version = _version(filename) + version = pkg_version(filename, distribution) sha256s_by_version.setdefault(version, []).append(sha256) metadata_sha256 = "" @@ -79,13 +85,17 @@ def parse_simpleapi_html(*, url, content): break if filename.endswith(".whl"): + metadata_url = metadata_url or "" + if return_absolute and metadata_url: + metadata_url = absolute_url(index_url = url, url = metadata_url) + whls[sha256] = struct( filename = filename, version = version, url = dist_url, sha256 = sha256, metadata_sha256 = metadata_sha256, - metadata_url = _absolute_url(url, metadata_url) if metadata_url else "", + metadata_url = metadata_url, yanked = yanked, ) else: @@ -110,18 +120,36 @@ _SDIST_EXTS = [ ".zip", ] -def _version(filename): +def pkg_version(filename, distribution = None): + """pkg_version extracts the version from the filename. + + TODO: move this to a different location + + Args: + filename: TODO + distribution: TODO + + Returns: + version string + """ # See https://packaging.python.org/en/latest/specifications/binary-distribution-format/#binary-distribution-format - _, _, tail = filename.partition("-") - version, _, _ = tail.partition("-") - if version != tail: - # The format is {name}-{version}-{whl_specifiers}.whl - return version + if filename.endswith(".whl"): + _, _, tail = filename.partition("-") + version, _, _ = tail.partition("-") + if version != tail: + # The format is {name}-{version}-{whl_specifiers}.whl + return version + + if not distribution: + fail("for parsing sdists passing 'distribution' is mandatory") # NOTE @aignas 2025-03-29: most of the files are wheels, so this is not the common path # {name}-{version}.{ext} + # TODO @aignas 2026-01-20: test for handling dashes in names, can't think of any other way to + # get the version from the filename but to pass in the distribution name to this function. + version = filename[len(distribution) + 1:] for ext in _SDIST_EXTS: version, _, _ = version.partition(ext) # build or name @@ -147,21 +175,30 @@ def _is_downloadable(url): """ return url.startswith("http://") or url.startswith("https://") or url.startswith("file://") -def _absolute_url(index_url, candidate): - if candidate == "": - return candidate +def absolute_url(*, index_url, url): + """Return an absolute URL in case the url is not absolute. + + Args: + index_url: {type}`str` The index_url. + url: {type}`str` The url of the artifact. + + Returns: + `url` if it is absolute, or absolute URL based on the `index_url`. + """ + if url == "": + return url - if _is_downloadable(candidate): - return candidate + if _is_downloadable(url): + return url - if candidate.startswith("/"): + if url.startswith("/"): # absolute path root_directory = _get_root_directory(index_url) - return "{}{}".format(root_directory, candidate) + return "{}{}".format(root_directory, url) - if candidate.startswith(".."): + if url.startswith(".."): # relative path with up references - candidate_parts = candidate.split("..") + candidate_parts = url.split("..") last = candidate_parts[-1] for _ in range(len(candidate_parts) - 1): index_url, _, _ = index_url.rstrip("/").rpartition("/") @@ -169,4 +206,4 @@ def _absolute_url(index_url, candidate): return "{}/{}".format(index_url, last.strip("/")) # relative path without up-references - return "{}/{}".format(index_url.rstrip("/"), candidate) + return "{}/{}".format(index_url.rstrip("/"), url) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 52ff02a178..f7aea1a7de 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -21,7 +21,9 @@ load("//python/private:auth.bzl", _get_auth = "get_auth") load("//python/private:envsubst.bzl", "envsubst") load("//python/private:normalize_name.bzl", "normalize_name") load("//python/private:text_util.bzl", "render") -load(":parse_simpleapi_html.bzl", "parse_simpleapi_html") +load(":parse_simpleapi_html.bzl", "absolute_url", "parse_simpleapi_html", "pkg_version") + +_FACT_VERSION = "v1" def simpleapi_download( ctx, @@ -29,6 +31,7 @@ def simpleapi_download( attr, cache, parallel_download = True, + store_facts = True, read_simpleapi = None, get_auth = None, _fail = fail): @@ -43,12 +46,13 @@ def simpleapi_download( separate packages. * extra_index_urls: Extra index URLs that will be looked up after the main is looked up. - * sources: list[str], the sources to download things for. Each value is - the contents of requirements files. + * sources: list[str] | dict[str, list[str]], the sources to download things for. Each + value is the contents of requirements files. * envsubst: list[str], the envsubst vars for performing substitution in index url. * netrc: The netrc parameter for ctx.download, see http_file for docs. * auth_patterns: The auth_patterns parameter for ctx.download, see http_file for docs. + * facts: The facts to write to if we support them. cache: A dictionary that can be used as a cache between calls during a single evaluation of the extension. We use a dictionary as a cache so that we can reuse calls to the simple API when evaluating the @@ -58,6 +62,9 @@ def simpleapi_download( reflected when re-evaluating the extension unless we do `bazel clean --expunge`. parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads. + store_facts: A boolean to enable usage of bazel 8.5 facts feature to store data in + MODULE.bazel.lock in between the runs. Can be disabled if urls contain sensitive + data and users would rather opt-out of the feature. read_simpleapi: a function for reading and parsing of the SimpleAPI contents. Used in tests. get_auth: A function to get auth information passed to read_simpleapi. Used in tests. @@ -80,28 +87,36 @@ def simpleapi_download( contents = {} index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi + if hasattr(ctx, "facts") and store_facts: + download_kwargs["facts"] = _facts(ctx, attr.facts) + read_simpleapi = _read_simpleapi_with_facts + ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") + else: + ctx.report_progress("Fetch package lists from PyPI index") found_on_index = {} warn_overrides = False - ctx.report_progress("Fetch package lists from PyPI index") + + # Normalize the inputs + input_sources = {pkg: [] for pkg in attr.sources} if type(attr.sources) == "list" else attr.sources + for i, index_url in enumerate(index_urls): if i != 0: # Warn the user about a potential fix for the overrides warn_overrides = True async_downloads = {} - sources = [pkg for pkg in attr.sources if pkg not in found_on_index] - for pkg in sources: + sources = {pkg: versions for pkg, versions in input_sources.items() if pkg not in found_on_index} + for pkg, versions in sources.items(): pkg_normalized = normalize_name(pkg) result = read_simpleapi( ctx = ctx, - url = "{}/{}/".format( - index_url_overrides.get(pkg_normalized, index_url).rstrip("/"), - pkg, - ), attr = attr, cache = cache, + index_url = index_url_overrides.get(pkg_normalized, index_url), + distribution = pkg, get_auth = get_auth, + requested_versions = versions, **download_kwargs ) if hasattr(result, "wait"): @@ -109,6 +124,7 @@ def simpleapi_download( async_downloads[pkg] = struct( pkg_normalized = pkg_normalized, wait = result.wait, + fns = result.fns, ) elif result.success: contents[pkg_normalized] = result.output @@ -164,49 +180,14 @@ If you would like to skip downloading metadata for these packages please add 'si return contents -def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs): - """Read SimpleAPI. - - Args: - ctx: The module_ctx or repository_ctx. - url: str, the url parameter that can be passed to ctx.download. - attr: The attribute that contains necessary info for downloading. The - following attributes must be present: - * envsubst: The envsubst values for performing substitutions in the URL. - * netrc: The netrc parameter for ctx.download, see http_file for docs. - * auth_patterns: The auth_patterns parameter for ctx.download, see - http_file for docs. - cache: A dict for storing the results. - get_auth: A function to get auth information. Used in tests. - **download_kwargs: Any extra params to ctx.download. - Note that output and auth will be passed for you. - - Returns: - A similar object to what `download` would return except that in result.out - will be the parsed simple api contents. - """ - # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for - # the whl location and we cannot handle multiple URLs at once by passing - # them to ctx.download if we want to correctly handle the relative URLs. - # TODO: Add a test that env subbed index urls do not leak into the lock file. - - real_url = strip_empty_path_segments(envsubst( - url, - attr.envsubst, - ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get, - )) - - cache_key = real_url - if cache_key in cache: - return struct(success = True, output = cache[cache_key]) - +def _download_simpleapi(*, ctx, url, real_url, attr_envsubst, get_auth, **kwargs): output_str = envsubst( url, - attr.envsubst, + attr_envsubst, # Use env names in the subst values - this will be unique over # the lifetime of the execution of this function and we also use # `~` as the separator to ensure that we don't get clashes. - {e: "~{}~".format(e) for e in attr.envsubst}.get, + {e: "~{}~".format(e) for e in attr_envsubst}.get, ) # Transform the URL into a valid filename @@ -217,22 +198,50 @@ def _read_simpleapi(ctx, url, attr, cache, get_auth = None, **download_kwargs): get_auth = get_auth or _get_auth - # NOTE: this may have block = True or block = False in the download_kwargs + # NOTE: this may have block = True or block = False in the kwargs download = ctx.download( url = [real_url], output = output, auth = get_auth(ctx, [real_url], ctx_attr = attr), allow_fail = True, - **download_kwargs + **kwargs ) - if download_kwargs.get("block") == False: - # Simulate the same API as ctx.download has + return _await( + download, + _read, + ctx = ctx, + output = output, + ) + +def _await(download, fn, **kwargs): + if hasattr(download, "fns"): + download.fns.append( + lambda result: fn(result = result, **kwargs), + ) + return download + elif hasattr(download, "wait"): + # Have a reference type which we can iterate later when aggregating the result + fns = [lambda result: fn(result = result, **kwargs)] + + def wait(): + result = download.wait() + for fn in fns: + result = fn(result = result) + return result + return struct( - wait = lambda: _read_index_result(ctx, download.wait(), output, real_url, cache, cache_key), + wait = wait, + fns = fns, ) - return _read_index_result(ctx, download, output, real_url, cache, cache_key) + return fn(result = download, **kwargs) + +def _read(ctx, result, output): + if not result.success: + return result + + return struct(success = True, output = ctx.read(output)) def strip_empty_path_segments(url): """Removes empty path segments from a URL. Does nothing for urls with no scheme. @@ -255,15 +264,232 @@ def strip_empty_path_segments(url): else: return "{}://{}".format(scheme, stripped) -def _read_index_result(ctx, result, output, url, cache, cache_key): - if not result.success: +def _read_simpleapi(ctx, index_url, pkg, attr, cache, get_auth = None, return_absolute = True, **download_kwargs): + """Read SimpleAPI. + + Args: + ctx: The module_ctx or repository_ctx. + index_url: str, the PyPI SimpleAPI index URL + pkg: str, the distribution to download + attr: The attribute that contains necessary info for downloading. The + following attributes must be present: + * envsubst: The envsubst values for performing substitutions in the URL. + * netrc: The netrc parameter for ctx.download, see http_file for docs. + * auth_patterns: The auth_patterns parameter for ctx.download, see + http_file for docs. + cache: A dict for storing the results. + get_auth: A function to get auth information. Used in tests. + return_absolute: TODO + **download_kwargs: Any extra params to ctx.download. + Note that output and auth will be passed for you. + + Returns: + A similar object to what `download` would return except that in result.out + will be the parsed simple api contents. + """ + # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for + # the whl location and we cannot handle multiple URLs at once by passing + # them to ctx.download if we want to correctly handle the relative URLs. + # TODO: Add a test that env subbed index urls do not leak into the lock file. + + url = "{}/{}/".format(index_url.rstrip("/"), pkg) + + real_url = strip_empty_path_segments(envsubst( + url, + attr.envsubst, + ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get, + )) + cache_key = real_url + + cached = cache.get(cache_key) + if cached: + return struct(success = True, output = cached) + + download = _download_simpleapi( + ctx = ctx, + url = url, + real_url = real_url, + attr_envsubst = attr.envsubst, + get_auth = get_auth, + **download_kwargs + ) + + return _await( + download, + _read_index_result, + url = real_url, + cache = cache, + cache_key = cache_key, + return_absolute = return_absolute, + ) + +def _read_index_result(*, result, url, cache, cache_key, return_absolute): + if not result.success or not result.output: return struct(success = False) - content = ctx.read(output) + output = parse_simpleapi_html(url = url, content = result.output, return_absolute = return_absolute) + if not output: + return struct(success = False) - output = parse_simpleapi_html(url = url, content = content) - if output: - cache.setdefault(cache_key, output) - return struct(success = True, output = output, cache_key = cache_key) - else: + cache.setdefault(cache_key, output) + return struct(success = True, output = output) + +def _read_simpleapi_with_facts(ctx, index_url, distribution, facts = None, requested_versions = [], **download_kwargs): + """Read SimpleAPI or pull data from the known facts in the MODULE.bazel.lock file. + + Args: + ctx: The module_ctx or repository_ctx. + index_url: TODO + distribution: TODO + requested_versions: the list of requested versions. + facts: Facts to write back if we support facts. + **download_kwargs: Params passed to the _read_simpleapi. + + Returns: + A similar object to what `download` would return except that in result.out + will be the parsed simple api contents. We return only the `requested_versions` so that + the list of facts stored in the lock file is limited. + """ + cached = facts.get(index_url, distribution, requested_versions) + if cached: + return struct( + success = True, + output = cached, + ) + + # Fallback to pulling data from memory or downloading from the SimpleAPI + download = _read_simpleapi( + ctx, + index_url = index_url, + distribution = distribution, + return_absolute = False, + **download_kwargs + ) + + return _await( + download, + _read_index_result_with_facts, + facts = facts, + requested_versions = requested_versions, + index_url = index_url, + ) + +def _read_index_result_with_facts(*, result, facts, requested_versions, index_url): + if not result.success or not result.output: return struct(success = False) + + output = _filter_packages(result.output, requested_versions) + facts.setdefault(index_url, output) + return struct(success = True, output = output) + +def _filter_packages(dists, requested_versions): + sha256s_by_version = {} + whls = {} + sdists = {} + for sha256, d in (dists.sdists | dists.whls).items(): + if d.version not in requested_versions: + continue + + if d.filename.endswith(".whl"): + whls[sha256] = d + else: + sdists[sha256] = d + + sha256s_by_version.setdefault(d.version, []).append(sha256) + + return struct( + whls = whls, + sdists = sdists, + sha256s_by_version = sha256s_by_version, + ) + +def _facts(ctx, store_facts, facts_version = _FACT_VERSION): + known_facts = getattr(ctx, "facts", None) + if not known_facts: + return {} + + return struct( + get = lambda index_url, distribution, versions: _get_from_facts( + store_facts, + known_facts, + index_url, + distribution, + versions, + facts_version, + ), + setdefault = lambda url, value: _store_facts(store_facts, facts_version, url, value), + ) + +def _get_from_facts(facts, known_facts, index_url, distribution, requested_versions, facts_version): + if known_facts.get("fact_version") != facts_version: + # cannot trust known facts, different version that we know how to parse + return None + + known_sources = {} + + known_facts = known_facts.get(index_url, {}) + + index_url_for_distro = "{}/{}/".format(index_url, distribution) + for url, sha256 in known_facts.get("dist_hashes", {}).items(): + filename = known_facts.get("dist_filenames", {}).get(sha256) + if not filename: + _, _, filename = url.rpartition("/") + + version = pkg_version(filename, distribution) + if version not in requested_versions: + # TODO @aignas 2026-01-21: do the check by requested shas at some point + # We don't have sufficient info in the lock file, need to call the API + # + continue + + if filename.endswith(".whl"): + dists = known_sources.setdefault("whls", {}) + else: + dists = known_sources.setdefault("sdists", {}) + + known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256) + + dists.setdefault(sha256, struct( + sha256 = sha256, + filename = filename, + version = version, + url = absolute_url(index_url = index_url_for_distro, url = url), + metadata_sha256 = "", + metadata_url = "", + yanked = known_facts.get("dist_yanked", {}).get(sha256, False), + )) + + if not known_sources: + return None + + output = struct( + whls = known_sources.get("whls", {}), + sdists = known_sources.get("sdists", {}), + sha256s_by_version = known_sources.get("sha256s_by_version", {}), + ) + _store_facts(facts, facts_version, index_url, output) + return output + +def _store_facts(facts, fact_version, index_url, value): + """Store values as facts in the lock file. + + The main idea is to ensure that the lock file is small and it is only storing what + we would need to fetch from the internet. Any derivative information we can + from this that can be achieved using pure Starlark functions should be done in + Starlark. + """ + + facts["fact_version"] = fact_version + + # Store the distributions by index URL that we find them on. + facts = facts.setdefault(index_url, {}) + + for sha256, d in (value.sdists | value.whls).items(): + facts.setdefault("dist_hashes", {}).setdefault(d.url, sha256) + if not d.url.endswith(d.filename): + facts.setdefault("dist_filenames", {}).setdefault(d.url, d.filename) + if d.yanked: + # TODO @aignas 2026-01-21: store yank reason + facts.setdefault("dist_yanked", {}).setdefault(sha256, True) + + return value From b78cf22beb6ac3c5da6a003c91d2ec9e256bb6f3 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Mon, 2 Feb 2026 22:09:42 +0900 Subject: [PATCH 02/13] fixup --- python/private/pypi/simpleapi_download.bzl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index f7aea1a7de..303affca1d 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -264,13 +264,13 @@ def strip_empty_path_segments(url): else: return "{}://{}".format(scheme, stripped) -def _read_simpleapi(ctx, index_url, pkg, attr, cache, get_auth = None, return_absolute = True, **download_kwargs): +def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, return_absolute = True, **download_kwargs): """Read SimpleAPI. Args: ctx: The module_ctx or repository_ctx. index_url: str, the PyPI SimpleAPI index URL - pkg: str, the distribution to download + distribution: str, the distribution to download attr: The attribute that contains necessary info for downloading. The following attributes must be present: * envsubst: The envsubst values for performing substitutions in the URL. @@ -292,7 +292,7 @@ def _read_simpleapi(ctx, index_url, pkg, attr, cache, get_auth = None, return_ab # them to ctx.download if we want to correctly handle the relative URLs. # TODO: Add a test that env subbed index urls do not leak into the lock file. - url = "{}/{}/".format(index_url.rstrip("/"), pkg) + url = "{}/{}/".format(index_url.rstrip("/"), distribution) real_url = strip_empty_path_segments(envsubst( url, From 28f3871c6bb1bd68cb9399698140ec6b00abe859 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Tue, 3 Feb 2026 22:54:45 +0900 Subject: [PATCH 03/13] filter packages by requested versions --- python/private/pypi/simpleapi_download.bzl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 303affca1d..4757b83db6 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -264,7 +264,7 @@ def strip_empty_path_segments(url): else: return "{}://{}".format(scheme, stripped) -def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, return_absolute = True, **download_kwargs): +def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, return_absolute = True, requested_versions = [], **download_kwargs): """Read SimpleAPI. Args: @@ -279,6 +279,7 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, http_file for docs. cache: A dict for storing the results. get_auth: A function to get auth information. Used in tests. + requested_versions: the list of requested versions. return_absolute: TODO **download_kwargs: Any extra params to ctx.download. Note that output and auth will be passed for you. @@ -303,6 +304,7 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, cached = cache.get(cache_key) if cached: + cached = _filter_packages(cached, requested_versions) return struct(success = True, output = cached) download = _download_simpleapi( @@ -321,9 +323,10 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, cache = cache, cache_key = cache_key, return_absolute = return_absolute, + requested_versions = requested_versions, ) -def _read_index_result(*, result, url, cache, cache_key, return_absolute): +def _read_index_result(*, result, url, cache, cache_key, return_absolute, requested_versions): if not result.success or not result.output: return struct(success = False) @@ -332,6 +335,7 @@ def _read_index_result(*, result, url, cache, cache_key, return_absolute): return struct(success = False) cache.setdefault(cache_key, output) + output = _filter_packages(output, requested_versions) return struct(success = True, output = output) def _read_simpleapi_with_facts(ctx, index_url, distribution, facts = None, requested_versions = [], **download_kwargs): From fd13f9082f95aa69c0f283f659ee8950781099ee Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Thu, 5 Feb 2026 21:38:49 +0900 Subject: [PATCH 04/13] fix unit tests --- tests/pypi/hub_builder/hub_builder_tests.bzl | 8 +++++++- .../simpleapi_download/simpleapi_download_tests.bzl | 10 ++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/pypi/hub_builder/hub_builder_tests.bzl b/tests/pypi/hub_builder/hub_builder_tests.bzl index 03cefd13c5..f73e23ba37 100644 --- a/tests/pypi/hub_builder/hub_builder_tests.bzl +++ b/tests/pypi/hub_builder/hub_builder_tests.bzl @@ -1036,7 +1036,13 @@ git_dep @ git+https://git.server/repo/project@deadbeefdeadbeef index_url = "pypi.org", index_url_overrides = {}, netrc = None, - sources = ["simple", "plat_pkg", "pip_fallback", "some_other_pkg"], + facts = None, + sources = { + "pip_fallback": ["0.0.1"], + "plat_pkg": ["0.0.4"], + "simple": ["0.0.1"], + "some_other_pkg": ["0.0.1"], + }, ), "cache": {}, "parallel_download": False, diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl index 8dc307235a..e8013c3ab7 100644 --- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -22,12 +22,15 @@ _tests = [] def _test_simple(env): calls = [] - def read_simpleapi(ctx, url, attr, cache, get_auth, block): + def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): _ = ctx # buildifier: disable=unused-variable + _ = distribution + _ = requested_versions _ = attr _ = cache _ = get_auth env.expect.that_bool(block).equals(False) + url = "{}/{}/".format(index_url, distribution) calls.append(url) if "foo" in url and "main" in url: return struct( @@ -75,11 +78,14 @@ def _test_fail(env): calls = [] fails = [] - def read_simpleapi(ctx, url, attr, cache, get_auth, block): + def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): _ = ctx # buildifier: disable=unused-variable + _ = distribution + _ = requested_versions _ = attr _ = cache _ = get_auth + url = "{}/{}/".format(index_url, distribution) env.expect.that_bool(block).equals(False) calls.append(url) if "foo" in url: From a11735fe7596770b1c63e0015ff274190cfba356 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Thu, 5 Feb 2026 23:40:17 +0900 Subject: [PATCH 05/13] fix requested versions --- python/private/pypi/simpleapi_download.bzl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 4757b83db6..9b6f0844b4 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -116,7 +116,7 @@ def simpleapi_download( index_url = index_url_overrides.get(pkg_normalized, index_url), distribution = pkg, get_auth = get_auth, - requested_versions = versions, + requested_versions = {v: None for v in versions}, **download_kwargs ) if hasattr(result, "wait"): @@ -264,7 +264,7 @@ def strip_empty_path_segments(url): else: return "{}://{}".format(scheme, stripped) -def _read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth = None, return_absolute = True, requested_versions = [], **download_kwargs): +def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_versions, get_auth = None, return_absolute = True, **download_kwargs): """Read SimpleAPI. Args: @@ -366,6 +366,7 @@ def _read_simpleapi_with_facts(ctx, index_url, distribution, facts = None, reque ctx, index_url = index_url, distribution = distribution, + requested_versions = requested_versions, return_absolute = False, **download_kwargs ) From db1e48087d87e8e74fd33e38f3c2fb31aac5cc06 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sat, 7 Feb 2026 12:55:56 +0900 Subject: [PATCH 06/13] remove a bool toggle --- python/private/pypi/simpleapi_download.bzl | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 9b6f0844b4..18065b5adb 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -31,7 +31,6 @@ def simpleapi_download( attr, cache, parallel_download = True, - store_facts = True, read_simpleapi = None, get_auth = None, _fail = fail): @@ -62,9 +61,6 @@ def simpleapi_download( reflected when re-evaluating the extension unless we do `bazel clean --expunge`. parallel_download: A boolean to enable usage of bazel 7.1 non-blocking downloads. - store_facts: A boolean to enable usage of bazel 8.5 facts feature to store data in - MODULE.bazel.lock in between the runs. Can be disabled if urls contain sensitive - data and users would rather opt-out of the feature. read_simpleapi: a function for reading and parsing of the SimpleAPI contents. Used in tests. get_auth: A function to get auth information passed to read_simpleapi. Used in tests. @@ -87,7 +83,7 @@ def simpleapi_download( contents = {} index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi - if hasattr(ctx, "facts") and store_facts: + if hasattr(ctx, "facts"): download_kwargs["facts"] = _facts(ctx, attr.facts) read_simpleapi = _read_simpleapi_with_facts ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") From 5d3f8bff221fdf3cb5976e233cd5c38f64fdd123 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sat, 7 Feb 2026 13:16:06 +0900 Subject: [PATCH 07/13] wip --- python/private/pypi/simpleapi_download.bzl | 131 +++++++++++---------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 18065b5adb..2e598fbbfe 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -83,18 +83,16 @@ def simpleapi_download( contents = {} index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi - if hasattr(ctx, "facts"): - download_kwargs["facts"] = _facts(ctx, attr.facts) - read_simpleapi = _read_simpleapi_with_facts - ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") - else: - ctx.report_progress("Fetch package lists from PyPI index") + cache = _cache(ctx, cache, attr.facts) found_on_index = {} warn_overrides = False # Normalize the inputs - input_sources = {pkg: [] for pkg in attr.sources} if type(attr.sources) == "list" else attr.sources + if type(attr.sources) == "list": + fail("TODO") + else: + input_sources = attr.sources for i, index_url in enumerate(index_urls): if i != 0: @@ -284,24 +282,24 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_version A similar object to what `download` would return except that in result.out will be the parsed simple api contents. """ + + index_url = index_url.rstrip("/") + # NOTE @aignas 2024-03-31: some of the simple APIs use relative URLs for # the whl location and we cannot handle multiple URLs at once by passing # them to ctx.download if we want to correctly handle the relative URLs. # TODO: Add a test that env subbed index urls do not leak into the lock file. - url = "{}/{}/".format(index_url.rstrip("/"), distribution) + cached = cache.get(index_url, distribution, requested_versions) + if cached: + return struct(success = True, output = cached) + url = "{}/{}/".format(index_url, distribution) real_url = strip_empty_path_segments(envsubst( url, attr.envsubst, ctx.getenv if hasattr(ctx, "getenv") else ctx.os.environ.get, )) - cache_key = real_url - - cached = cache.get(cache_key) - if cached: - cached = _filter_packages(cached, requested_versions) - return struct(success = True, output = cached) download = _download_simpleapi( ctx = ctx, @@ -315,75 +313,78 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_version return _await( download, _read_index_result, - url = real_url, + index_url = index_url, + distribution = distribution, + real_url = real_url, cache = cache, - cache_key = cache_key, return_absolute = return_absolute, requested_versions = requested_versions, ) -def _read_index_result(*, result, url, cache, cache_key, return_absolute, requested_versions): +def _read_index_result(*, result, index_url, distribution, real_url, cache, return_absolute, requested_versions): if not result.success or not result.output: return struct(success = False) - output = parse_simpleapi_html(url = url, content = result.output, return_absolute = return_absolute) + output = parse_simpleapi_html(url = real_url, content = result.output, return_absolute = return_absolute) if not output: return struct(success = False) - cache.setdefault(cache_key, output) - output = _filter_packages(output, requested_versions) + cache.setdefault(index_url, distribution, requested_versions, output) return struct(success = True, output = output) -def _read_simpleapi_with_facts(ctx, index_url, distribution, facts = None, requested_versions = [], **download_kwargs): - """Read SimpleAPI or pull data from the known facts in the MODULE.bazel.lock file. +def _cache(ctx, cache, facts): + if hasattr(ctx, "facts"): + facts = _facts(ctx, facts) + ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") + else: + ctx.report_progress("Fetch package lists from PyPI index") - Args: - ctx: The module_ctx or repository_ctx. - index_url: TODO - distribution: TODO - requested_versions: the list of requested versions. - facts: Facts to write back if we support facts. - **download_kwargs: Params passed to the _read_simpleapi. + return struct( + get = lambda index_url, distribution, versions: _cache_get( + cache, + facts, + index_url, + distribution, + versions, + ), + setdefault = lambda index_url, distribution, versions, value: _cache_setdefault( + cache, + facts, + index_url, + distribution, + versions, + value, + ), + ) - Returns: - A similar object to what `download` would return except that in result.out - will be the parsed simple api contents. We return only the `requested_versions` so that - the list of facts stored in the lock file is limited. - """ - cached = facts.get(index_url, distribution, requested_versions) - if cached: - return struct( - success = True, - output = cached, - ) +def _cache_get(cache, facts, index_url, distribution, versions): + if facts: + cached = facts.get(index_url, distribution, versions) + if cached: + return cached - # Fallback to pulling data from memory or downloading from the SimpleAPI - download = _read_simpleapi( - ctx, - index_url = index_url, - distribution = distribution, - requested_versions = requested_versions, - return_absolute = False, - **download_kwargs - ) + cached = cache.get((index_url, distribution)) + if not cached: + return None - return _await( - download, - _read_index_result_with_facts, - facts = facts, - requested_versions = requested_versions, - index_url = index_url, - ) + cached = _filter_packages(cached, versions) + if facts: + # Ensure that we write back to the facts, this happens if we request versions that + # we don't have facts for but we have in-memory cache of SimpleAPI query results + facts.setdefault(index_url, distribution, versions, cached) + return cached -def _read_index_result_with_facts(*, result, facts, requested_versions, index_url): - if not result.success or not result.output: - return struct(success = False) +def _cache_setdefault(cache, facts, index_url, distribution, versions, value): + cache.setdefault((index_url, distribution), value) + value = _filter_packages(value, versions) - output = _filter_packages(result.output, requested_versions) - facts.setdefault(index_url, output) - return struct(success = True, output = output) + if facts: + facts.setdefault(index_url, distribution, value) def _filter_packages(dists, requested_versions): + if dists == None: + return None + sha256s_by_version = {} whls = {} sdists = {} @@ -404,21 +405,21 @@ def _filter_packages(dists, requested_versions): sha256s_by_version = sha256s_by_version, ) -def _facts(ctx, store_facts, facts_version = _FACT_VERSION): +def _facts(ctx, facts, facts_version = _FACT_VERSION): known_facts = getattr(ctx, "facts", None) if not known_facts: return {} return struct( get = lambda index_url, distribution, versions: _get_from_facts( - store_facts, + facts, known_facts, index_url, distribution, versions, facts_version, ), - setdefault = lambda url, value: _store_facts(store_facts, facts_version, url, value), + setdefault = lambda url, distribution, value: _store_facts(facts, facts_version, url, value), ) def _get_from_facts(facts, known_facts, index_url, distribution, requested_versions, facts_version): From a84d8cb4e720b2a0531a164a6e03b8bf2c6e553d Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sat, 7 Feb 2026 13:44:11 +0900 Subject: [PATCH 08/13] wip --- python/private/pypi/simpleapi_download.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 2e598fbbfe..1d660b7c90 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -371,7 +371,7 @@ def _cache_get(cache, facts, index_url, distribution, versions): if facts: # Ensure that we write back to the facts, this happens if we request versions that # we don't have facts for but we have in-memory cache of SimpleAPI query results - facts.setdefault(index_url, distribution, versions, cached) + facts.setdefault(index_url, distribution, cached) return cached def _cache_setdefault(cache, facts, index_url, distribution, versions, value): From 4fb0d0ea0166a3cfd65a880a16867bb8caa8e808 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sun, 8 Feb 2026 01:06:02 +0900 Subject: [PATCH 09/13] wip --- python/private/pypi/simpleapi_download.bzl | 54 +++++-- .../simpleapi_download_tests.bzl | 137 +++++++++++++++++- 2 files changed, 169 insertions(+), 22 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 1d660b7c90..837a470e8e 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -83,7 +83,13 @@ def simpleapi_download( contents = {} index_urls = [attr.index_url] + attr.extra_index_urls read_simpleapi = read_simpleapi or _read_simpleapi - cache = _cache(ctx, cache, attr.facts) + + if attr.facts: + ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") + else: + ctx.report_progress("Fetch package lists from PyPI index") + + cache = simpleapi_cache(cache, getattr(ctx, "facts", None), attr.facts) found_on_index = {} warn_overrides = False @@ -332,12 +338,21 @@ def _read_index_result(*, result, index_url, distribution, real_url, cache, retu cache.setdefault(index_url, distribution, requested_versions, output) return struct(success = True, output = output) -def _cache(ctx, cache, facts): - if hasattr(ctx, "facts"): - facts = _facts(ctx, facts) - ctx.report_progress("Fetch package lists from PyPI index or read from MODULE.bazel.lock") - else: - ctx.report_progress("Fetch package lists from PyPI index") +def simpleapi_cache(cache = None, known_facts = None, facts = None): + """SimpleAPI cache for making fewer calls. + + Args: + cache: the storage to store things in memory. + known_facts: the storage to retrieve known facts. + facts: the storage to store things in the lock file after we evaluate the extension. + + Returns: + struct with 2 methods, `get` and `setdefault`. + """ + if cache == None: + cache = {} + if facts != None: + facts = _facts(known_facts, facts) return struct( get = lambda index_url, distribution, versions: _cache_get( @@ -388,27 +403,32 @@ def _filter_packages(dists, requested_versions): sha256s_by_version = {} whls = {} sdists = {} - for sha256, d in (dists.sdists | dists.whls).items(): + for sha256, d in dists.sdists.items(): if d.version not in requested_versions: continue - if d.filename.endswith(".whl"): - whls[sha256] = d - else: - sdists[sha256] = d + sdists[sha256] = d + sha256s_by_version.setdefault(d.version, []).append(sha256) + + for sha256, d in dists.whls.items(): + if d.version not in requested_versions: + continue + whls[sha256] = d sha256s_by_version.setdefault(d.version, []).append(sha256) + if not whls and not sdists: + return None + return struct( whls = whls, sdists = sdists, sha256s_by_version = sha256s_by_version, ) -def _facts(ctx, facts, facts_version = _FACT_VERSION): - known_facts = getattr(ctx, "facts", None) - if not known_facts: - return {} +def _facts(known_facts, facts, facts_version = _FACT_VERSION): + if known_facts == None: + return None return struct( get = lambda index_url, distribution, versions: _get_from_facts( @@ -480,6 +500,8 @@ def _store_facts(facts, fact_version, index_url, value): from this that can be achieved using pure Starlark functions should be done in Starlark. """ + if not value: + return value facts["fact_version"] = fact_version diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl index e8013c3ab7..0da3a1e16a 100644 --- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -15,7 +15,13 @@ "" load("@rules_testing//lib:test_suite.bzl", "test_suite") -load("//python/private/pypi:simpleapi_download.bzl", "simpleapi_download", "strip_empty_path_segments") # buildifier: disable=bzl-visibility +load("@rules_testing//lib:truth.bzl", "subjects") +load( + "//python/private/pypi:simpleapi_download.bzl", + "simpleapi_cache", + "simpleapi_download", + "strip_empty_path_segments", +) # buildifier: disable=bzl-visibility _tests = [] @@ -52,8 +58,9 @@ def _test_simple(env): index_url_overrides = {}, index_url = "main", extra_index_urls = ["extra"], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = True, @@ -115,8 +122,9 @@ def _test_fail(env): }, index_url = "main", extra_index_urls = ["extra"], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = True, @@ -168,8 +176,9 @@ def _test_download_url(env): index_url_overrides = {}, index_url = "https://example.com/main/simple/", extra_index_urls = [], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = False, @@ -204,8 +213,9 @@ def _test_download_url_parallel(env): index_url_overrides = {}, index_url = "https://example.com/main/simple/", extra_index_urls = [], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = [], + facts = None, ), cache = {}, parallel_download = True, @@ -240,8 +250,9 @@ def _test_download_envsubst_url(env): index_url_overrides = {}, index_url = "$INDEX_URL", extra_index_urls = [], - sources = ["foo", "bar", "baz"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, envsubst = ["INDEX_URL"], + facts = None, ), cache = {}, parallel_download = False, @@ -266,6 +277,120 @@ def _test_strip_empty_path_segments(env): _tests.append(_test_strip_empty_path_segments) +def _expect_cache_result(env, cache, key, sdists, whls): + got = env.expect.that_struct( + cache.get(*key), + attrs = dict( + whls = subjects.dict, + sdists = subjects.dict, + ), + ) + got.whls().contains_exactly(whls) + got.sdists().contains_exactly(sdists) + +def _test_cache_without_facts(env): + cache = simpleapi_cache() + + env.expect.that_str(cache.get("index_url", "distro", ["1.0"])).equals(None) + cache.setdefault("index_url", "distro", ["1.0"], struct( + sdists = { + "a": struct(version = "1.0"), + }, + whls = { + "b": struct(version = "1.0"), + "c": struct(version = "1.1"), + "d": struct(version = "1.1"), + "e": struct(version = "1.0"), + }, + )) + _expect_cache_result( + env, + cache, + key = ("index_url", "distro", ["1.0"]), + sdists = {"a": struct(version = "1.0")}, + whls = {"b": struct(version = "1.0"), "e": struct(version = "1.0")}, + ) + env.expect.that_str(cache.get("index_url", "distro", ["1.2"])).equals(None) + + _expect_cache_result( + env, + cache, + key = ("index_url", "distro", ["1.1"]), + sdists = {}, + whls = {"c": struct(version = "1.1"), "d": struct(version = "1.1")}, + ) + +_tests.append(_test_cache_without_facts) + +def _test_cache_with_facts(env): + facts = {} + cache = simpleapi_cache( + cache = {}, + known_facts = {}, + facts = facts, + ) + + env.expect.that_str(cache.get("index_url", "distro", ["1.0"])).equals(None) + cache.setdefault("index_url", "distro", ["1.0"], struct( + sdists = { + "a": struct(version = "1.0", url = "url//a.tgz", filename = "a.tgz", yanked = False), + }, + whls = { + "b": struct(version = "1.0", url = "url//b.whl", filename = "b.whl", yanked = False), + "c": struct(version = "1.1", url = "url//c.whl", filename = "c.whl", yanked = False), + "d": struct(version = "1.1", url = "url//d.whl", filename = "d.whl", yanked = False), + "e": struct(version = "1.0", url = "url//e.whl", filename = "e.whl", yanked = False), + }, + )) + _expect_cache_result( + env, + cache, + key = ("index_url", "distro", ["1.0"]), + sdists = { + "a": struct(version = "1.0", url = "url//a.tgz", filename = "a.tgz", yanked = False), + }, + whls = { + "b": struct(version = "1.0", url = "url//b.whl", filename = "b.whl", yanked = False), + "e": struct(version = "1.0", url = "url//e.whl", filename = "e.whl", yanked = False), + }, + ) + env.expect.that_str(cache.get("index_url", "distro", ["1.2"])).equals(None) + env.expect.that_dict(facts).contains_exactly({ + "fact_version": "v1", + "index_url": { + "dist_hashes": { + "url//a.tgz": "a", + "url//b.whl": "b", + "url//e.whl": "e", + }, + }, + }) + + _expect_cache_result( + env, + cache, + key = ("index_url", "distro", ["1.1"]), + sdists = {}, + whls = { + "c": struct(version = "1.1", url = "url//c.whl", filename = "c.whl", yanked = False), + "d": struct(version = "1.1", url = "url//d.whl", filename = "d.whl", yanked = False), + }, + ) + env.expect.that_dict(facts).contains_exactly({ + "fact_version": "v1", + "index_url": { + "dist_hashes": { + "url//a.tgz": "a", + "url//b.whl": "b", + "url//c.whl": "c", + "url//d.whl": "d", + "url//e.whl": "e", + }, + }, + }) + +_tests.append(_test_cache_with_facts) + def simpleapi_download_test_suite(name): """Create the test suite. From ec50bb5d2c25c853184d94969ae40b6e8149c1f6 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sun, 8 Feb 2026 01:39:36 +0900 Subject: [PATCH 10/13] wip --- python/private/pypi/simpleapi_download.bzl | 44 +++++++--- .../simpleapi_download_tests.bzl | 81 ++++++++++++++----- 2 files changed, 93 insertions(+), 32 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 837a470e8e..d545ebff87 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -264,7 +264,7 @@ def strip_empty_path_segments(url): else: return "{}://{}".format(scheme, stripped) -def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_versions, get_auth = None, return_absolute = True, **download_kwargs): +def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_versions, get_auth = None, **download_kwargs): """Read SimpleAPI. Args: @@ -280,7 +280,6 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_version cache: A dict for storing the results. get_auth: A function to get auth information. Used in tests. requested_versions: the list of requested versions. - return_absolute: TODO **download_kwargs: Any extra params to ctx.download. Note that output and auth will be passed for you. @@ -323,15 +322,15 @@ def _read_simpleapi(ctx, index_url, distribution, attr, cache, requested_version distribution = distribution, real_url = real_url, cache = cache, - return_absolute = return_absolute, requested_versions = requested_versions, ) -def _read_index_result(*, result, index_url, distribution, real_url, cache, return_absolute, requested_versions): +def _read_index_result(*, result, index_url, distribution, real_url, cache, requested_versions): if not result.success or not result.output: return struct(success = False) - output = parse_simpleapi_html(url = real_url, content = result.output, return_absolute = return_absolute) + # TODO @aignas 2026-02-08: make this the only behaviour, maybe can get rid of `real_url + output = parse_simpleapi_html(url = real_url, content = result.output, return_absolute = False) if not output: return struct(success = False) @@ -382,7 +381,7 @@ def _cache_get(cache, facts, index_url, distribution, versions): if not cached: return None - cached = _filter_packages(cached, versions) + cached = _filter_packages(cached, versions, index_url, distribution) if facts: # Ensure that we write back to the facts, this happens if we request versions that # we don't have facts for but we have in-memory cache of SimpleAPI query results @@ -391,12 +390,12 @@ def _cache_get(cache, facts, index_url, distribution, versions): def _cache_setdefault(cache, facts, index_url, distribution, versions, value): cache.setdefault((index_url, distribution), value) - value = _filter_packages(value, versions) + value = _filter_packages(value, versions, index_url, distribution) if facts: facts.setdefault(index_url, distribution, value) -def _filter_packages(dists, requested_versions): +def _filter_packages(dists, requested_versions, index_url, distribution): if dists == None: return None @@ -407,14 +406,14 @@ def _filter_packages(dists, requested_versions): if d.version not in requested_versions: continue - sdists[sha256] = d + sdists[sha256] = _with_absolute_url(d, index_url, distribution) sha256s_by_version.setdefault(d.version, []).append(sha256) for sha256, d in dists.whls.items(): if d.version not in requested_versions: continue - whls[sha256] = d + whls[sha256] = _with_absolute_url(d, index_url, distribution) sha256s_by_version.setdefault(d.version, []).append(sha256) if not whls and not sdists: @@ -440,6 +439,8 @@ def _facts(known_facts, facts, facts_version = _FACT_VERSION): facts_version, ), setdefault = lambda url, distribution, value: _store_facts(facts, facts_version, url, value), + known_facts = known_facts, + facts = facts, ) def _get_from_facts(facts, known_facts, index_url, distribution, requested_versions, facts_version): @@ -476,8 +477,6 @@ def _get_from_facts(facts, known_facts, index_url, distribution, requested_versi filename = filename, version = version, url = absolute_url(index_url = index_url_for_distro, url = url), - metadata_sha256 = "", - metadata_url = "", yanked = known_facts.get("dist_yanked", {}).get(sha256, False), )) @@ -492,6 +491,27 @@ def _get_from_facts(facts, known_facts, index_url, distribution, requested_versi _store_facts(facts, facts_version, index_url, output) return output +def _with_absolute_url(d, index_url, distribution): + index_url_for_distro = "{}/{}/".format(index_url.rstrip("/"), distribution) + + # TODO @aignas 2026-02-08: think of a better way to do this + kwargs = dict() + for attr in [ + "sha256", + "filename", + "version", + "metadata_sha256", + "metadata_url", + "yanked", + "url", + ]: + if hasattr(d, attr): + kwargs[attr] = getattr(d, attr) + if attr == "url": + kwargs[attr] = absolute_url(index_url = index_url_for_distro, url = kwargs[attr]) + + return struct(**kwargs) + def _store_facts(facts, fact_version, index_url, value): """Store values as facts in the lock file. diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl index 0da3a1e16a..0286916e8a 100644 --- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -324,22 +324,38 @@ _tests.append(_test_cache_without_facts) def _test_cache_with_facts(env): facts = {} + cache_storage = {} + known_facts = {} cache = simpleapi_cache( - cache = {}, - known_facts = {}, + cache = cache_storage, + known_facts = known_facts, facts = facts, ) + sdists = struct( + a = struct(version = "1.0", url = "https://index_url/distro/a-1.0.tgz", filename = "a-1.0.tgz", yanked = False, sha256 = "a"), + ) + + # TODO @aignas 2026-02-08: url is relative + # TODO @aignas 2026-02-08: url starts with `/` + # TODO @aignas 2026-02-08: url is absolute + whls = struct( + b = struct(version = "1.0", url = "https://index_url/distro/b-1.0-tail.whl", filename = "b-1.0-tail.whl", yanked = False, sha256 = "b"), + c = struct(version = "1.1", url = "https://index_url/distro/c-1.1-tail.whl", filename = "c-1.1-tail.whl", yanked = False, sha256 = "c"), + d = struct(version = "1.1", url = "https://index_url/distro/d-1.1-tail.whl", filename = "d-1.1-tail.whl", yanked = False, sha256 = "d"), + e = struct(version = "1.0", url = "https://index_url/distro/e-1.0-tail.whl", filename = "e-1.0-tail.whl", yanked = False, sha256 = "e"), + ) + env.expect.that_str(cache.get("index_url", "distro", ["1.0"])).equals(None) cache.setdefault("index_url", "distro", ["1.0"], struct( sdists = { - "a": struct(version = "1.0", url = "url//a.tgz", filename = "a.tgz", yanked = False), + "a": sdists.a, }, whls = { - "b": struct(version = "1.0", url = "url//b.whl", filename = "b.whl", yanked = False), - "c": struct(version = "1.1", url = "url//c.whl", filename = "c.whl", yanked = False), - "d": struct(version = "1.1", url = "url//d.whl", filename = "d.whl", yanked = False), - "e": struct(version = "1.0", url = "url//e.whl", filename = "e.whl", yanked = False), + "b": whls.b, + "c": whls.c, + "d": whls.d, + "e": whls.e, }, )) _expect_cache_result( @@ -347,11 +363,11 @@ def _test_cache_with_facts(env): cache, key = ("index_url", "distro", ["1.0"]), sdists = { - "a": struct(version = "1.0", url = "url//a.tgz", filename = "a.tgz", yanked = False), + "a": sdists.a, }, whls = { - "b": struct(version = "1.0", url = "url//b.whl", filename = "b.whl", yanked = False), - "e": struct(version = "1.0", url = "url//e.whl", filename = "e.whl", yanked = False), + "b": whls.b, + "e": whls.e, }, ) env.expect.that_str(cache.get("index_url", "distro", ["1.2"])).equals(None) @@ -359,32 +375,57 @@ def _test_cache_with_facts(env): "fact_version": "v1", "index_url": { "dist_hashes": { - "url//a.tgz": "a", - "url//b.whl": "b", - "url//e.whl": "e", + sdists.a.url: "a", + whls.b.url: "b", + whls.e.url: "e", + }, + }, + }) + + _expect_cache_result( + env, + cache, + key = ("index_url", "distro", ["1.1"]), + sdists = {}, + whls = { + "c": whls.c, + "d": whls.d, + }, + ) + env.expect.that_dict(facts).contains_exactly({ + "fact_version": "v1", + "index_url": { + "dist_hashes": { + sdists.a.url: "a", + whls.b.url: "b", + whls.c.url: "c", + whls.d.url: "d", + whls.e.url: "e", }, }, }) + # simulate write to MODULE.bazel.lock + known_facts.update(facts) + facts.clear() + cache_storage.clear() + _expect_cache_result( env, cache, key = ("index_url", "distro", ["1.1"]), sdists = {}, whls = { - "c": struct(version = "1.1", url = "url//c.whl", filename = "c.whl", yanked = False), - "d": struct(version = "1.1", url = "url//d.whl", filename = "d.whl", yanked = False), + "c": whls.c, + "d": whls.d, }, ) env.expect.that_dict(facts).contains_exactly({ "fact_version": "v1", "index_url": { "dist_hashes": { - "url//a.tgz": "a", - "url//b.whl": "b", - "url//c.whl": "c", - "url//d.whl": "d", - "url//e.whl": "e", + whls.c.url: "c", + whls.d.url: "d", }, }, }) From 97b8be1f461de6182c2cbc7deb1d4cf3647976f8 Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Sun, 8 Feb 2026 23:56:14 +0900 Subject: [PATCH 11/13] finish the in memory --- python/private/pypi/simpleapi_download.bzl | 116 ++- .../simpleapi_download_tests.bzl | 659 ++++++++---------- 2 files changed, 373 insertions(+), 402 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index d545ebff87..25ff73469b 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -89,7 +89,10 @@ def simpleapi_download( else: ctx.report_progress("Fetch package lists from PyPI index") - cache = simpleapi_cache(cache, getattr(ctx, "facts", None), attr.facts) + cache = simpleapi_cache( + memory_cache = memory_cache(cache), + facts_cache = facts_cache(getattr(ctx, "facts", None), attr.facts), + ) found_on_index = {} warn_overrides = False @@ -337,33 +340,27 @@ def _read_index_result(*, result, index_url, distribution, real_url, cache, requ cache.setdefault(index_url, distribution, requested_versions, output) return struct(success = True, output = output) -def simpleapi_cache(cache = None, known_facts = None, facts = None): +def simpleapi_cache(memory_cache, facts_cache): """SimpleAPI cache for making fewer calls. Args: - cache: the storage to store things in memory. - known_facts: the storage to retrieve known facts. - facts: the storage to store things in the lock file after we evaluate the extension. + memory_cache: the storage to store things in memory. + facts_cache: the storage to retrieve known facts. Returns: struct with 2 methods, `get` and `setdefault`. """ - if cache == None: - cache = {} - if facts != None: - facts = _facts(known_facts, facts) - return struct( get = lambda index_url, distribution, versions: _cache_get( - cache, - facts, + memory_cache, + facts_cache, index_url, distribution, versions, ), setdefault = lambda index_url, distribution, versions, value: _cache_setdefault( - cache, - facts, + memory_cache, + facts_cache, index_url, distribution, versions, @@ -372,20 +369,20 @@ def simpleapi_cache(cache = None, known_facts = None, facts = None): ) def _cache_get(cache, facts, index_url, distribution, versions): - if facts: - cached = facts.get(index_url, distribution, versions) - if cached: - return cached + if not facts: + return cache.get(index_url, distribution, versions) - cached = cache.get((index_url, distribution)) + cached = facts.get(index_url, distribution, versions) + if cached: + return cached + + cached = cache.get(index_url, distribution, versions) if not cached: return None - cached = _filter_packages(cached, versions, index_url, distribution) - if facts: - # Ensure that we write back to the facts, this happens if we request versions that - # we don't have facts for but we have in-memory cache of SimpleAPI query results - facts.setdefault(index_url, distribution, cached) + # Ensure that we write back to the facts, this happens if we request versions that + # we don't have facts for but we have in-memory cache of SimpleAPI query results + facts.setdefault(index_url, distribution, cached) return cached def _cache_setdefault(cache, facts, index_url, distribution, versions, value): @@ -395,10 +392,78 @@ def _cache_setdefault(cache, facts, index_url, distribution, versions, value): if facts: facts.setdefault(index_url, distribution, value) +def memory_cache(cache = None): + """SimpleAPI cache for making fewer calls. + + Args: + cache: the storage to store things in memory. + + Returns: + struct with 2 methods, `get` and `setdefault`. + """ + if cache == None: + cache = {} + + return struct( + get = lambda index_url, distribution, versions: _memcache_get( + cache, + index_url, + distribution, + versions, + ), + setdefault = lambda index_url, distribution, versions, value: _memcache_setdefault( + cache, + index_url, + distribution, + versions, + value, + ), + ) + +def _vkey(versions): + if not versions: + return "" + + if len(versions) == 1: + return versions[0] + + return ",".join(sorted(versions)) + +def _memcache_get(cache, index_url, distribution, versions): + if not versions: + return cache.get((index_url, distribution, "")) + + vkey = _vkey(versions) + filtered = cache.get((index_url, distribution, vkey)) + if filtered: + return filtered + + unfiltered = cache.get((index_url, distribution, "")) + if not unfiltered: + return None + + filtered = _filter_packages(unfiltered, versions, index_url, distribution) + cache.setdefault((index_url, distribution, vkey), filtered) + return filtered + +def _memcache_setdefault(cache, index_url, distribution, versions, value): + cache.setdefault((index_url, distribution, ""), value) + if not versions: + return value + + filtered = _filter_packages(value, versions, index_url, distribution) + + vkey = _vkey(versions) + cache.setdefault((index_url, distribution, vkey), filtered) + return filtered + def _filter_packages(dists, requested_versions, index_url, distribution): if dists == None: return None + if not requested_versions: + return dists + sha256s_by_version = {} whls = {} sdists = {} @@ -425,7 +490,7 @@ def _filter_packages(dists, requested_versions, index_url, distribution): sha256s_by_version = sha256s_by_version, ) -def _facts(known_facts, facts, facts_version = _FACT_VERSION): +def facts_cache(known_facts, facts, facts_version = _FACT_VERSION): if known_facts == None: return None @@ -495,6 +560,7 @@ def _with_absolute_url(d, index_url, distribution): index_url_for_distro = "{}/{}/".format(index_url.rstrip("/"), distribution) # TODO @aignas 2026-02-08: think of a better way to do this + # TODO @aignas 2026-02-08: if the url is absolute, return d kwargs = dict() for attr in [ "sha256", diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl index 0286916e8a..d29a0fb6b0 100644 --- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -18,265 +18,263 @@ load("@rules_testing//lib:test_suite.bzl", "test_suite") load("@rules_testing//lib:truth.bzl", "subjects") load( "//python/private/pypi:simpleapi_download.bzl", - "simpleapi_cache", - "simpleapi_download", - "strip_empty_path_segments", + "memory_cache", ) # buildifier: disable=bzl-visibility _tests = [] -def _test_simple(env): - calls = [] - - def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): - _ = ctx # buildifier: disable=unused-variable - _ = distribution - _ = requested_versions - _ = attr - _ = cache - _ = get_auth - env.expect.that_bool(block).equals(False) - url = "{}/{}/".format(index_url, distribution) - calls.append(url) - if "foo" in url and "main" in url: - return struct( - output = "", - success = False, - ) - else: - return struct( - output = "data from {}".format(url), - success = True, - ) - - contents = simpleapi_download( - ctx = struct( - os = struct(environ = {}), - report_progress = lambda _: None, - ), - attr = struct( - index_url_overrides = {}, - index_url = "main", - extra_index_urls = ["extra"], - sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, - envsubst = [], - facts = None, - ), - cache = {}, - parallel_download = True, - read_simpleapi = read_simpleapi, - ) - - env.expect.that_collection(calls).contains_exactly([ - "extra/foo/", - "main/bar/", - "main/baz/", - "main/foo/", - ]) - env.expect.that_dict(contents).contains_exactly({ - "bar": "data from main/bar/", - "baz": "data from main/baz/", - "foo": "data from extra/foo/", - }) - -_tests.append(_test_simple) - -def _test_fail(env): - calls = [] - fails = [] - - def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): - _ = ctx # buildifier: disable=unused-variable - _ = distribution - _ = requested_versions - _ = attr - _ = cache - _ = get_auth - url = "{}/{}/".format(index_url, distribution) - env.expect.that_bool(block).equals(False) - calls.append(url) - if "foo" in url: - return struct( - output = "", - success = False, - ) - if "bar" in url: - return struct( - output = "", - success = False, - ) - else: - return struct( - output = "data from {}".format(url), - success = True, - ) - - simpleapi_download( - ctx = struct( - os = struct(environ = {}), - report_progress = lambda _: None, - ), - attr = struct( - index_url_overrides = { - "foo": "invalid", - }, - index_url = "main", - extra_index_urls = ["extra"], - sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, - envsubst = [], - facts = None, - ), - cache = {}, - parallel_download = True, - read_simpleapi = read_simpleapi, - _fail = fails.append, - ) - - env.expect.that_collection(fails).contains_exactly([ - """ -Failed to download metadata of the following packages from urls: -{ - "foo": "invalid", - "bar": ["main", "extra"], -} - -If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[ - "foo", - "bar", -]' to your 'pip.parse' call. -""", - ]) - env.expect.that_collection(calls).contains_exactly([ - "invalid/foo/", - "main/bar/", - "main/baz/", - "invalid/foo/", - "extra/bar/", - ]) - -_tests.append(_test_fail) - -def _test_download_url(env): - downloads = {} - - def download(url, output, **kwargs): - _ = kwargs # buildifier: disable=unused-variable - downloads[url[0]] = output - return struct(success = True) - - simpleapi_download( - ctx = struct( - os = struct(environ = {}), - download = download, - report_progress = lambda _: None, - read = lambda i: "contents of " + i, - path = lambda i: "path/for/" + i, - ), - attr = struct( - index_url_overrides = {}, - index_url = "https://example.com/main/simple/", - extra_index_urls = [], - sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, - envsubst = [], - facts = None, - ), - cache = {}, - parallel_download = False, - get_auth = lambda ctx, urls, ctx_attr: struct(), - ) - - env.expect.that_dict(downloads).contains_exactly({ - "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", - "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", - "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", - }) - -_tests.append(_test_download_url) - -def _test_download_url_parallel(env): - downloads = {} - - def download(url, output, **kwargs): - _ = kwargs # buildifier: disable=unused-variable - downloads[url[0]] = output - return struct(wait = lambda: struct(success = True)) - - simpleapi_download( - ctx = struct( - os = struct(environ = {}), - download = download, - report_progress = lambda _: None, - read = lambda i: "contents of " + i, - path = lambda i: "path/for/" + i, - ), - attr = struct( - index_url_overrides = {}, - index_url = "https://example.com/main/simple/", - extra_index_urls = [], - sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, - envsubst = [], - facts = None, - ), - cache = {}, - parallel_download = True, - get_auth = lambda ctx, urls, ctx_attr: struct(), - ) - - env.expect.that_dict(downloads).contains_exactly({ - "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", - "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", - "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", - }) - -_tests.append(_test_download_url_parallel) - -def _test_download_envsubst_url(env): - downloads = {} - - def download(url, output, **kwargs): - _ = kwargs # buildifier: disable=unused-variable - downloads[url[0]] = output - return struct(success = True) - - simpleapi_download( - ctx = struct( - os = struct(environ = {"INDEX_URL": "https://example.com/main/simple/"}), - download = download, - report_progress = lambda _: None, - read = lambda i: "contents of " + i, - path = lambda i: "path/for/" + i, - ), - attr = struct( - index_url_overrides = {}, - index_url = "$INDEX_URL", - extra_index_urls = [], - sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, - envsubst = ["INDEX_URL"], - facts = None, - ), - cache = {}, - parallel_download = False, - get_auth = lambda ctx, urls, ctx_attr: struct(), - ) - - env.expect.that_dict(downloads).contains_exactly({ - "https://example.com/main/simple/bar/": "path/for/~index_url~_bar.html", - "https://example.com/main/simple/baz/": "path/for/~index_url~_baz.html", - "https://example.com/main/simple/foo/": "path/for/~index_url~_foo.html", - }) - -_tests.append(_test_download_envsubst_url) - -def _test_strip_empty_path_segments(env): - env.expect.that_str(strip_empty_path_segments("no/scheme//is/unchanged")).equals("no/scheme//is/unchanged") - env.expect.that_str(strip_empty_path_segments("scheme://with/no/empty/segments")).equals("scheme://with/no/empty/segments") - env.expect.that_str(strip_empty_path_segments("scheme://with//empty/segments")).equals("scheme://with/empty/segments") - env.expect.that_str(strip_empty_path_segments("scheme://with///multiple//empty/segments")).equals("scheme://with/multiple/empty/segments") - env.expect.that_str(strip_empty_path_segments("scheme://with//trailing/slash/")).equals("scheme://with/trailing/slash/") - env.expect.that_str(strip_empty_path_segments("scheme://with/trailing/slashes///")).equals("scheme://with/trailing/slashes/") - -_tests.append(_test_strip_empty_path_segments) - +# def _test_simple(env): +# calls = [] +# +# def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): +# _ = ctx # buildifier: disable=unused-variable +# _ = distribution +# _ = requested_versions +# _ = attr +# _ = cache +# _ = get_auth +# env.expect.that_bool(block).equals(False) +# url = "{}/{}/".format(index_url, distribution) +# calls.append(url) +# if "foo" in url and "main" in url: +# return struct( +# output = "", +# success = False, +# ) +# else: +# return struct( +# output = "data from {}".format(url), +# success = True, +# ) +# +# contents = simpleapi_download( +# ctx = struct( +# os = struct(environ = {}), +# report_progress = lambda _: None, +# ), +# attr = struct( +# index_url_overrides = {}, +# index_url = "main", +# extra_index_urls = ["extra"], +# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, +# envsubst = [], +# facts = None, +# ), +# cache = {}, +# parallel_download = True, +# read_simpleapi = read_simpleapi, +# ) +# +# env.expect.that_collection(calls).contains_exactly([ +# "extra/foo/", +# "main/bar/", +# "main/baz/", +# "main/foo/", +# ]) +# env.expect.that_dict(contents).contains_exactly({ +# "bar": "data from main/bar/", +# "baz": "data from main/baz/", +# "foo": "data from extra/foo/", +# }) +# +# _tests.append(_test_simple) +# +# def _test_fail(env): +# calls = [] +# fails = [] +# +# def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): +# _ = ctx # buildifier: disable=unused-variable +# _ = distribution +# _ = requested_versions +# _ = attr +# _ = cache +# _ = get_auth +# url = "{}/{}/".format(index_url, distribution) +# env.expect.that_bool(block).equals(False) +# calls.append(url) +# if "foo" in url: +# return struct( +# output = "", +# success = False, +# ) +# if "bar" in url: +# return struct( +# output = "", +# success = False, +# ) +# else: +# return struct( +# output = "data from {}".format(url), +# success = True, +# ) +# +# simpleapi_download( +# ctx = struct( +# os = struct(environ = {}), +# report_progress = lambda _: None, +# ), +# attr = struct( +# index_url_overrides = { +# "foo": "invalid", +# }, +# index_url = "main", +# extra_index_urls = ["extra"], +# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, +# envsubst = [], +# facts = None, +# ), +# cache = {}, +# parallel_download = True, +# read_simpleapi = read_simpleapi, +# _fail = fails.append, +# ) +# +# env.expect.that_collection(fails).contains_exactly([ +# """ +# Failed to download metadata of the following packages from urls: +# { +# "foo": "invalid", +# "bar": ["main", "extra"], +# } +# +# If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[ +# "foo", +# "bar", +# ]' to your 'pip.parse' call. +# """, +# ]) +# env.expect.that_collection(calls).contains_exactly([ +# "invalid/foo/", +# "main/bar/", +# "main/baz/", +# "invalid/foo/", +# "extra/bar/", +# ]) +# +# _tests.append(_test_fail) +# +# def _test_download_url(env): +# downloads = {} +# +# def download(url, output, **kwargs): +# _ = kwargs # buildifier: disable=unused-variable +# downloads[url[0]] = output +# return struct(success = True) +# +# simpleapi_download( +# ctx = struct( +# os = struct(environ = {}), +# download = download, +# report_progress = lambda _: None, +# read = lambda i: "contents of " + i, +# path = lambda i: "path/for/" + i, +# ), +# attr = struct( +# index_url_overrides = {}, +# index_url = "https://example.com/main/simple/", +# extra_index_urls = [], +# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, +# envsubst = [], +# facts = None, +# ), +# cache = {}, +# parallel_download = False, +# get_auth = lambda ctx, urls, ctx_attr: struct(), +# ) +# +# env.expect.that_dict(downloads).contains_exactly({ +# "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", +# "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", +# "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", +# }) +# +# _tests.append(_test_download_url) +# +# def _test_download_url_parallel(env): +# downloads = {} +# +# def download(url, output, **kwargs): +# _ = kwargs # buildifier: disable=unused-variable +# downloads[url[0]] = output +# return struct(wait = lambda: struct(success = True)) +# +# simpleapi_download( +# ctx = struct( +# os = struct(environ = {}), +# download = download, +# report_progress = lambda _: None, +# read = lambda i: "contents of " + i, +# path = lambda i: "path/for/" + i, +# ), +# attr = struct( +# index_url_overrides = {}, +# index_url = "https://example.com/main/simple/", +# extra_index_urls = [], +# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, +# envsubst = [], +# facts = None, +# ), +# cache = {}, +# parallel_download = True, +# get_auth = lambda ctx, urls, ctx_attr: struct(), +# ) +# +# env.expect.that_dict(downloads).contains_exactly({ +# "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", +# "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", +# "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", +# }) +# +# _tests.append(_test_download_url_parallel) +# +# def _test_download_envsubst_url(env): +# downloads = {} +# +# def download(url, output, **kwargs): +# _ = kwargs # buildifier: disable=unused-variable +# downloads[url[0]] = output +# return struct(success = True) +# +# simpleapi_download( +# ctx = struct( +# os = struct(environ = {"INDEX_URL": "https://example.com/main/simple/"}), +# download = download, +# report_progress = lambda _: None, +# read = lambda i: "contents of " + i, +# path = lambda i: "path/for/" + i, +# ), +# attr = struct( +# index_url_overrides = {}, +# index_url = "$INDEX_URL", +# extra_index_urls = [], +# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, +# envsubst = ["INDEX_URL"], +# facts = None, +# ), +# cache = {}, +# parallel_download = False, +# get_auth = lambda ctx, urls, ctx_attr: struct(), +# ) +# +# env.expect.that_dict(downloads).contains_exactly({ +# "https://example.com/main/simple/bar/": "path/for/~index_url~_bar.html", +# "https://example.com/main/simple/baz/": "path/for/~index_url~_baz.html", +# "https://example.com/main/simple/foo/": "path/for/~index_url~_foo.html", +# }) +# +# _tests.append(_test_download_envsubst_url) +# +# def _test_strip_empty_path_segments(env): +# env.expect.that_str(strip_empty_path_segments("no/scheme//is/unchanged")).equals("no/scheme//is/unchanged") +# env.expect.that_str(strip_empty_path_segments("scheme://with/no/empty/segments")).equals("scheme://with/no/empty/segments") +# env.expect.that_str(strip_empty_path_segments("scheme://with//empty/segments")).equals("scheme://with/empty/segments") +# env.expect.that_str(strip_empty_path_segments("scheme://with///multiple//empty/segments")).equals("scheme://with/multiple/empty/segments") +# env.expect.that_str(strip_empty_path_segments("scheme://with//trailing/slash/")).equals("scheme://with/trailing/slash/") +# env.expect.that_str(strip_empty_path_segments("scheme://with/trailing/slashes///")).equals("scheme://with/trailing/slashes/") +# +# _tests.append(_test_strip_empty_path_segments) +# def _expect_cache_result(env, cache, key, sdists, whls): got = env.expect.that_struct( cache.get(*key), @@ -288,149 +286,56 @@ def _expect_cache_result(env, cache, key, sdists, whls): got.whls().contains_exactly(whls) got.sdists().contains_exactly(sdists) -def _test_cache_without_facts(env): - cache = simpleapi_cache() - - env.expect.that_str(cache.get("index_url", "distro", ["1.0"])).equals(None) - cache.setdefault("index_url", "distro", ["1.0"], struct( +def _test_memory_cache(env): + memory = {} + cache = memory_cache(memory) + all_packages = struct( sdists = { - "a": struct(version = "1.0"), + "aa": struct(version = "1.0"), + "ab": struct(version = "1.1"), }, whls = { - "b": struct(version = "1.0"), - "c": struct(version = "1.1"), - "d": struct(version = "1.1"), - "e": struct(version = "1.0"), + "ba": struct(version = "1.0"), + "bb": struct(version = "1.1"), }, - )) - _expect_cache_result( - env, - cache, - key = ("index_url", "distro", ["1.0"]), - sdists = {"a": struct(version = "1.0")}, - whls = {"b": struct(version = "1.0"), "e": struct(version = "1.0")}, - ) - env.expect.that_str(cache.get("index_url", "distro", ["1.2"])).equals(None) - - _expect_cache_result( - env, - cache, - key = ("index_url", "distro", ["1.1"]), - sdists = {}, - whls = {"c": struct(version = "1.1"), "d": struct(version = "1.1")}, - ) - -_tests.append(_test_cache_without_facts) - -def _test_cache_with_facts(env): - facts = {} - cache_storage = {} - known_facts = {} - cache = simpleapi_cache( - cache = cache_storage, - known_facts = known_facts, - facts = facts, - ) - - sdists = struct( - a = struct(version = "1.0", url = "https://index_url/distro/a-1.0.tgz", filename = "a-1.0.tgz", yanked = False, sha256 = "a"), ) - - # TODO @aignas 2026-02-08: url is relative - # TODO @aignas 2026-02-08: url starts with `/` - # TODO @aignas 2026-02-08: url is absolute - whls = struct( - b = struct(version = "1.0", url = "https://index_url/distro/b-1.0-tail.whl", filename = "b-1.0-tail.whl", yanked = False, sha256 = "b"), - c = struct(version = "1.1", url = "https://index_url/distro/c-1.1-tail.whl", filename = "c-1.1-tail.whl", yanked = False, sha256 = "c"), - d = struct(version = "1.1", url = "https://index_url/distro/d-1.1-tail.whl", filename = "d-1.1-tail.whl", yanked = False, sha256 = "d"), - e = struct(version = "1.0", url = "https://index_url/distro/e-1.0-tail.whl", filename = "e-1.0-tail.whl", yanked = False, sha256 = "e"), - ) - - env.expect.that_str(cache.get("index_url", "distro", ["1.0"])).equals(None) - cache.setdefault("index_url", "distro", ["1.0"], struct( - sdists = { - "a": sdists.a, - }, - whls = { - "b": whls.b, - "c": whls.c, - "d": whls.d, - "e": whls.e, - }, - )) + cache.setdefault("index", "distro", None, all_packages) + env.expect.that_dict(memory).contains_exactly({ + ("index", "distro", ""): all_packages, + }) _expect_cache_result( env, cache, - key = ("index_url", "distro", ["1.0"]), + ("index", "distro", ["1.0"]), sdists = { - "a": sdists.a, + "aa": struct(version = "1.0"), }, whls = { - "b": whls.b, - "e": whls.e, + "ba": struct(version = "1.0"), }, ) - env.expect.that_str(cache.get("index_url", "distro", ["1.2"])).equals(None) - env.expect.that_dict(facts).contains_exactly({ - "fact_version": "v1", - "index_url": { - "dist_hashes": { - sdists.a.url: "a", - whls.b.url: "b", - whls.e.url: "e", - }, - }, - }) - + env.expect.that_dict(memory).keys().contains_exactly([ + ("index", "distro", ""), + ("index", "distro", "1.0"), + ]) _expect_cache_result( env, cache, - key = ("index_url", "distro", ["1.1"]), - sdists = {}, - whls = { - "c": whls.c, - "d": whls.d, - }, - ) - env.expect.that_dict(facts).contains_exactly({ - "fact_version": "v1", - "index_url": { - "dist_hashes": { - sdists.a.url: "a", - whls.b.url: "b", - whls.c.url: "c", - whls.d.url: "d", - whls.e.url: "e", - }, + ("index", "distro", ["1.1"]), + sdists = { + "ab": struct(version = "1.1"), }, - }) - - # simulate write to MODULE.bazel.lock - known_facts.update(facts) - facts.clear() - cache_storage.clear() - - _expect_cache_result( - env, - cache, - key = ("index_url", "distro", ["1.1"]), - sdists = {}, whls = { - "c": whls.c, - "d": whls.d, + "bb": struct(version = "1.1"), }, ) - env.expect.that_dict(facts).contains_exactly({ - "fact_version": "v1", - "index_url": { - "dist_hashes": { - whls.c.url: "c", - whls.d.url: "d", - }, - }, - }) + env.expect.that_dict(memory).keys().contains_exactly([ + ("index", "distro", ""), + ("index", "distro", "1.0"), + ("index", "distro", "1.1"), + ]) -_tests.append(_test_cache_with_facts) +_tests.append(_test_memory_cache) def simpleapi_download_test_suite(name): """Create the test suite. From 96ccbc693ac37a2f0f71d299df2f92b3bbf5970c Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Mon, 9 Feb 2026 00:05:36 +0900 Subject: [PATCH 12/13] wip --- python/private/pypi/simpleapi_download.bzl | 16 +- .../simpleapi_download_tests.bzl | 506 +++++++++--------- 2 files changed, 264 insertions(+), 258 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 25ff73469b..633b2b64dc 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -382,15 +382,16 @@ def _cache_get(cache, facts, index_url, distribution, versions): # Ensure that we write back to the facts, this happens if we request versions that # we don't have facts for but we have in-memory cache of SimpleAPI query results - facts.setdefault(index_url, distribution, cached) + facts.setdefault(index_url, distribution, versions, cached) return cached def _cache_setdefault(cache, facts, index_url, distribution, versions, value): - cache.setdefault((index_url, distribution), value) - value = _filter_packages(value, versions, index_url, distribution) + filtered = cache.setdefault(index_url, distribution, versions, value) - if facts: - facts.setdefault(index_url, distribution, value) + if facts and versions: + facts.setdefault(index_url, distribution, versions, filtered) + + return filtered def memory_cache(cache = None): """SimpleAPI cache for making fewer calls. @@ -425,7 +426,10 @@ def _vkey(versions): return "" if len(versions) == 1: - return versions[0] + if type(versions) == "dict": + return versions.keys()[0] + else: + return versions[0] return ",".join(sorted(versions)) diff --git a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl index d29a0fb6b0..2a0b1f8811 100644 --- a/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl +++ b/tests/pypi/simpleapi_download/simpleapi_download_tests.bzl @@ -19,262 +19,264 @@ load("@rules_testing//lib:truth.bzl", "subjects") load( "//python/private/pypi:simpleapi_download.bzl", "memory_cache", + "simpleapi_download", + "strip_empty_path_segments", ) # buildifier: disable=bzl-visibility _tests = [] -# def _test_simple(env): -# calls = [] -# -# def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): -# _ = ctx # buildifier: disable=unused-variable -# _ = distribution -# _ = requested_versions -# _ = attr -# _ = cache -# _ = get_auth -# env.expect.that_bool(block).equals(False) -# url = "{}/{}/".format(index_url, distribution) -# calls.append(url) -# if "foo" in url and "main" in url: -# return struct( -# output = "", -# success = False, -# ) -# else: -# return struct( -# output = "data from {}".format(url), -# success = True, -# ) -# -# contents = simpleapi_download( -# ctx = struct( -# os = struct(environ = {}), -# report_progress = lambda _: None, -# ), -# attr = struct( -# index_url_overrides = {}, -# index_url = "main", -# extra_index_urls = ["extra"], -# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, -# envsubst = [], -# facts = None, -# ), -# cache = {}, -# parallel_download = True, -# read_simpleapi = read_simpleapi, -# ) -# -# env.expect.that_collection(calls).contains_exactly([ -# "extra/foo/", -# "main/bar/", -# "main/baz/", -# "main/foo/", -# ]) -# env.expect.that_dict(contents).contains_exactly({ -# "bar": "data from main/bar/", -# "baz": "data from main/baz/", -# "foo": "data from extra/foo/", -# }) -# -# _tests.append(_test_simple) -# -# def _test_fail(env): -# calls = [] -# fails = [] -# -# def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): -# _ = ctx # buildifier: disable=unused-variable -# _ = distribution -# _ = requested_versions -# _ = attr -# _ = cache -# _ = get_auth -# url = "{}/{}/".format(index_url, distribution) -# env.expect.that_bool(block).equals(False) -# calls.append(url) -# if "foo" in url: -# return struct( -# output = "", -# success = False, -# ) -# if "bar" in url: -# return struct( -# output = "", -# success = False, -# ) -# else: -# return struct( -# output = "data from {}".format(url), -# success = True, -# ) -# -# simpleapi_download( -# ctx = struct( -# os = struct(environ = {}), -# report_progress = lambda _: None, -# ), -# attr = struct( -# index_url_overrides = { -# "foo": "invalid", -# }, -# index_url = "main", -# extra_index_urls = ["extra"], -# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, -# envsubst = [], -# facts = None, -# ), -# cache = {}, -# parallel_download = True, -# read_simpleapi = read_simpleapi, -# _fail = fails.append, -# ) -# -# env.expect.that_collection(fails).contains_exactly([ -# """ -# Failed to download metadata of the following packages from urls: -# { -# "foo": "invalid", -# "bar": ["main", "extra"], -# } -# -# If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[ -# "foo", -# "bar", -# ]' to your 'pip.parse' call. -# """, -# ]) -# env.expect.that_collection(calls).contains_exactly([ -# "invalid/foo/", -# "main/bar/", -# "main/baz/", -# "invalid/foo/", -# "extra/bar/", -# ]) -# -# _tests.append(_test_fail) -# -# def _test_download_url(env): -# downloads = {} -# -# def download(url, output, **kwargs): -# _ = kwargs # buildifier: disable=unused-variable -# downloads[url[0]] = output -# return struct(success = True) -# -# simpleapi_download( -# ctx = struct( -# os = struct(environ = {}), -# download = download, -# report_progress = lambda _: None, -# read = lambda i: "contents of " + i, -# path = lambda i: "path/for/" + i, -# ), -# attr = struct( -# index_url_overrides = {}, -# index_url = "https://example.com/main/simple/", -# extra_index_urls = [], -# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, -# envsubst = [], -# facts = None, -# ), -# cache = {}, -# parallel_download = False, -# get_auth = lambda ctx, urls, ctx_attr: struct(), -# ) -# -# env.expect.that_dict(downloads).contains_exactly({ -# "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", -# "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", -# "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", -# }) -# -# _tests.append(_test_download_url) -# -# def _test_download_url_parallel(env): -# downloads = {} -# -# def download(url, output, **kwargs): -# _ = kwargs # buildifier: disable=unused-variable -# downloads[url[0]] = output -# return struct(wait = lambda: struct(success = True)) -# -# simpleapi_download( -# ctx = struct( -# os = struct(environ = {}), -# download = download, -# report_progress = lambda _: None, -# read = lambda i: "contents of " + i, -# path = lambda i: "path/for/" + i, -# ), -# attr = struct( -# index_url_overrides = {}, -# index_url = "https://example.com/main/simple/", -# extra_index_urls = [], -# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, -# envsubst = [], -# facts = None, -# ), -# cache = {}, -# parallel_download = True, -# get_auth = lambda ctx, urls, ctx_attr: struct(), -# ) -# -# env.expect.that_dict(downloads).contains_exactly({ -# "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", -# "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", -# "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", -# }) -# -# _tests.append(_test_download_url_parallel) -# -# def _test_download_envsubst_url(env): -# downloads = {} -# -# def download(url, output, **kwargs): -# _ = kwargs # buildifier: disable=unused-variable -# downloads[url[0]] = output -# return struct(success = True) -# -# simpleapi_download( -# ctx = struct( -# os = struct(environ = {"INDEX_URL": "https://example.com/main/simple/"}), -# download = download, -# report_progress = lambda _: None, -# read = lambda i: "contents of " + i, -# path = lambda i: "path/for/" + i, -# ), -# attr = struct( -# index_url_overrides = {}, -# index_url = "$INDEX_URL", -# extra_index_urls = [], -# sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, -# envsubst = ["INDEX_URL"], -# facts = None, -# ), -# cache = {}, -# parallel_download = False, -# get_auth = lambda ctx, urls, ctx_attr: struct(), -# ) -# -# env.expect.that_dict(downloads).contains_exactly({ -# "https://example.com/main/simple/bar/": "path/for/~index_url~_bar.html", -# "https://example.com/main/simple/baz/": "path/for/~index_url~_baz.html", -# "https://example.com/main/simple/foo/": "path/for/~index_url~_foo.html", -# }) -# -# _tests.append(_test_download_envsubst_url) -# -# def _test_strip_empty_path_segments(env): -# env.expect.that_str(strip_empty_path_segments("no/scheme//is/unchanged")).equals("no/scheme//is/unchanged") -# env.expect.that_str(strip_empty_path_segments("scheme://with/no/empty/segments")).equals("scheme://with/no/empty/segments") -# env.expect.that_str(strip_empty_path_segments("scheme://with//empty/segments")).equals("scheme://with/empty/segments") -# env.expect.that_str(strip_empty_path_segments("scheme://with///multiple//empty/segments")).equals("scheme://with/multiple/empty/segments") -# env.expect.that_str(strip_empty_path_segments("scheme://with//trailing/slash/")).equals("scheme://with/trailing/slash/") -# env.expect.that_str(strip_empty_path_segments("scheme://with/trailing/slashes///")).equals("scheme://with/trailing/slashes/") -# -# _tests.append(_test_strip_empty_path_segments) -# +def _test_simple(env): + calls = [] + + def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): + _ = ctx # buildifier: disable=unused-variable + _ = distribution + _ = requested_versions + _ = attr + _ = cache + _ = get_auth + env.expect.that_bool(block).equals(False) + url = "{}/{}/".format(index_url, distribution) + calls.append(url) + if "foo" in url and "main" in url: + return struct( + output = "", + success = False, + ) + else: + return struct( + output = "data from {}".format(url), + success = True, + ) + + contents = simpleapi_download( + ctx = struct( + os = struct(environ = {}), + report_progress = lambda _: None, + ), + attr = struct( + index_url_overrides = {}, + index_url = "main", + extra_index_urls = ["extra"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, + envsubst = [], + facts = None, + ), + cache = {}, + parallel_download = True, + read_simpleapi = read_simpleapi, + ) + + env.expect.that_collection(calls).contains_exactly([ + "extra/foo/", + "main/bar/", + "main/baz/", + "main/foo/", + ]) + env.expect.that_dict(contents).contains_exactly({ + "bar": "data from main/bar/", + "baz": "data from main/baz/", + "foo": "data from extra/foo/", + }) + +_tests.append(_test_simple) + +def _test_fail(env): + calls = [] + fails = [] + + def read_simpleapi(ctx, index_url, distribution, attr, cache, get_auth, requested_versions, block): + _ = ctx # buildifier: disable=unused-variable + _ = distribution + _ = requested_versions + _ = attr + _ = cache + _ = get_auth + url = "{}/{}/".format(index_url, distribution) + env.expect.that_bool(block).equals(False) + calls.append(url) + if "foo" in url: + return struct( + output = "", + success = False, + ) + if "bar" in url: + return struct( + output = "", + success = False, + ) + else: + return struct( + output = "data from {}".format(url), + success = True, + ) + + simpleapi_download( + ctx = struct( + os = struct(environ = {}), + report_progress = lambda _: None, + ), + attr = struct( + index_url_overrides = { + "foo": "invalid", + }, + index_url = "main", + extra_index_urls = ["extra"], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, + envsubst = [], + facts = None, + ), + cache = {}, + parallel_download = True, + read_simpleapi = read_simpleapi, + _fail = fails.append, + ) + + env.expect.that_collection(fails).contains_exactly([ + """ +Failed to download metadata of the following packages from urls: +{ + "bar": ["main", "extra"], + "foo": "invalid", +} + +If you would like to skip downloading metadata for these packages please add 'simpleapi_skip=[ + "bar", + "foo", +]' to your 'pip.parse' call. +""", + ]) + env.expect.that_collection(calls).contains_exactly([ + "invalid/foo/", + "main/bar/", + "main/baz/", + "invalid/foo/", + "extra/bar/", + ]) + +_tests.append(_test_fail) + +def _test_download_url(env): + downloads = {} + + def download(url, output, **kwargs): + _ = kwargs # buildifier: disable=unused-variable + downloads[url[0]] = output + return struct(success = True) + + simpleapi_download( + ctx = struct( + os = struct(environ = {}), + download = download, + report_progress = lambda _: None, + read = lambda i: "contents of " + i, + path = lambda i: "path/for/" + i, + ), + attr = struct( + index_url_overrides = {}, + index_url = "https://example.com/main/simple/", + extra_index_urls = [], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, + envsubst = [], + facts = None, + ), + cache = {}, + parallel_download = False, + get_auth = lambda ctx, urls, ctx_attr: struct(), + ) + + env.expect.that_dict(downloads).contains_exactly({ + "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", + "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", + "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", + }) + +_tests.append(_test_download_url) + +def _test_download_url_parallel(env): + downloads = {} + + def download(url, output, **kwargs): + _ = kwargs # buildifier: disable=unused-variable + downloads[url[0]] = output + return struct(wait = lambda: struct(success = True)) + + simpleapi_download( + ctx = struct( + os = struct(environ = {}), + download = download, + report_progress = lambda _: None, + read = lambda i: "contents of " + i, + path = lambda i: "path/for/" + i, + ), + attr = struct( + index_url_overrides = {}, + index_url = "https://example.com/main/simple/", + extra_index_urls = [], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, + envsubst = [], + facts = None, + ), + cache = {}, + parallel_download = True, + get_auth = lambda ctx, urls, ctx_attr: struct(), + ) + + env.expect.that_dict(downloads).contains_exactly({ + "https://example.com/main/simple/bar/": "path/for/https___example_com_main_simple_bar.html", + "https://example.com/main/simple/baz/": "path/for/https___example_com_main_simple_baz.html", + "https://example.com/main/simple/foo/": "path/for/https___example_com_main_simple_foo.html", + }) + +_tests.append(_test_download_url_parallel) + +def _test_download_envsubst_url(env): + downloads = {} + + def download(url, output, **kwargs): + _ = kwargs # buildifier: disable=unused-variable + downloads[url[0]] = output + return struct(success = True) + + simpleapi_download( + ctx = struct( + os = struct(environ = {"INDEX_URL": "https://example.com/main/simple/"}), + download = download, + report_progress = lambda _: None, + read = lambda i: "contents of " + i, + path = lambda i: "path/for/" + i, + ), + attr = struct( + index_url_overrides = {}, + index_url = "$INDEX_URL", + extra_index_urls = [], + sources = {"bar": ["1.0"], "baz": ["1.0"], "foo": ["1.0"]}, + envsubst = ["INDEX_URL"], + facts = None, + ), + cache = {}, + parallel_download = False, + get_auth = lambda ctx, urls, ctx_attr: struct(), + ) + + env.expect.that_dict(downloads).contains_exactly({ + "https://example.com/main/simple/bar/": "path/for/~index_url~_bar.html", + "https://example.com/main/simple/baz/": "path/for/~index_url~_baz.html", + "https://example.com/main/simple/foo/": "path/for/~index_url~_foo.html", + }) + +_tests.append(_test_download_envsubst_url) + +def _test_strip_empty_path_segments(env): + env.expect.that_str(strip_empty_path_segments("no/scheme//is/unchanged")).equals("no/scheme//is/unchanged") + env.expect.that_str(strip_empty_path_segments("scheme://with/no/empty/segments")).equals("scheme://with/no/empty/segments") + env.expect.that_str(strip_empty_path_segments("scheme://with//empty/segments")).equals("scheme://with/empty/segments") + env.expect.that_str(strip_empty_path_segments("scheme://with///multiple//empty/segments")).equals("scheme://with/multiple/empty/segments") + env.expect.that_str(strip_empty_path_segments("scheme://with//trailing/slash/")).equals("scheme://with/trailing/slash/") + env.expect.that_str(strip_empty_path_segments("scheme://with/trailing/slashes///")).equals("scheme://with/trailing/slashes/") + +_tests.append(_test_strip_empty_path_segments) + def _expect_cache_result(env, cache, key, sdists, whls): got = env.expect.that_struct( cache.get(*key), From 4c9f7298cd2d30d313cd5aaf6742c221f03bcbfe Mon Sep 17 00:00:00 2001 From: Ignas Anikevicius <240938+aignas@users.noreply.github.com> Date: Mon, 9 Feb 2026 00:26:09 +0900 Subject: [PATCH 13/13] wip --- python/private/pypi/simpleapi_download.bzl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/private/pypi/simpleapi_download.bzl b/python/private/pypi/simpleapi_download.bzl index 633b2b64dc..b1481f8981 100644 --- a/python/private/pypi/simpleapi_download.bzl +++ b/python/private/pypi/simpleapi_download.bzl @@ -333,7 +333,11 @@ def _read_index_result(*, result, index_url, distribution, real_url, cache, requ return struct(success = False) # TODO @aignas 2026-02-08: make this the only behaviour, maybe can get rid of `real_url - output = parse_simpleapi_html(url = real_url, content = result.output, return_absolute = False) + output = parse_simpleapi_html( + url = real_url, + content = result.output, + return_absolute = False, + ) if not output: return struct(success = False) @@ -372,9 +376,10 @@ def _cache_get(cache, facts, index_url, distribution, versions): if not facts: return cache.get(index_url, distribution, versions) - cached = facts.get(index_url, distribution, versions) - if cached: - return cached + if versions: + cached = facts.get(index_url, distribution, versions) + if cached: + return cached cached = cache.get(index_url, distribution, versions) if not cached: @@ -382,14 +387,15 @@ def _cache_get(cache, facts, index_url, distribution, versions): # Ensure that we write back to the facts, this happens if we request versions that # we don't have facts for but we have in-memory cache of SimpleAPI query results - facts.setdefault(index_url, distribution, versions, cached) + if versions: + facts.setdefault(index_url, distribution, cached) return cached def _cache_setdefault(cache, facts, index_url, distribution, versions, value): filtered = cache.setdefault(index_url, distribution, versions, value) if facts and versions: - facts.setdefault(index_url, distribution, versions, filtered) + facts.setdefault(index_url, distribution, filtered) return filtered