diff --git a/model2vec/distill/distillation.py b/model2vec/distill/distillation.py index 898a199..3e1f800 100644 --- a/model2vec/distill/distillation.py +++ b/model2vec/distill/distillation.py @@ -7,6 +7,7 @@ import numpy as np from huggingface_hub.hf_api import model_info +from skeletoken import TokenizerModel from transformers import AutoModel, AutoTokenizer from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils_fast import PreTrainedTokenizerFast @@ -15,7 +16,7 @@ from model2vec.distill.utils import select_optimal_device from model2vec.model import StaticModel from model2vec.quantization import DType, quantize_embeddings -from model2vec.tokenizer import clean_and_create_vocabulary, replace_vocabulary, turn_tokens_into_ids +from model2vec.tokenizer import clean_and_create_vocabulary, turn_tokens_into_ids from model2vec.vocabulary_quantization import quantize_vocabulary logger = logging.getLogger(__name__) @@ -37,7 +38,8 @@ def distill_from_model( Distill a staticmodel from a sentence transformer. This function creates a set of embeddings from a sentence transformer. It does this by doing either - a forward pass for all subword tokens in the tokenizer, or by doing a forward pass for all tokens in a passed vocabulary. + a forward pass for all subword tokens in the tokenizer, or by doing a forward pass for all tokens in a passed + vocabulary. If you pass through a vocabulary, we create a custom word tokenizer for that vocabulary. If you don't pass a vocabulary, we use the model's tokenizer directly. @@ -51,10 +53,13 @@ def distill_from_model( If this is 'auto', we don't reduce dimensionality, but still apply PCA. :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied. Should be a value > 0 and < 1.0. A value of 1e-4 is a good default. - :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary. - If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled into a valid regex, we also throw an error. + :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to + this regex pattern will be removed from the vocabulary. + If the pattern is so general that it removes all tokens, we throw an error. If the pattern can't be compiled + into a valid regex, we also throw an error. :param quantize_to: The data type to quantize to. Can be any of the DType enum members or their string equivalents. - :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed. + :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no + quantization is performed. :param pooling: The pooling mode to use for creating embeddings. Can be one of: 'mean' (default): mean over all tokens. Robust and works well in most cases. 'last': use the last token's hidden state (often the [EOS] token). Common for decoder-style models. 
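Note (usage sketch, not part of the patch): the parameters documented in the hunk above are easiest to see in context. A rough example of a distillation call is shown below; the model name, vocabulary, and parameter values are arbitrary illustrations, and `pooling` is shown in its string form as described in the docstring.

    from transformers import AutoModel, AutoTokenizer

    from model2vec.distill import distill_from_model

    model_name = "BAAI/bge-base-en-v1.5"  # example encoder with a fast tokenizer
    model = AutoModel.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    static_model = distill_from_model(
        model=model,
        tokenizer=tokenizer,
        vocabulary=["chat gpt", "multi word token"],  # optional extra (multi-word) tokens
        device="cpu",
        pca_dims=256,
        sif_coefficient=1e-4,
        token_remove_pattern=r"\[unused\d+\]",  # e.g. drop BERT-style [unusedN] tokens
        pooling="mean",
    )
    static_model.save_pretrained("my-static-model")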
@@ -65,59 +70,43 @@ def distill_from_model( """ quantize_to = DType(quantize_to) - backend_tokenizer = tokenizer.backend_tokenizer sif_coefficient, token_remove_regex = _validate_parameters(sif_coefficient, token_remove_pattern) if vocabulary is None: vocabulary = [] device = select_optimal_device(device) + original_tokenizer_model = TokenizerModel.from_transformers_tokenizer(tokenizer) - n_tokens_before = len(vocabulary) # Clean the vocabulary by removing duplicate tokens and tokens that are in the internal vocabulary. - all_tokens, backend_tokenizer = clean_and_create_vocabulary( - tokenizer, vocabulary, token_remove_regex=token_remove_regex - ) - n_tokens_after = len([token for token in all_tokens if not token.is_internal]) - if n_tokens_before: - logger.info( - f"Adding {n_tokens_after} tokens to the vocabulary. Removed {n_tokens_before - n_tokens_after} tokens during preprocessing." - ) - + # Copy the original tokenizer model. + tokenizer_model = original_tokenizer_model._deep_copy() + if tokenizer_model.adds_prefix_space is not None: + tokenizer_model.adds_prefix_space = True + + # Create the vocabulary in the new tokenizer. + tokenizer_model = clean_and_create_vocabulary(tokenizer_model, vocabulary, token_remove_regex=token_remove_regex) + # Remove the post processor, this is not necessary. + tokenizer_model.post_processor = None + + # All tokens in a single list. + all_tokens = tokenizer_model.sorted_vocabulary if not all_tokens: raise ValueError("The vocabulary is empty after preprocessing. Please check your token_remove_pattern.") - unk_token = cast(str | None, tokenizer.special_tokens_map.get("unk_token")) - pad_token = cast(str | None, tokenizer.special_tokens_map.get("pad_token")) - - # Weird if to satsify mypy - if pad_token is None: - if unk_token is not None: - pad_token = unk_token - logger.warning( - "The pad token is not set. Setting it to the unk token. This is a workaround for models that don't have a pad token." - ) - else: - pad_token = unk_token or all_tokens[0].form - logger.warning( - "The pad token is not set. Setting it to the first token in the vocabulary. This is a workaround for models that don't have a pad token." - ) - - # Replace the vocabulary in the tokenizer with the new vocabulary. - backend_tokenizer = replace_vocabulary(backend_tokenizer, all_tokens, unk_token=unk_token, pad_token=pad_token) - logger.info(f"Creating embeddings for {len(all_tokens)} tokens") - # Convert tokens to IDs - token_ids = turn_tokens_into_ids(all_tokens, tokenizer, unk_token) - - # Create the embeddings + # Turn all _new_ tokens into ids using the original tokenizer + token_ids = turn_tokens_into_ids(all_tokens, original_tokenizer_model) + + # Create the embeddings using the ids from the original tokenizer. 
embeddings = create_embeddings( tokenized=token_ids, model=model, device=device, - pad_token_id=tokenizer.get_vocab()[pad_token], + pad_token_id=tokenizer_model.pad_token_id or 0, pooling=pooling, ) + # Maybe apply quantization if vocabulary_quantization is not None: _, weights = post_process_embeddings(np.asarray(embeddings), None, sif_coefficient=sif_coefficient) embeddings, token_mapping, weights = quantize_vocabulary( @@ -163,7 +152,7 @@ def distill_from_model( vectors=embeddings, weights=weights, token_mapping=token_mapping, - tokenizer=backend_tokenizer, + tokenizer=tokenizer_model.to_tokenizer(), config=config, base_model_name=model_name, language=language, @@ -174,13 +163,14 @@ def distill_from_model( def _validate_parameters( sif_coefficient: float | None, token_remove_pattern: str | None, -) -> tuple[float | None, re.Pattern | None]: +) -> tuple[float | None, re.Pattern[str] | None]: """ Validate the parameters passed to the distillation function. :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied. Should be a value >= 0 and < 1.0. A value of 1e-4 is a good default. - :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary. + :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to + this regex pattern will be removed from the vocabulary. :return: The SIF coefficient to use. :raises: ValueError if the regex can't be compiled. @@ -189,7 +179,7 @@ def _validate_parameters( if not 0 < sif_coefficient < 1.0: raise ValueError("SIF coefficient must be a value > 0 and < 1.0.") - token_remove_regex: re.Pattern | None = None + token_remove_regex: re.Pattern[str] | None = None if token_remove_pattern is not None: try: token_remove_regex = re.compile(token_remove_pattern) @@ -215,7 +205,8 @@ def distill( Distill a staticmodel from a sentence transformer. This function creates a set of embeddings from a sentence transformer. It does this by doing either - a forward pass for all subword tokens in the tokenizer, or by doing a forward pass for all tokens in a passed vocabulary. + a forward pass for all subword tokens in the tokenizer, or by doing a forward pass for all tokens in a passed + vocabulary. If you pass through a vocabulary, we create a custom word tokenizer for that vocabulary. If you don't pass a vocabulary, we use the model's tokenizer directly. @@ -228,10 +219,13 @@ def distill( If this is 'auto', we don't reduce dimenionality, but still apply PCA. :param sif_coefficient: The SIF coefficient to use. If this is None, no weighting is applied. Should be a value >= 0 and < 1.0. A value of 1e-4 is a good default. - :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to this regex pattern will be removed from the vocabulary. - :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming from `transformers`. If this is True, we will load all components. + :param token_remove_pattern: If this is set to a string, we compile this into a regex. Any tokens that conform to + this regex pattern will be removed from the vocabulary. + :param trust_remote_code: Whether to trust the remote code. If this is False, we will only load components coming + from `transformers`. If this is True, we will load all components. :param quantize_to: The data type to quantize to. 
Can be any of the DType enum members or their string equivalents. - :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no quantization is performed. + :param vocabulary_quantization: The number of clusters to use for vocabulary quantization. If this is None, no + quantization is performed. :param pooling: The pooling mode to use for creating embeddings. Can be one of: 'mean' (default): mean over all tokens. Robust and works well in most cases. 'last': use the last token's hidden state (often the [EOS] token). Common for decoder-style models. diff --git a/model2vec/tokenizer/__init__.py b/model2vec/tokenizer/__init__.py index 4cfe498..7ed0d8e 100644 --- a/model2vec/tokenizer/__init__.py +++ b/model2vec/tokenizer/__init__.py @@ -4,9 +4,7 @@ from model2vec.tokenizer.tokenizer import ( clean_and_create_vocabulary, - create_tokenizer, - replace_vocabulary, turn_tokens_into_ids, ) -__all__ = ["clean_and_create_vocabulary", "create_tokenizer", "turn_tokens_into_ids", "replace_vocabulary"] +__all__ = ["clean_and_create_vocabulary", "turn_tokens_into_ids"] diff --git a/model2vec/tokenizer/datamodels.py b/model2vec/tokenizer/datamodels.py deleted file mode 100644 index 6aa5ecf..0000000 --- a/model2vec/tokenizer/datamodels.py +++ /dev/null @@ -1,14 +0,0 @@ -from dataclasses import dataclass - - -@dataclass -class Token: - """A class to represent a token.""" - - form: str - # The normalized and pretokenized form of the token - normalized_form: str - # Whether the word is a continuing subword. - is_subword: bool - # Whether the token is internal to the model. - is_internal: bool diff --git a/model2vec/tokenizer/model.py b/model2vec/tokenizer/model.py deleted file mode 100644 index 12dd388..0000000 --- a/model2vec/tokenizer/model.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import annotations - -from typing import Any - -import numpy as np - - -def process_tokenizer( - tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None -) -> dict[str, Any]: - """Process the WordPiece tokenizer JSON.""" - if tokenizer_json["model"]["type"] == "Unigram": - return _process_unigram(tokenizer_json, pre_tokenized_tokens, unk_token) - tokenizer_json["model"]["type"] = "Unigram" - tokenizer_json["model"]["unk_id"] = pre_tokenized_tokens.index(unk_token) if unk_token else None - - token_weights = np.asarray([_calculate_token_weight_for_unigram(token) for token in pre_tokenized_tokens]) - proba = (token_weights / np.sum(token_weights)).tolist() - tokenizer_json["model"]["vocab"] = [(token, np.log(p)) for token, p in zip(pre_tokenized_tokens, proba)] - - return tokenizer_json - - -def _process_unigram( - tokenizer_json: dict[str, Any], pre_tokenized_tokens: list[str], unk_token: str | None -) -> dict[str, Any]: - """Process the Unigram tokenizer JSON.""" - current_probas = dict(tokenizer_json["model"]["vocab"]) - avg_proba = sum(current_probas.values()) / len(current_probas) - new_probas = [[word, current_probas.get(word, avg_proba)] for word in pre_tokenized_tokens] - tokenizer_json["model"]["vocab"] = new_probas - - tokens, _ = zip(*tokenizer_json["model"]["vocab"]) - if unk_token is not None: - tokenizer_json["model"]["unk_id"] = list(tokens).index(unk_token) - - return tokenizer_json - - -def _calculate_token_weight_for_unigram(token: str) -> float: - """Calculate the token weight for Unigram.""" - # Always prefer longer tokens. 
- return len(token) + token.count("▁") + token.count("Ġ") diff --git a/model2vec/tokenizer/normalizer.py b/model2vec/tokenizer/normalizer.py deleted file mode 100644 index 15cb11e..0000000 --- a/model2vec/tokenizer/normalizer.py +++ /dev/null @@ -1,42 +0,0 @@ -from string import punctuation - -from tokenizers import Regex, Tokenizer -from tokenizers.normalizers import Replace, Sequence, Strip - - -def replace_normalizer( - tokenizer: Tokenizer, -) -> Tokenizer: - """ - Replace the normalizer for the tokenizer. - - The new normalizer will replace punctuation with a space before and after the punctuation. - It will also replace multiple spaces with a single space and strip the right side of the string. - If the tokenizer already has a normalizer, it will be added to the new normalizer. - If the tokenizer does not have a normalizer, a new normalizer will be created. - - :param tokenizer: The tokenizer to change. - :return: The tokenizer with a replaced normalizer. - """ - spaces_punctuation = tokenizer.encode("a, ,", add_special_tokens=False).tokens - if len(spaces_punctuation) != 3: - add_space = False - else: - _, first_comma, second_comma = spaces_punctuation - add_space = first_comma == second_comma == "," - - normalizer = tokenizer.normalizer - new_normalizers = [] - for char in punctuation: - replacement = f" {char} " if add_space else f"{char} " - new_normalizers.append(Replace(char, replacement)) - - new_normalizers.append(Replace(Regex(r"\s+"), " ")) - new_normalizers.append(Strip(right=True)) - if normalizer is None: - normalizer = Sequence(new_normalizers) # type: ignore - else: - normalizer = Sequence([normalizer] + new_normalizers) # type: ignore - tokenizer.normalizer = normalizer # type: ignore - - return tokenizer diff --git a/model2vec/tokenizer/pretokenizer.py b/model2vec/tokenizer/pretokenizer.py deleted file mode 100644 index ac7ee42..0000000 --- a/model2vec/tokenizer/pretokenizer.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import annotations - -import json -from typing import Any - -from tokenizers import Tokenizer - -_FORBIDDEN_PRETOKENIZERS = ( - "WhiteSpace", - "WhitespaceSplit", - "BertPreTokenizer", - "CharDelimiterSplit", - "Punctuation", - "Split", - "UnicodeScripts", -) -_BASIC_METASPACE = {"type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": False} - - -def _fix_single_pretokenizer(pre_tokenizer: dict[str, Any]) -> dict[str, Any] | None: - """Fixes a single pretokenizer to allow multiword units.""" - if pre_tokenizer["type"] in _FORBIDDEN_PRETOKENIZERS: - return None - if pre_tokenizer["type"] == "ByteLevel": - pre_tokenizer["add_prefix_space"] = True - pre_tokenizer["use_regex"] = False - if pre_tokenizer["type"] == "Metaspace": - pre_tokenizer["split"] = False - pre_tokenizer["prepend_scheme"] = "always" - - return pre_tokenizer - - -def replace_pretokenizer(tokenizer: Tokenizer) -> Tokenizer: - """Fixes a single pretokenizer to allow multiword units.""" - tokenizer_json = json.loads(tokenizer.to_str()) - pre_tokenizer_json = tokenizer_json.get("pre_tokenizer", None) - - if pre_tokenizer_json is None: - pre_tokenizer_json = _BASIC_METASPACE - - elif pre_tokenizer_json["type"] == "Sequence": - new_pretokenizers = [] - for single_pretokenizer in pre_tokenizer_json["pretokenizers"]: - new_pretokenizer = _fix_single_pretokenizer(single_pretokenizer) - if new_pretokenizer is not None: - new_pretokenizers.append(new_pretokenizer) - - if new_pretokenizers: - pre_tokenizer_json["pretokenizers"] = new_pretokenizers - else: - 
pre_tokenizer_json = _BASIC_METASPACE - - pre_tokenizer_json = _fix_single_pretokenizer(pre_tokenizer_json) or _BASIC_METASPACE - tokenizer_json["pre_tokenizer"] = pre_tokenizer_json - - return tokenizer.from_str(json.dumps(tokenizer_json)) diff --git a/model2vec/tokenizer/tokenizer.py b/model2vec/tokenizer/tokenizer.py index 24d082a..fbcb804 100644 --- a/model2vec/tokenizer/tokenizer.py +++ b/model2vec/tokenizer/tokenizer.py @@ -1,398 +1,98 @@ from __future__ import annotations -import json import logging import re -from typing import Any, cast -from tokenizers import Tokenizer -from tokenizers.normalizers import Normalizer -from tokenizers.pre_tokenizers import ( - PreTokenizer, -) -from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - -from model2vec.tokenizer.datamodels import Token -from model2vec.tokenizer.model import process_tokenizer -from model2vec.tokenizer.normalizer import replace_normalizer -from model2vec.tokenizer.pretokenizer import replace_pretokenizer +from skeletoken import TokenizerModel logger = logging.getLogger(__name__) -_DEFAULT_POST_PROCESSOR_TEMPLATE = { - "type": "TemplateProcessing", - "single": [{"Sequence": {"id": "A", "type_id": 0}}], - "pair": [{"Sequence": {"id": "A", "type_id": 0}}, {"Sequence": {"id": "B", "type_id": 0}}], - "special_tokens": {}, -} - - -def _remap_added_tokens( - special_tokens: list[dict[str, Any]], - vocabulary: list[str], -) -> list[dict[str, Any]]: - """ - Remap special tokens in the tokenizer. - - This function updates the special tokens in the tokenizer based on a mapping provided. - It also ensures that the special tokens are present in the vocabulary. - - :param special_tokens: The special tokens to remap. - :param vocabulary: The vocabulary as a list of tokens. - :return: The updated special tokens. - """ - # Deepcopy - special_tokens = [{**x} for x in special_tokens] - for token in special_tokens: - token["id"] = vocabulary.index(token["content"]) - - return special_tokens - - -def replace_vocabulary( - tokenizer: Tokenizer, new_vocabulary: list[Token], unk_token: str | None, pad_token: str | None -) -> Tokenizer: - """Replace the vocabulary of a tokenizer with a new one.""" - tokenizer_json: dict[str, Any] = json.loads(tokenizer.to_str()) - added_tokens: list[dict[str, Any]] = tokenizer_json["added_tokens"] - - pre_tokenized_tokens = [x.normalized_form for x in new_vocabulary] - - # We need to remove the added tokens but keep [UNK] and [PAD] tokens. 
- added_tokens = _rename_added_token(unk_token, "[UNK]", added_tokens, pre_tokenized_tokens) - added_tokens = _rename_added_token(pad_token, "[PAD]", added_tokens, pre_tokenized_tokens) - - # Remove old added tokens from added tokens - tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}] - tokenizer_json = process_tokenizer( - tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None - ) - - # Remap special tokens - tokenizer_json["added_tokens"] = _remap_added_tokens( - special_tokens=tokenizer_json["added_tokens"], - vocabulary=pre_tokenized_tokens, - ) - tokenizer_json["post_processor"] = _DEFAULT_POST_PROCESSOR_TEMPLATE - - return Tokenizer.from_str(json.dumps(tokenizer_json)) - - -def _rename_added_token( - form: str | None, new_form: str, added_tokens: list[dict[str, Any]], vocabulary: list[str] -) -> list[dict[str, Any]]: - """Rename added tokens in the tokenizer.""" - if form is None: - return added_tokens - - idx = vocabulary.index(form) - added_token = [x for x in added_tokens if x["content"] == form] - if added_token: - added_token[0]["id"] = idx - added_token[0]["content"] = new_form - vocabulary[idx] = new_form - - return added_tokens - - def clean_and_create_vocabulary( - tokenizer: PreTrainedTokenizerFast, - vocabulary: list[str], - token_remove_regex: re.Pattern | None, -) -> tuple[list[Token], Tokenizer]: + model: TokenizerModel, + vocabulary_to_add: list[str], + token_remove_regex: re.Pattern[str] | None, +) -> TokenizerModel: """Cleans a vocabulary by removing duplicates and tokens that were already in the vocabulary.""" seen_tokens = set() - post_normalize_seen_tokens = set() - n_empty = 0 - n_duplicates = 0 - - backend_tokenizer = tokenizer.backend_tokenizer - - # Make a base list of tokens. - internal_vocab: dict[str, int] = tokenizer.get_vocab() - internal_tokens: list[str] = [k for k, _ in sorted(internal_vocab.items(), key=lambda x: x[1])] - cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex) - # Copy the backend tokenizer to avoid modifying the original. 
-    backend_tokenizer = backend_tokenizer.from_str(backend_tokenizer.to_str())
-    backend_tokenizer = replace_normalizer(backend_tokenizer)
-
-    internal_tokens_set = {token.form for token in cleaned_vocabulary}
-
-    normalizer: Normalizer | None = backend_tokenizer.normalizer
-    for token in vocabulary:
-        if normalizer is not None:
-            token = cast(str, normalizer.normalize_str(token))
-
-        if not token:
+    n_duplicate = 0
+    n_empty = 0
+    n_regex_removed = 0
+
+    internal_tokens: list[str] = model.sorted_vocabulary
+    if token_remove_regex:
+        len_before = len(internal_tokens)
+        tokens_to_remove = [token for token in internal_tokens if token_remove_regex.match(token)]
+        model = model.remove_tokens_from_vocabulary(tokens_to_remove)
+        n_regex_removed = len_before - len(model.sorted_vocabulary)
+    preprocessor = model.preprocessor
+
+    seen_tokens = set(internal_tokens)
+    tokens_to_add: list[str] = []
+    added_tokens_to_add: list[str] = []
+    for token in vocabulary_to_add:
+        preprocessed = preprocessor.preprocess(token)
+        if len(preprocessed) < 1:
+            logger.warning(f"Token '{token}' was empty after preprocessing.")
             n_empty += 1
             continue
-
-        pre_tokenizer: PreTokenizer | None = backend_tokenizer.pre_tokenizer
-        normalized_token = token
-        if pre_tokenizer is not None:
-            normalized_token = _normalize_vocabulary_token(
-                token=token,
-                pre_tokenizer=pre_tokenizer,
-            )
-
-        # We need to check whether the pretokenized token is in the vocabulary.
-        # But we need to return the original token, because that will be tokenized
-        # again by the tokenizer during featurization.
-        if normalized_token in seen_tokens or normalized_token in internal_tokens_set:
-            n_duplicates += 1
+        if len(preprocessed) > 1:
+            tokens_as_str = [f"'{subword}'" for subword in preprocessed]
+            split_into = ",".join(tokens_as_str)
+            logger.warning(f"Token '{token}' was split into multiple tokens after preprocessing: [{split_into}]")
+            added_tokens_to_add.append(token)
             continue
-
-        # Add the possibly pretokenized token to seen
-        seen_tokens.add(normalized_token)
-
-        # After checking the token exists, we need to normalize it into the token
-        # it will become. For byte tokens, this means we don't do anything. For
-        # other types of tokens, we will insert a metaspace.
-        # In the case of multiword tokens, we replace any spaces with the metaspace
-        # or byte prefix token.
-        if not normalized_token.startswith(("▁", "Ġ")):
-            normalized_token = normalized_token.replace(" ", "▁")
-            normalized_token = f"▁{normalized_token}"
-        else:
-            normalized_token = normalized_token.replace(" ", normalized_token[0])
-
-        if normalized_token in post_normalize_seen_tokens:
-            n_duplicates += 1
+        token = preprocessed[0]
+        if token in seen_tokens:
+            logger.warning(f"Token '{token}' was already in the vocabulary.")
+            n_duplicate += 1
             continue
+        if token_remove_regex and token_remove_regex.match(token):
+            logger.warning(f"Token '{token}' was removed due to regex match.")
+            n_regex_removed += 1
+            continue
+        seen_tokens.add(token)
+        tokens_to_add.append(token)

-        post_normalize_seen_tokens.add(normalized_token)
-        # Add the original string to the vocabulary.
- cleaned_vocabulary.append( - Token(form=token, normalized_form=normalized_token, is_subword=False, is_internal=False) - ) - - if n_duplicates: - logger.warning(f"Removed {n_duplicates} duplicate tokens.") - if n_empty: - logger.warning(f"Removed {n_empty} empty tokens.") - - return cleaned_vocabulary, replace_pretokenizer(backend_tokenizer) - - -def _process_internal_tokens( - tokenizer: PreTrainedTokenizerFast, - backend_tokenizer: Tokenizer, - internal_tokens: list[str], - token_remove_regex: re.Pattern | None, -) -> list[Token]: - """Clean internal tokens.""" - # Get the pad and unk token from the tokenizer. - pad_token: str | None = tokenizer.special_tokens_map.get("pad_token") # type: ignore[assignment] - unk_token: str | None = tokenizer.special_tokens_map.get("unk_token") # type: ignore[assignment] - # Empty set if no pad or unk token is set. - added_tokens_to_keep: set[str] = {x for x in (pad_token, unk_token) if x is not None} - added_tokens_to_remove = set(tokenizer.added_tokens_encoder) - added_tokens_to_keep - cleaned_internal_tokens: list[Token] = [] - - # Figure out whether token is a subword or not. - encoded = backend_tokenizer.encode(f" {'a' * 25}", add_special_tokens=False) - first_token, second_token, *_ = encoded.tokens - # Isolate the prefix. We can't do first_token[0] because we don't know - # how long the prefix is. - # e.g., "Ġaaaa" -> "Ġ" - a_index = None if "a" not in first_token else first_token.index("a") - word_prefix = first_token[:a_index] - is_byte_prefix = word_prefix == "Ġ" - second_token = encoded.tokens[1] - # The second token is the first subword token. - # If a tokenizer uses subwords, this token will have been prefixed. - # We don't know how long the prefix is. - a_index = None if "a" not in second_token else second_token.index("a") - subword_prefix = second_token[:a_index] - - pre_tokenizer: PreTokenizer | None = backend_tokenizer.pre_tokenizer - - for token in internal_tokens: - # Create the token objects. If this returns None, it was unsucessful for some reason. - if token_object := _create_single_internal_token( - token=token, - subword_prefix=subword_prefix, - word_prefix=word_prefix, - pre_tokenizer=pre_tokenizer, - is_byte_prefix=is_byte_prefix, - token_remove_regex=token_remove_regex, - added_tokens_to_keep=added_tokens_to_keep, - added_tokens_to_remove=added_tokens_to_remove, - ): - cleaned_internal_tokens.append(token_object) - - if len(cleaned_internal_tokens) != len(internal_tokens): - logger.info( - f"Removed {len(internal_tokens) - len(cleaned_internal_tokens)} internal tokens from the vocabulary." - ) - - return cleaned_internal_tokens - - -def _create_single_internal_token( - token: str, - subword_prefix: str, - word_prefix: str, - pre_tokenizer: PreTokenizer | None, - is_byte_prefix: bool, - token_remove_regex: re.Pattern | None, - added_tokens_to_keep: set[str], - added_tokens_to_remove: set[str], -) -> Token | None: - """Create a token object from a string.""" - if token in added_tokens_to_remove: - # We remove any tokens that are added tokens that aren't [UNK] or [PAD]. - return None - if token in added_tokens_to_keep: - # Don't put added tokens through the regular motions. - return Token(form=token, normalized_form=token, is_subword=False, is_internal=True) - if token_remove_regex and token_remove_regex.match(token): - # If the regex matches, remove the token. 
- return None - - # A token is a subword if there is a subword prefix and the word - # starts with a subword prefix, or if there is a WORD prefix, and the word - # does not start with this prefix. For metaspace tokenizers, for example: - # "doghouse" -> ["_dog", "house"] - # So we can only tell that "house" is a subword by knowing that it is not prefixed - # and word-initial tokens are. - is_subword = False - if subword_prefix: - is_subword = bool(token.startswith(subword_prefix)) - if word_prefix: - is_subword = not bool(token.startswith(word_prefix)) - - # Byte prefixed tokenizers don't need to be checked. - if pre_tokenizer is not None and not is_byte_prefix: - # We need to check the thing without prefixes. If we have a word prefix, - # we need to check tokens that have are subwords. Other way around for subword - # prefixes. - if (subword_prefix and not is_subword) or (word_prefix and is_subword): - # If this is True, the token is unreachable, even though it is a subword token. - if len(pre_tokenizer.pre_tokenize_str(token)) > 1: - return None + model = model.add_tokens_to_vocabulary(tokens_to_add, preprocess_tokens=True) + model = model.add_addedtokens(added_tokens_to_add, is_special=False, single_word=False, normalized=True) - # Turn a token into a normalized form for later processing. - normalized_form = _create_normalized_form(token, subword_prefix, word_prefix, is_byte_prefix, is_subword) + n_multiword = len(added_tokens_to_add) + _report_statistics(n_multiword, n_duplicate, n_regex_removed, n_empty) - return Token(form=token, normalized_form=normalized_form, is_subword=is_subword, is_internal=True) + return model -def _create_normalized_form( - token: str, subword_prefix: str, word_prefix: str, is_byte_prefix: bool, is_subword: bool -) -> str: - """Turn an internal token string into a normalized form.""" - # We don't need to check byte prefixed strings. - if is_byte_prefix: - return token - # We need to check if the token is a subword or not and remove the prefix. - if is_subword: - return token.removeprefix(subword_prefix) - # If the token is not a subword, we need to remove the word prefix, and add metaspace. - return f"▁{token.removeprefix(word_prefix)}" +def _report_statistics(n_multiword: int, n_duplicate: int, n_regex_removed: int, n_empty: int) -> None: + """Helper function to avoid increasing complexity in main function.""" + if n_multiword: + logger.info(f"Added {n_multiword} multi-word tokens to the vocabulary.") + if n_duplicate: + logger.info(f"Removed {n_duplicate} duplicate tokens.") + if n_regex_removed: + logger.info(f"Removed {n_regex_removed} tokens due to regex match.") + if n_empty: + logger.info(f"Removed {n_empty} empty tokens.") -def turn_tokens_into_ids( - tokens: list[Token], tokenizer: PreTrainedTokenizerFast, unk_token: str | None -) -> list[list[int]]: +def turn_tokens_into_ids(tokens: list[str], model: TokenizerModel) -> list[list[int]]: """ Convert a list of Token objects to their corresponding token ID sequences. :param tokens: List of Token objects to convert - :param tokenizer: The tokenizer to use for converting tokens to IDs - :param unk_token: The string form of the unk token. + :param model: The tokenizermodel of the tokenizer. 
:return: List of token IDs corresponding to the input tokens """ - unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token) - prefix, suffix = find_eos_bos(tokenizer) + prefix, suffix = model.bos_ids or [], model.eos_ids or [] + vocabulary = model.vocabulary + tokenizer = model.to_tokenizer() token_ids: list[list[int]] = [] for token in tokens: - if token.is_internal: - # Careful. Any incorrect tokens will just get `[UNK]``, so this could go horribly wrong - # Cast because return type is wrong. - token_id: int = cast(int, tokenizer.convert_tokens_to_ids(token.form)) or 0 - # Explicitly check and warn if `unk_id` appears, but don't crash. - if unk_id is not None and token_id == unk_id and token.form != unk_token: - logger.warning(f"Token {token.form} was set to unk. This is wrong.") + if token_id := vocabulary.get(token): token_ids.append([*prefix, token_id, *suffix]) else: - token_ids.append(tokenizer.encode(token.form)) + token_ids.append(tokenizer.encode(token).ids) return token_ids - - -def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[int], list[int]]: - """Finds the eos and bos tokens for a tokenizer.""" - # Little bit complicated, because not all tokenizers have eos and bos tokens. - encoding = tokenizer.encode("a", add_special_tokens=True) - if len(encoding) != 3: - a_encoded = tokenizer.encode("a", add_special_tokens=False) - if len(a_encoded) != 1: - raise ValueError( - f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'" - ) - a_idx = encoding.index(a_encoded[0]) - prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :] - else: - prefix, suffix = encoding[:1], encoding[2:] - return prefix, suffix - - -def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str: - """Normalize a token that is not in the initial token vocabulary.""" - # Add prefix space for byte tokenizers. - prefixed_token = f" {token}" - pretokenized_tokens: tuple[str, ...] - pretokenized_tokens, offsets = zip(*pre_tokenizer.pre_tokenize_str(prefixed_token)) - # The first item is always the start of the token. - new_token = [pretokenized_tokens[0]] - # Loop over the subtokens and offsets. - for t, (s, _) in zip(pretokenized_tokens[1:], offsets[1:]): - # Do not prefix the token with a space if it starts with a metaspace. - if t.startswith("▁"): - new_token.append(t) - # If the character before the subtoken is a space, we have a - # multiword token. e.g., "room for the moon", which is split into - # ["room", "for", "the", "moon"]. - # If it doesn't have a space, it is part of a complex multiword token, - # e.g., "chat-gpt", which is split into ["chat", "-", "gpt"]. - elif prefixed_token[s - 1] == " ": - new_token.append(f" {t}") - else: - new_token.append(t) - normalized_token = "".join(new_token) - - return normalized_token - - -def create_tokenizer( - tokenizer: PreTrainedTokenizerFast, - vocabulary: list[str], - token_remove_regex: re.Pattern | None = None, -) -> PreTrainedTokenizerFast: - """ - Create a tokenizer by adding tokens to the vocabulary. - - This function turns any tokenizer into a supertoken tokenizer. It does the following: - 1. Turns the tokenizer model into a unigram model. - 2. Adds a new pretokenizer, splitting on punctuation. - 3. Adds all tokens in vocabulary to the model. - 4. Removes any internal tokens that conform to the regex. - - :param tokenizer: The tokenizer to use. - :param vocabulary: The vocabulary to use. 
- :param token_remove_regex: The regex to use to remove tokens from the vocabulary. - :return: The created tokenizer. - """ - unk_token = cast(str | None, tokenizer.special_tokens_map.get("unk_token")) - pad_token = cast(str | None, tokenizer.special_tokens_map.get("pad_token")) - cleaned_vocabulary, backend_tokenizer = clean_and_create_vocabulary(tokenizer, vocabulary, token_remove_regex) - new_tokenizer = replace_vocabulary(backend_tokenizer, cleaned_vocabulary, unk_token, pad_token) - - tokenizer_object = PreTrainedTokenizerFast(tokenizer_object=new_tokenizer) - tokenizer_object.add_special_tokens({"pad_token": "[PAD]", "unk_token": "[UNK]"}) - - return tokenizer_object diff --git a/pyproject.toml b/pyproject.toml index 812638e..c17f554 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ dev = [ "ruff", ] -distill = ["torch", "transformers", "scikit-learn"] +distill = ["torch", "transformers", "scikit-learn", "skeletoken>=0.3.0"] onnx = ["onnx", "torch"] # train also installs inference train = ["torch", "lightning", "scikit-learn", "skops"] diff --git a/tests/conftest.py b/tests/conftest.py index dae9b1c..676392a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ import numpy as np import pytest import torch +from skeletoken import TokenizerModel from tokenizers import Tokenizer from tokenizers.models import BPE, Unigram, WordPiece from tokenizers.pre_tokenizers import Whitespace @@ -54,6 +55,12 @@ def mock_berttokenizer() -> PreTrainedTokenizerFast: return cast(PreTrainedTokenizerFast, AutoTokenizer.from_pretrained("tests/data/test_tokenizer")) +@pytest.fixture(scope="function") +def mock_tokenizermodel() -> TokenizerModel: + """Loads the tokenizer as a TokenizerModel.""" + return TokenizerModel.from_pretrained("tests/data/test_tokenizer") + + @pytest.fixture def mock_transformer() -> PreTrainedModel: """Create a mock transformer model.""" diff --git a/tests/test_distillation.py b/tests/test_distillation.py index 3c50ae6..4a9fc30 100644 --- a/tests/test_distillation.py +++ b/tests/test_distillation.py @@ -8,6 +8,8 @@ import numpy as np import pytest from pytest import LogCaptureFixture +from skeletoken import TokenizerModel +from transformers import BertTokenizerFast from transformers.modeling_utils import PreTrainedModel from transformers.tokenization_utils_fast import PreTrainedTokenizerFast @@ -78,6 +80,28 @@ def test_distill_from_model( assert static_model.base_model_name == static_model2.base_model_name +@patch.object(import_module("model2vec.distill.distillation"), "model_info") +@patch("transformers.AutoModel.from_pretrained") +def test_distill_removal_pattern_all_tokens( + mock_auto_model: MagicMock, + mock_model_info: MagicMock, + mock_berttokenizer: BertTokenizerFast, + mock_transformer: PreTrainedModel, +) -> None: + """Test the removal pattern.""" + mock_model_info.return_value = type("ModelInfo", (object,), {"cardData": {"language": "en"}}) + mock_auto_model.return_value = mock_transformer + + with pytest.raises(ValueError): + distill_from_model( + model=mock_transformer, + tokenizer=mock_berttokenizer, + vocabulary=None, + device="cpu", + token_remove_pattern=r".*", + ) + + @patch.object(import_module("model2vec.distill.distillation"), "model_info") @patch("transformers.AutoModel.from_pretrained") def test_distill_removal_pattern( @@ -90,8 +114,7 @@ def test_distill_removal_pattern( mock_model_info.return_value = type("ModelInfo", (object,), {"cardData": {"language": "en"}}) mock_auto_model.return_value = mock_transformer - # The vocab 
size is 30522, but we remove 998 tokens: [CLS], [SEP], and [MASK], and all [unused] tokens. - expected_vocab_size = mock_berttokenizer.vocab_size - 998 + expected_vocab_size = mock_berttokenizer.vocab_size static_model = distill_from_model( model=mock_transformer, @@ -112,6 +135,16 @@ def test_distill_removal_pattern( ) assert len(static_model.embedding) == expected_vocab_size + # Test whether regexes remove words from the vocabulary + static_model = distill_from_model( + model=mock_transformer, + tokenizer=mock_berttokenizer, + vocabulary=["hellooooooo"], + device="cpu", + token_remove_pattern="hellooooooo", + ) + assert "hellooooooo" not in static_model.tokens + # Weird pattern. with pytest.raises(ValueError): _ = distill_from_model( @@ -126,14 +159,14 @@ def test_distill_removal_pattern( @pytest.mark.parametrize( "vocabulary, pca_dims, sif_coefficient, expected_shape", [ - (None, 256, None, (29524, 256)), # PCA applied, SIF off - (None, "auto", None, (29524, 768)), # PCA 'auto', SIF off - (None, "auto", 1e-4, (29524, 768)), # PCA 'auto', SIF on + (None, 256, None, (30522, 256)), # PCA applied, SIF off + (None, "auto", None, (30522, 768)), # PCA 'auto', SIF off + (None, "auto", 1e-4, (30522, 768)), # PCA 'auto', SIF on (None, "auto", 0, None), # invalid SIF (too low) -> raises (None, "auto", 1, None), # invalid SIF (too high) -> raises - (None, 1024, None, (29524, 768)), # PCA set high (no reduction) - (["wordA", "wordB"], 4, None, (29526, 4)), # Custom vocab, PCA applied - (None, None, None, (29524, 768)), # No PCA, SIF off + (None, 1024, None, (30522, 768)), # PCA set high (no reduction) + (["wordA", "wordB"], 4, None, (30524, 4)), # Custom vocab, PCA applied + (None, None, None, (30522, 768)), # No PCA, SIF off ], ) @patch.object(import_module("model2vec.distill.distillation"), "model_info") @@ -161,6 +194,7 @@ def test_distill( device="cpu", pca_dims=pca_dims, sif_coefficient=sif_coefficient, + token_remove_pattern=None, ) else: static_model = distill( @@ -169,6 +203,7 @@ def test_distill( device="cpu", pca_dims=pca_dims, sif_coefficient=sif_coefficient, + token_remove_pattern=None, ) assert isinstance(static_model, StaticModel) assert static_model.embedding.shape == expected_shape @@ -231,15 +266,16 @@ def test__post_process_embeddings( "added_tokens, expected_output, expected_warnings", [ # Case: duplicates ("2010", "government") and an empty token ("") - (["2010", "government", "nerv", ""], ["nerv"], ["Removed", "duplicate", "empty"]), + (["2010", "government", "nerv", ""], ["nerv"], ["already", "empty"]), # Case: No duplicates, no empty tokens (["worda", "wordb", "wordc"], ["worda", "wordb", "wordc"], []), # Case: Only empty token (""), should return an empty list - ([""], [], ["Removed", "empty"]), + ([""], [], ["empty"]), + (["multi word token"], ["multi word token"], []), ], ) def test_clean_and_create_vocabulary( - mock_berttokenizer: PreTrainedTokenizerFast, + mock_tokenizermodel: TokenizerModel, added_tokens: list[str], expected_output: list[str], expected_warnings: list[str], @@ -247,11 +283,12 @@ def test_clean_and_create_vocabulary( ) -> None: """Test the clean_and_create_vocabulary helper.""" with caplog.at_level("WARNING"): - tokens, _ = clean_and_create_vocabulary(mock_berttokenizer, added_tokens, None) + old_tokens = mock_tokenizermodel.sorted_vocabulary + tokenizer_model = clean_and_create_vocabulary(mock_tokenizermodel, added_tokens, None) + tokens = set(tokenizer_model.sorted_vocabulary) - set(old_tokens) - cleaned_vocab = [token.form for token in tokens if not 
token.is_internal] # Check the cleaned vocabulary matches the expected output - assert cleaned_vocab == expected_output + assert tokens == set(expected_output) # Check the warnings were logged as expected logged_warnings = [record.message for record in caplog.records] @@ -268,9 +305,11 @@ def test_clean_and_create_vocabulary( (PoolingMode.POOLER, True, [7.0, 7.0]), # pooler_output used ], ) -def test_pooling_strategies(mock_transformer, pooling, with_pooler, expected_rows) -> None: +def test_pooling_strategies( + mock_transformer: PreTrainedModel, pooling: PoolingMode, with_pooler: bool, expected_rows: tuple[float, float] +) -> None: """Test different pooling strategies.""" - mock_transformer.with_pooler = with_pooler + mock_transformer.with_pooler = with_pooler # type: ignore tokenized = [[10, 11, 12], [20]] out = create_embeddings( model=mock_transformer, @@ -284,9 +323,9 @@ def test_pooling_strategies(mock_transformer, pooling, with_pooler, expected_row assert np.allclose(out, expected, rtol=1e-6, atol=0.0) -def test_pooler_raises_without_pooler_output(mock_transformer) -> None: +def test_pooler_raises_without_pooler_output(mock_transformer: PreTrainedModel) -> None: """POOLER should raise when the model doesn't expose pooler_output.""" - mock_transformer.with_pooler = False + mock_transformer.with_pooler = False # type: ignore tokenized = [[10, 11, 12], [20]] with pytest.raises(ValueError, match="pooler_output"): _ = create_embeddings( diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py deleted file mode 100644 index e1267df..0000000 --- a/tests/test_tokenizer.py +++ /dev/null @@ -1,123 +0,0 @@ -import json - -import pytest -from transformers.tokenization_utils_fast import PreTrainedTokenizerFast - -from model2vec.tokenizer.model import _calculate_token_weight_for_unigram, _process_unigram, process_tokenizer -from model2vec.tokenizer.normalizer import replace_normalizer -from model2vec.tokenizer.pretokenizer import _FORBIDDEN_PRETOKENIZERS, _fix_single_pretokenizer, replace_pretokenizer -from model2vec.tokenizer.tokenizer import _rename_added_token, create_tokenizer - - -def test_fix_single_pretokenizer() -> None: - """Test the _fix_single_pretokenizer function.""" - result = _fix_single_pretokenizer({"type": "ByteLevel", "add_prefix_space": False, "use_regex": True}) - assert result == {"type": "ByteLevel", "add_prefix_space": True, "use_regex": False} - - for tokenizer_type in _FORBIDDEN_PRETOKENIZERS: - result = _fix_single_pretokenizer({"type": tokenizer_type}) - assert result is None - - result = _fix_single_pretokenizer( - {"type": "Metaspace", "split": True, "prepend_scheme": "never", "replacement": "▁"} - ) - assert result == {"type": "Metaspace", "replacement": "▁", "prepend_scheme": "always", "split": False} - - -def test_replace_pretokenizer(mock_berttokenizer: PreTrainedTokenizerFast) -> None: - """Test the replace_pretokenizer function.""" - tokenizer = replace_pretokenizer(mock_berttokenizer.backend_tokenizer) - assert tokenizer.pre_tokenizer is not None - assert tokenizer.pre_tokenizer.__class__.__name__ == "Metaspace" - assert tokenizer.pre_tokenizer.replacement == "▁" - assert tokenizer.pre_tokenizer.prepend_scheme == "always" - assert not tokenizer.pre_tokenizer.split - - tokenizer.pre_tokenizer = None # type: ignore - tokenizer = replace_pretokenizer(tokenizer) - assert tokenizer.pre_tokenizer is not None - assert tokenizer.pre_tokenizer.__class__.__name__ == "Metaspace" - assert tokenizer.pre_tokenizer.replacement == "▁" - assert 
tokenizer.pre_tokenizer.prepend_scheme == "always" - assert tokenizer.pre_tokenizer.split is False - - -def test_replace_normalizer(mock_berttokenizer: PreTrainedTokenizerFast) -> None: - """Test the replace_normalizer function.""" - tokenizer = replace_normalizer(mock_berttokenizer.backend_tokenizer) - assert tokenizer.normalizer is not None - assert tokenizer.normalizer.__class__.__name__ == "Sequence" - - assert tokenizer.normalizer.normalize_str("Hello, World!") == "hello , world !" - - tokenizer.normalizer = None # type: ignore - tokenizer = replace_normalizer(tokenizer) - assert tokenizer.normalizer.normalize_str("Hello, World!") == "Hello , World !" - - -@pytest.mark.parametrize( - "word,weight", - [ - ("dog", 3), - ("cat", 3), - ("▁longer▁word", 14), - ("▁word", 6), - ("▁", 2), # Single underscore - ("", 0), # Empty string - ("▁a" * 100, 300), # Long word with underscores - ], -) -def test_calculate_token_weight_for_unigram(word: str, weight: int) -> None: - """Test the _calculate_token_weight_for_unigram function.""" - assert _calculate_token_weight_for_unigram(word) == weight - - -def test_process_tokenizer(mock_berttokenizer: PreTrainedTokenizerFast) -> None: - """Test the process_tokenizer function.""" - vocab = ["dog", "cat", "longer_word", "word", "a" * 100, "[UNK]"] - tokenizer_json = json.loads(mock_berttokenizer.backend_tokenizer.to_str()) - tokenizer_json = process_tokenizer(tokenizer_json=tokenizer_json, pre_tokenized_tokens=vocab, unk_token="[UNK]") - - assert tokenizer_json["model"]["type"] == "Unigram" - assert tokenizer_json["model"]["unk_id"] == 5 # Index of "[UNK]" - assert len(tokenizer_json["model"]["vocab"]) == 6 - assert all(isinstance(token, tuple) and len(token) == 2 for token in tokenizer_json["model"]["vocab"]) - for (x, _), y in zip(tokenizer_json["model"]["vocab"], vocab): - assert x == y, f"Expected {y}, but got {x}" - - -def test_process_unigram() -> None: - """Test the _process_unigram function.""" - vocab = ["dog", "cat", "longer_word", "word", "a" * 100, "[UNK]"] - orig_vocab = [("dog", 0), ("cat", 0)] - model = {"model": {"type": "Unigram", "vocab": orig_vocab}} - processed_model = _process_unigram(model, vocab, "[UNK]") - assert processed_model["model"]["type"] == "Unigram" - assert processed_model["model"]["unk_id"] == 5 # Index of "[UNK]" - assert len(processed_model["model"]["vocab"]) == 6 - assert all(isinstance(token, list) and len(token) == 2 for token in processed_model["model"]["vocab"]) - - for (x, score), y in zip(processed_model["model"]["vocab"], vocab): - assert x == y, f"Expected {y}, but got {x}" - if x in orig_vocab: - assert score == 0 - - assert process_tokenizer(model, vocab, "[UNK]") == processed_model - - -def test_rename_added_token() -> None: - """Test the _rename_added_token function.""" - # Invalid input - result = _rename_added_token(None, "a", [{"content": "a", "id": 0}], ["a"]) - assert result == [{"content": "a", "id": 0}] - - # Rename 'a' to 'c' - result = _rename_added_token("a", "c", [{"content": "a"}], ["a"]) - assert result == [{"content": "c", "id": 0}] - - -def test_create_tokenizer(mock_berttokenizer: PreTrainedTokenizerFast) -> None: - """Test the create_tokenizer function.""" - tokenizer = create_tokenizer(tokenizer=mock_berttokenizer, vocabulary=["dog", "catssssss"], token_remove_regex=None) - assert tokenizer.backend_tokenizer.get_vocab_size() == 29525 - assert tokenizer.encode("catssssss") == [29524] diff --git a/tests/test_utils.py b/tests/test_utils.py index 7ae331c..e5f6187 100644 --- 
a/tests/test_utils.py +++ b/tests/test_utils.py @@ -76,7 +76,7 @@ def test_importable() -> None: def test_get_package_extras() -> None: """Test package extras.""" extras = set(get_package_extras("model2vec", "distill")) - assert extras == {"torch", "transformers", "scikit-learn"} + assert extras == {"skeletoken", "torch", "transformers", "scikit-learn"} def test_get_package_extras_empty() -> None: diff --git a/uv.lock b/uv.lock index 231ed24..8c0668f 100644 --- a/uv.lock +++ b/uv.lock @@ -116,6 +116,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/76/641ae371508676492379f16e2fa48f4e2c11741bd63c48be4b12a6b09cba/aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e", size = 7490, upload-time = "2025-07-03T22:54:42.156Z" }, ] +[[package]] +name = "annotated-types" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, +] + [[package]] name = "asttokens" version = "3.0.0" @@ -879,6 +888,7 @@ dev = [ ] distill = [ { name = "scikit-learn" }, + { name = "skeletoken" }, { name = "torch" }, { name = "transformers" }, ] @@ -924,6 +934,7 @@ requires-dist = [ { name = "scikit-learn", marker = "extra == 'quantization'" }, { name = "scikit-learn", marker = "extra == 'train'" }, { name = "setuptools" }, + { name = "skeletoken", marker = "extra == 'distill'", specifier = ">=0.3.0" }, { name = "skops", marker = "extra == 'inference'" }, { name = "skops", marker = "extra == 'train'" }, { name = "tokenizers", specifier = ">=0.20" }, @@ -1674,6 +1685,139 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, ] +[[package]] +name = "pydantic" +version = "2.12.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "annotated-types" }, + { name = "pydantic-core" }, + { name = "typing-extensions" }, + { name = "typing-inspection" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" }, +] + +[[package]] +name = "pydantic-core" +version = "2.41.5" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typing-extensions" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/90/32c9941e728d564b411d574d8ee0cf09b12ec978cb22b294995bae5549a5/pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146", size = 2107298, upload-time = "2025-11-04T13:39:04.116Z" }, + { url = "https://files.pythonhosted.org/packages/fb/a8/61c96a77fe28993d9a6fb0f4127e05430a267b235a124545d79fea46dd65/pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2", size = 1901475, upload-time = "2025-11-04T13:39:06.055Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b6/338abf60225acc18cdc08b4faef592d0310923d19a87fba1faf05af5346e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97", size = 1918815, upload-time = "2025-11-04T13:39:10.41Z" }, + { url = "https://files.pythonhosted.org/packages/d1/1c/2ed0433e682983d8e8cba9c8d8ef274d4791ec6a6f24c58935b90e780e0a/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9", size = 2065567, upload-time = "2025-11-04T13:39:12.244Z" }, + { url = "https://files.pythonhosted.org/packages/b3/24/cf84974ee7d6eae06b9e63289b7b8f6549d416b5c199ca2d7ce13bbcf619/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52", size = 2230442, upload-time = "2025-11-04T13:39:13.962Z" }, + { url = "https://files.pythonhosted.org/packages/fd/21/4e287865504b3edc0136c89c9c09431be326168b1eb7841911cbc877a995/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941", size = 2350956, upload-time = "2025-11-04T13:39:15.889Z" }, + { url = "https://files.pythonhosted.org/packages/a8/76/7727ef2ffa4b62fcab916686a68a0426b9b790139720e1934e8ba797e238/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a", size = 2068253, upload-time = "2025-11-04T13:39:17.403Z" }, + { url = "https://files.pythonhosted.org/packages/d5/8c/a4abfc79604bcb4c748e18975c44f94f756f08fb04218d5cb87eb0d3a63e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c", size = 2177050, upload-time = "2025-11-04T13:39:19.351Z" }, + { url = "https://files.pythonhosted.org/packages/67/b1/de2e9a9a79b480f9cb0b6e8b6ba4c50b18d4e89852426364c66aa82bb7b3/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2", size = 2147178, upload-time = "2025-11-04T13:39:21Z" }, + { url = "https://files.pythonhosted.org/packages/16/c1/dfb33f837a47b20417500efaa0378adc6635b3c79e8369ff7a03c494b4ac/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556", size = 
2341833, upload-time = "2025-11-04T13:39:22.606Z" }, + { url = "https://files.pythonhosted.org/packages/47/36/00f398642a0f4b815a9a558c4f1dca1b4020a7d49562807d7bc9ff279a6c/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49", size = 2321156, upload-time = "2025-11-04T13:39:25.843Z" }, + { url = "https://files.pythonhosted.org/packages/7e/70/cad3acd89fde2010807354d978725ae111ddf6d0ea46d1ea1775b5c1bd0c/pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba", size = 1989378, upload-time = "2025-11-04T13:39:27.92Z" }, + { url = "https://files.pythonhosted.org/packages/76/92/d338652464c6c367e5608e4488201702cd1cbb0f33f7b6a85a60fe5f3720/pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9", size = 2013622, upload-time = "2025-11-04T13:39:29.848Z" }, + { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" }, + { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" }, + { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" }, + { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" }, + { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" }, + { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" }, + { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" }, + { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" }, + { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" }, + { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" }, + { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" }, + { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" }, + { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" }, + { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" }, + { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" }, + { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" }, + { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" }, + { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" }, + { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" }, + { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" }, + { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" }, + { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" }, + { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" }, + { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" }, + { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" }, + { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" }, + { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" }, + { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" }, + { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" }, + { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" }, + { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" }, + { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" }, + { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" }, + { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" }, + { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" }, + { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" }, + { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" }, + { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" }, + { url = 
"https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" }, + { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" }, + { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" }, + { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" }, + { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" }, + { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" }, + { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" }, + { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" }, + { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" }, + { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size 
= 2138960, upload-time = "2025-11-04T13:41:14.627Z" }, + { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" }, + { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" }, + { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" }, + { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" }, + { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" }, + { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" }, + { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" }, + { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" }, + { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" }, + { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" }, + { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" }, + { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" }, + { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" }, + { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" }, + { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" }, + { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" }, + { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" }, + { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" }, + { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" }, + { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" }, + { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" }, + { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" }, + { url = "https://files.pythonhosted.org/packages/e6/b0/1a2aa41e3b5a4ba11420aba2d091b2d17959c8d1519ece3627c371951e73/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8", size = 2103351, upload-time = "2025-11-04T13:43:02.058Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ee/31b1f0020baaf6d091c87900ae05c6aeae101fa4e188e1613c80e4f1ea31/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a", size = 1925363, upload-time = "2025-11-04T13:43:05.159Z" }, + { url = "https://files.pythonhosted.org/packages/e1/89/ab8e86208467e467a80deaca4e434adac37b10a9d134cd2f99b28a01e483/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b", size = 2135615, upload-time = "2025-11-04T13:43:08.116Z" }, + { url = "https://files.pythonhosted.org/packages/99/0a/99a53d06dd0348b2008f2f30884b34719c323f16c3be4e6cc1203b74a91d/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2", size = 2175369, upload-time = "2025-11-04T13:43:12.49Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/94/30ca3b73c6d485b9bb0bc66e611cff4a7138ff9736b7e66bcf0852151636/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093", size = 2144218, upload-time = "2025-11-04T13:43:15.431Z" }, + { url = "https://files.pythonhosted.org/packages/87/57/31b4f8e12680b739a91f472b5671294236b82586889ef764b5fbc6669238/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a", size = 2329951, upload-time = "2025-11-04T13:43:18.062Z" }, + { url = "https://files.pythonhosted.org/packages/7d/73/3c2c8edef77b8f7310e6fb012dbc4b8551386ed575b9eb6fb2506e28a7eb/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963", size = 2318428, upload-time = "2025-11-04T13:43:20.679Z" }, + { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" }, + { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" }, + { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" }, + { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" }, + { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" }, + { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 
2316992, upload-time = "2025-11-04T13:43:43.602Z" },
+    { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" },
+]
+
 [[package]]
 name = "pygments"
 version = "2.19.2"
@@ -2117,6 +2261,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a3/dc/17031897dae0efacfea57dfd3a82fdd2a2aeb58e0ff71b77b87e44edc772/setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922", size = 1201486, upload-time = "2025-05-27T00:56:49.664Z" },
 ]
 
+[[package]]
+name = "skeletoken"
+version = "0.3.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+    { name = "pydantic" },
+    { name = "regex" },
+    { name = "tokenizers" },
+    { name = "transformers" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/73/16/c4b9107914b6ff0408a93fe330c59ff6f2deb4684d3932d9e1823ba71b0b/skeletoken-0.3.0.tar.gz", hash = "sha256:d35c957e28a7484a9628752340928ba857fd44834ba2b528ffd3c18f088c9086", size = 230755, upload-time = "2026-02-06T05:20:19.367Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/26/0c/cda6fe8ce5e7eafac5ee9cdeec6d8ce832ff4e5a9576a40f043d03552d46/skeletoken-0.3.0-py3-none-any.whl", hash = "sha256:88e5e2338ba871d2a888511469bbdffa876881aa46dd428759188bd1b5440426", size = 38831, upload-time = "2026-02-06T05:20:17.721Z" },
+]
+
 [[package]]
 name = "skops"
 version = "0.13.0"
@@ -2369,6 +2529,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" },
 ]
 
+[[package]]
+name = "typing-inspection"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949, upload-time = "2025-10-01T02:14:41.687Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611, upload-time = "2025-10-01T02:14:40.154Z" },
+]
+
 [[package]]
 name = "urllib3"
 version = "2.5.0"