From 580c81b903b8f195f91b4d15f2551b08af5c9fa1 Mon Sep 17 00:00:00 2001 From: Jan Lukas Rinker Date: Thu, 5 Feb 2026 14:39:40 +0100 Subject: [PATCH 1/6] Fix TableModel validator to accept pandas StringDtype MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The TableModel validator now properly accepts modern pandas StringDtype for instance_key columns, along with CategoricalDtype with string categories. This fixes #1062 where the validator incorrectly rejected StringDtype columns, forcing users to use deprecated object dtypes. The new validation logic: - Explicitly checks for pd.StringDtype instances - Accepts pd.CategoricalDtype with string categories - Maintains backward compatibility with integer and object dtypes - Provides clearer error messages 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- src/spatialdata/models/models.py | 49 ++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index 6a126b02..f68b01ac 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -1047,25 +1047,38 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None: raise ValueError(f"`{attr[self.REGION_KEY_KEY]}` not found in `adata.obs`. Please create the column.") if attr[self.INSTANCE_KEY] not in data.obs: raise ValueError(f"`{attr[self.INSTANCE_KEY]}` not found in `adata.obs`. 
Please create the column.") - if ( - (dtype := data.obs[attr[self.INSTANCE_KEY]].dtype) - not in [ - int, - np.int16, - np.uint16, - np.int32, - np.uint32, - np.int64, - np.uint64, - "O", - ] - and not pd.api.types.is_string_dtype(data.obs[attr[self.INSTANCE_KEY]]) - or (dtype == "O" and (val_dtype := type(data.obs[attr[self.INSTANCE_KEY]].iloc[0])) is not str) - ): - dtype = dtype if dtype != "O" else val_dtype + dtype = data.obs[attr[self.INSTANCE_KEY]].dtype + + # Check if dtype is valid for instance_key column + is_valid_dtype = False + + # Check for integer types + if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]: + is_valid_dtype = True + # Check for pandas StringDtype + elif isinstance(dtype, pd.StringDtype): + is_valid_dtype = True + # Check for CategoricalDtype with string categories + elif isinstance(dtype, pd.CategoricalDtype): + if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance(dtype.categories.dtype, pd.StringDtype): + is_valid_dtype = True + # Check for object dtype with string values + elif dtype == "O": + if len(data.obs[attr[self.INSTANCE_KEY]]) > 0: + val_dtype = type(data.obs[attr[self.INSTANCE_KEY]].iloc[0]) + if val_dtype is str: + is_valid_dtype = True + else: + # Empty column with object dtype is acceptable + is_valid_dtype = True + # Fallback check using pandas is_string_dtype + elif pd.api.types.is_string_dtype(dtype): + is_valid_dtype = True + + if not is_valid_dtype: raise TypeError( - f"Only int, np.int16, np.int32, np.int64, uint equivalents or string allowed as dtype for " - f"instance_key column in obs. Dtype found to be {dtype}" + f"Only int, np.int16, np.int32, np.int64, uint equivalents, pandas StringDtype, or string " + f"allowed as dtype for instance_key column in obs. 
Dtype found to be {dtype}" ) expected_regions = attr[self.REGION_KEY] if isinstance(attr[self.REGION_KEY], list) else [attr[self.REGION_KEY]] found_regions = data.obs[attr[self.REGION_KEY_KEY]].unique().tolist() From eb19535418486dc8f88c5d0768b753b6048c9c17 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 5 Feb 2026 13:47:24 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/models/models.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index f68b01ac..e148a986 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -1053,14 +1053,15 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None: is_valid_dtype = False # Check for integer types - if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]: - is_valid_dtype = True - # Check for pandas StringDtype - elif isinstance(dtype, pd.StringDtype): + if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] or isinstance( + dtype, pd.StringDtype + ): is_valid_dtype = True # Check for CategoricalDtype with string categories elif isinstance(dtype, pd.CategoricalDtype): - if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance(dtype.categories.dtype, pd.StringDtype): + if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance( + dtype.categories.dtype, pd.StringDtype + ): is_valid_dtype = True # Check for object dtype with string values elif dtype == "O": From 8cb3267c8fe9f721ec99a3761f6462a627c8d20d Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Mon, 16 Feb 2026 16:10:59 +0100 Subject: [PATCH 3/6] simplify instance_key validation; add tests for instance_key dtypes --- src/spatialdata/models/models.py | 45 +++++++++---------------- 
tests/models/test_models.py | 57 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 29 deletions(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index e148a986..41922a12 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -1047,39 +1047,26 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None: raise ValueError(f"`{attr[self.REGION_KEY_KEY]}` not found in `adata.obs`. Please create the column.") if attr[self.INSTANCE_KEY] not in data.obs: raise ValueError(f"`{attr[self.INSTANCE_KEY]}` not found in `adata.obs`. Please create the column.") - dtype = data.obs[attr[self.INSTANCE_KEY]].dtype + instance_col = data.obs[attr[self.INSTANCE_KEY]] + dtype = instance_col.dtype - # Check if dtype is valid for instance_key column - is_valid_dtype = False + _INT_TYPES = [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] - # Check for integer types - if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] or isinstance( - dtype, pd.StringDtype - ): - is_valid_dtype = True - # Check for CategoricalDtype with string categories - elif isinstance(dtype, pd.CategoricalDtype): - if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance( - dtype.categories.dtype, pd.StringDtype - ): - is_valid_dtype = True - # Check for object dtype with string values - elif dtype == "O": - if len(data.obs[attr[self.INSTANCE_KEY]]) > 0: - val_dtype = type(data.obs[attr[self.INSTANCE_KEY]].iloc[0]) - if val_dtype is str: - is_valid_dtype = True - else: - # Empty column with object dtype is acceptable - is_valid_dtype = True - # Fallback check using pandas is_string_dtype - elif pd.api.types.is_string_dtype(dtype): - is_valid_dtype = True + def _is_int_or_str_dtype(d: np.dtype) -> bool: + return d in _INT_TYPES or isinstance(d, pd.StringDtype) + + is_valid = _is_int_or_str_dtype(dtype) or ( + isinstance(dtype, pd.CategoricalDtype) and 
_is_int_or_str_dtype(dtype.categories.dtype) + ) + # the string case is already covered above, the check below covers the case of dtype("O") with string dtype + is_valid = is_valid or pd.api.types.is_string_dtype(instance_col) - if not is_valid_dtype: + if not is_valid: raise TypeError( - f"Only int, np.int16, np.int32, np.int64, uint equivalents, pandas StringDtype, or string " - f"allowed as dtype for instance_key column in obs. Dtype found to be {dtype}" + f"Only integer (int, np.int16, np.int32, np.int64, and uint equivalents), string " + f"(including pandas StringDtype and object dtype with string values), or categorical " + f"with integer/string categories allowed as dtype for instance_key column in obs. " + f"Dtype found to be {dtype}" ) expected_regions = attr[self.REGION_KEY] if isinstance(attr[self.REGION_KEY], list) else [attr[self.REGION_KEY]] found_regions = data.obs[attr[self.REGION_KEY_KEY]].unique().tolist() diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 1e82b698..fe044d98 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -471,6 +471,63 @@ def test_table_model( del table.uns[TableModel.ATTRS_KEY] _ = TableModel.parse(table) + @pytest.mark.parametrize( + "instance_key_values,instance_key_dtype,should_pass", + [ + # pd.StringDtype: accepted (issue #1062) + (["id_0", "id_1", "id_2", "id_3", "id_4"], pd.StringDtype(), True), + # object dtype with string values: accepted + (["id_0", "id_1", "id_2", "id_3", "id_4"], object, True), + # CategoricalDtype with object (string) categories: accepted (issue #1062) + ( + pd.Categorical(["id_0", "id_1", "id_2", "id_3", "id_4"]), + None, + True, + ), + # CategoricalDtype with StringDtype categories: accepted (issue #1062) + ( + pd.Categorical(pd.array(["id_0", "id_1", "id_2", "id_3", "id_4"], dtype="string")), + None, + True, + ), + # CategoricalDtype with integer categories: accepted + ( + pd.Categorical([0, 1, 2, 3, 4]), + None, + True, + ), + # 
CategoricalDtype with float categories: rejected + ( + pd.Categorical([0.0, 1.0, 2.0, 3.0, 4.0]), + None, + False, + ), + # integer dtype: accepted + ([0, 1, 2, 3, 4], np.int64, True), + # float dtype: rejected + ([0.0, 1.0, 2.0, 3.0, 4.0], np.float64, False), + # object dtype with non-string values: rejected + ([0, 1, 2, 3, 4], object, False), + ], + ) + def test_table_instance_key_dtype_validation(self, instance_key_values, instance_key_dtype, should_pass): + """Test that _validate_table_annotation_metadata accepts/rejects the correct dtypes for instance_key.""" + n = 5 + region = ["sample"] * n + region_key = "region" + obs = pd.DataFrame(index=list(map(str, range(n)))) + obs[region_key] = pd.Categorical(region) + if instance_key_dtype is not None: + obs["instance_id"] = pd.array(instance_key_values, dtype=instance_key_dtype) + else: + obs["instance_id"] = instance_key_values + adata = AnnData(RNG.normal(size=(n, 2)), obs=obs) + if should_pass: + _ = TableModel.parse(adata, region=region, region_key=region_key, instance_key="instance_id") + else: + with pytest.raises(TypeError, match="allowed as dtype for instance_key column"): + TableModel.parse(adata, region=region, region_key=region_key, instance_key="instance_id") + @pytest.mark.parametrize( "name", [ From 9a2f16a88f93a27ae5c7cf99f69692880c15a23b Mon Sep 17 00:00:00 2001 From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com> Date: Mon, 16 Feb 2026 18:04:09 +0100 Subject: [PATCH 4/6] Update src/spatialdata/models/models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/spatialdata/models/models.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index d93e0010..18c285c1 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -1042,9 +1042,13 @@ def _validate_table_annotation_metadata(cls, data: AnnData) -> None: def _is_int_or_str_dtype(d: 
np.dtype) -> bool: return d in _INT_TYPES or isinstance(d, pd.StringDtype) - is_valid = _is_int_or_str_dtype(dtype) or ( - isinstance(dtype, pd.CategoricalDtype) and _is_int_or_str_dtype(dtype.categories.dtype) - ) + # First, check the top-level dtype (covers plain int and StringDtype cases) + is_valid = _is_int_or_str_dtype(dtype) + # Explicitly handle categorical dtypes by inspecting the categories' dtype, including + # object-backed string categories via is_string_dtype on the categories' dtype. + if isinstance(dtype, pd.CategoricalDtype): + cat_dtype = dtype.categories.dtype + is_valid = is_valid or _is_int_or_str_dtype(cat_dtype) or pd.api.types.is_string_dtype(cat_dtype) # the string case is already covered above, the check below covers the case of dtype("O") with string dtype is_valid = is_valid or pd.api.types.is_string_dtype(instance_col) From b4ad2189d323687686bb2cd8439a11559d778323 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Mon, 16 Feb 2026 18:04:24 +0100 Subject: [PATCH 5/6] fix test --- tests/models/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index 611d0422..90f58b7b 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -508,7 +508,7 @@ def test_table_model( def test_table_instance_key_dtype_validation(self, instance_key_values, instance_key_dtype, should_pass): """Test that _validate_table_annotation_metadata accepts/rejects the correct dtypes for instance_key.""" n = 5 - region = ["sample"] * n + region = "sample" region_key = "region" obs = pd.DataFrame(index=list(map(str, range(n)))) obs[region_key] = pd.Categorical(region) From cee88e18b15e2b870d4e5286398d3fb00cac2fc8 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Tue, 17 Feb 2026 10:47:08 +0100 Subject: [PATCH 6/6] fix tests --- tests/models/test_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_models.py 
b/tests/models/test_models.py index 90f58b7b..c4ac3347 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -511,7 +511,7 @@ def test_table_instance_key_dtype_validation(self, instance_key_values, instance region = "sample" region_key = "region" obs = pd.DataFrame(index=list(map(str, range(n)))) - obs[region_key] = pd.Categorical(region) + obs[region_key] = pd.Categorical([region] * n) if instance_key_dtype is not None: obs["instance_id"] = pd.array(instance_key_values, dtype=instance_key_dtype) else: