From 580c81b903b8f195f91b4d15f2551b08af5c9fa1 Mon Sep 17 00:00:00 2001
From: Jan Lukas Rinker <jan.rinker@chuv.ch>
Date: Thu, 5 Feb 2026 14:39:40 +0100
Subject: [PATCH 1/6] Fix TableModel validator to accept pandas StringDtype
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The TableModel validator now properly accepts modern pandas StringDtype
for instance_key columns, along with CategoricalDtype with string categories.

This fixes #1062 where the validator incorrectly rejected StringDtype columns,
forcing users to use deprecated object dtypes. The new validation logic:
- Explicitly checks for pd.StringDtype instances
- Accepts pd.CategoricalDtype with string categories
- Maintains backward compatibility with integer and object dtypes
- Provides clearer error messages

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 src/spatialdata/models/models.py | 49 ++++++++++++++++++++------------
 1 file changed, 31 insertions(+), 18 deletions(-)

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index 6a126b02..f68b01ac 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -1047,25 +1047,38 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None:
             raise ValueError(f"`{attr[self.REGION_KEY_KEY]}` not found in `adata.obs`. Please create the column.")
         if attr[self.INSTANCE_KEY] not in data.obs:
             raise ValueError(f"`{attr[self.INSTANCE_KEY]}` not found in `adata.obs`. Please create the column.")
-        if (
-            (dtype := data.obs[attr[self.INSTANCE_KEY]].dtype)
-            not in [
-                int,
-                np.int16,
-                np.uint16,
-                np.int32,
-                np.uint32,
-                np.int64,
-                np.uint64,
-                "O",
-            ]
-            and not pd.api.types.is_string_dtype(data.obs[attr[self.INSTANCE_KEY]])
-            or (dtype == "O" and (val_dtype := type(data.obs[attr[self.INSTANCE_KEY]].iloc[0])) is not str)
-        ):
-            dtype = dtype if dtype != "O" else val_dtype
+        dtype = data.obs[attr[self.INSTANCE_KEY]].dtype
+
+        # Check if dtype is valid for instance_key column
+        is_valid_dtype = False
+
+        # Check for integer types
+        if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]:
+            is_valid_dtype = True
+        # Check for pandas StringDtype
+        elif isinstance(dtype, pd.StringDtype):
+            is_valid_dtype = True
+        # Check for CategoricalDtype with string categories
+        elif isinstance(dtype, pd.CategoricalDtype):
+            if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance(dtype.categories.dtype, pd.StringDtype):
+                is_valid_dtype = True
+        # Check for object dtype with string values
+        elif dtype == "O":
+            if len(data.obs[attr[self.INSTANCE_KEY]]) > 0:
+                val_dtype = type(data.obs[attr[self.INSTANCE_KEY]].iloc[0])
+                if val_dtype is str:
+                    is_valid_dtype = True
+            else:
+                # Empty column with object dtype is acceptable
+                is_valid_dtype = True
+        # Fallback check using pandas is_string_dtype
+        elif pd.api.types.is_string_dtype(dtype):
+            is_valid_dtype = True
+
+        if not is_valid_dtype:
             raise TypeError(
-                f"Only int, np.int16, np.int32, np.int64, uint equivalents or string allowed as dtype for "
-                f"instance_key column in obs. Dtype found to be {dtype}"
+                f"Only int, np.int16, np.int32, np.int64, uint equivalents, pandas StringDtype, or string "
+                f"allowed as dtype for instance_key column in obs. Dtype found to be {dtype}"
             )
         expected_regions = attr[self.REGION_KEY] if isinstance(attr[self.REGION_KEY], list) else [attr[self.REGION_KEY]]
         found_regions = data.obs[attr[self.REGION_KEY_KEY]].unique().tolist()

From eb19535418486dc8f88c5d0768b753b6048c9c17 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 5 Feb 2026 13:47:24 +0000
Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 src/spatialdata/models/models.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index f68b01ac..e148a986 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -1053,14 +1053,15 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None:
         is_valid_dtype = False
 
         # Check for integer types
-        if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]:
-            is_valid_dtype = True
-        # Check for pandas StringDtype
-        elif isinstance(dtype, pd.StringDtype):
+        if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] or isinstance(
+            dtype, pd.StringDtype
+        ):
             is_valid_dtype = True
         # Check for CategoricalDtype with string categories
         elif isinstance(dtype, pd.CategoricalDtype):
-            if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance(dtype.categories.dtype, pd.StringDtype):
+            if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance(
+                dtype.categories.dtype, pd.StringDtype
+            ):
                 is_valid_dtype = True
         # Check for object dtype with string values
         elif dtype == "O":

From 8cb3267c8fe9f721ec99a3761f6462a627c8d20d Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Mon, 16 Feb 2026 16:10:59 +0100
Subject: [PATCH 3/6] simplify instance_key validation; add tests for
 instance_key dtypes

---
 src/spatialdata/models/models.py | 45 +++++++++----------------
 tests/models/test_models.py      | 57 ++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index e148a986..41922a12 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -1047,39 +1047,26 @@ def _validate_table_annotation_metadata(self, data: AnnData) -> None:
             raise ValueError(f"`{attr[self.REGION_KEY_KEY]}` not found in `adata.obs`. Please create the column.")
         if attr[self.INSTANCE_KEY] not in data.obs:
             raise ValueError(f"`{attr[self.INSTANCE_KEY]}` not found in `adata.obs`. Please create the column.")
-        dtype = data.obs[attr[self.INSTANCE_KEY]].dtype
+        instance_col = data.obs[attr[self.INSTANCE_KEY]]
+        dtype = instance_col.dtype
 
-        # Check if dtype is valid for instance_key column
-        is_valid_dtype = False
+        _INT_TYPES = [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]
 
-        # Check for integer types
-        if dtype in [int, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64] or isinstance(
-            dtype, pd.StringDtype
-        ):
-            is_valid_dtype = True
-        # Check for CategoricalDtype with string categories
-        elif isinstance(dtype, pd.CategoricalDtype):
-            if pd.api.types.is_string_dtype(dtype.categories.dtype) or isinstance(
-                dtype.categories.dtype, pd.StringDtype
-            ):
-                is_valid_dtype = True
-        # Check for object dtype with string values
-        elif dtype == "O":
-            if len(data.obs[attr[self.INSTANCE_KEY]]) > 0:
-                val_dtype = type(data.obs[attr[self.INSTANCE_KEY]].iloc[0])
-                if val_dtype is str:
-                    is_valid_dtype = True
-            else:
-                # Empty column with object dtype is acceptable
-                is_valid_dtype = True
-        # Fallback check using pandas is_string_dtype
-        elif pd.api.types.is_string_dtype(dtype):
-            is_valid_dtype = True
+        def _is_int_or_str_dtype(d: np.dtype) -> bool:
+            return d in _INT_TYPES or isinstance(d, pd.StringDtype)
+
+        is_valid = _is_int_or_str_dtype(dtype) or (
+            isinstance(dtype, pd.CategoricalDtype) and _is_int_or_str_dtype(dtype.categories.dtype)
+        )
+        # the string case is already covered above, the check below covers the case of dtype("O") with string dtype
+        is_valid = is_valid or pd.api.types.is_string_dtype(instance_col)
 
-        if not is_valid_dtype:
+        if not is_valid:
             raise TypeError(
-                f"Only int, np.int16, np.int32, np.int64, uint equivalents, pandas StringDtype, or string "
-                f"allowed as dtype for instance_key column in obs. Dtype found to be {dtype}"
+                f"Only integer (int, np.int16, np.int32, np.int64, and uint equivalents), string "
+                f"(including pandas StringDtype and object dtype with string values), or categorical "
+                f"with integer/string categories allowed as dtype for instance_key column in obs. "
+                f"Dtype found to be {dtype}"
             )
         expected_regions = attr[self.REGION_KEY] if isinstance(attr[self.REGION_KEY], list) else [attr[self.REGION_KEY]]
         found_regions = data.obs[attr[self.REGION_KEY_KEY]].unique().tolist()
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 1e82b698..fe044d98 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -471,6 +471,63 @@ def test_table_model(
         del table.uns[TableModel.ATTRS_KEY]
         _ = TableModel.parse(table)
 
+    @pytest.mark.parametrize(
+        "instance_key_values,instance_key_dtype,should_pass",
+        [
+            # pd.StringDtype: accepted (issue #1062)
+            (["id_0", "id_1", "id_2", "id_3", "id_4"], pd.StringDtype(), True),
+            # object dtype with string values: accepted
+            (["id_0", "id_1", "id_2", "id_3", "id_4"], object, True),
+            # CategoricalDtype with object (string) categories: accepted (issue #1062)
+            (
+                pd.Categorical(["id_0", "id_1", "id_2", "id_3", "id_4"]),
+                None,
+                True,
+            ),
+            # CategoricalDtype with StringDtype categories: accepted (issue #1062)
+            (
+                pd.Categorical(pd.array(["id_0", "id_1", "id_2", "id_3", "id_4"], dtype="string")),
+                None,
+                True,
+            ),
+            # CategoricalDtype with integer categories: accepted
+            (
+                pd.Categorical([0, 1, 2, 3, 4]),
+                None,
+                True,
+            ),
+            # CategoricalDtype with float categories: rejected
+            (
+                pd.Categorical([0.0, 1.0, 2.0, 3.0, 4.0]),
+                None,
+                False,
+            ),
+            # integer dtype: accepted
+            ([0, 1, 2, 3, 4], np.int64, True),
+            # float dtype: rejected
+            ([0.0, 1.0, 2.0, 3.0, 4.0], np.float64, False),
+            # object dtype with non-string values: rejected
+            ([0, 1, 2, 3, 4], object, False),
+        ],
+    )
+    def test_table_instance_key_dtype_validation(self, instance_key_values, instance_key_dtype, should_pass):
+        """Test that _validate_table_annotation_metadata accepts/rejects the correct dtypes for instance_key."""
+        n = 5
+        region = ["sample"] * n
+        region_key = "region"
+        obs = pd.DataFrame(index=list(map(str, range(n))))
+        obs[region_key] = pd.Categorical(region)
+        if instance_key_dtype is not None:
+            obs["instance_id"] = pd.array(instance_key_values, dtype=instance_key_dtype)
+        else:
+            obs["instance_id"] = instance_key_values
+        adata = AnnData(RNG.normal(size=(n, 2)), obs=obs)
+        if should_pass:
+            _ = TableModel.parse(adata, region=region, region_key=region_key, instance_key="instance_id")
+        else:
+            with pytest.raises(TypeError, match="allowed as dtype for instance_key column"):
+                TableModel.parse(adata, region=region, region_key=region_key, instance_key="instance_id")
+
     @pytest.mark.parametrize(
         "name",
         [

From 9a2f16a88f93a27ae5c7cf99f69692880c15a23b Mon Sep 17 00:00:00 2001
From: LucaMarconato <2664412+LucaMarconato@users.noreply.github.com>
Date: Mon, 16 Feb 2026 18:04:09 +0100
Subject: [PATCH 4/6] Update src/spatialdata/models/models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/spatialdata/models/models.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py
index d93e0010..18c285c1 100644
--- a/src/spatialdata/models/models.py
+++ b/src/spatialdata/models/models.py
@@ -1042,9 +1042,13 @@ def _validate_table_annotation_metadata(cls, data: AnnData) -> None:
         def _is_int_or_str_dtype(d: np.dtype) -> bool:
             return d in _INT_TYPES or isinstance(d, pd.StringDtype)
 
-        is_valid = _is_int_or_str_dtype(dtype) or (
-            isinstance(dtype, pd.CategoricalDtype) and _is_int_or_str_dtype(dtype.categories.dtype)
-        )
+        # First, check the top-level dtype (covers plain int and StringDtype cases)
+        is_valid = _is_int_or_str_dtype(dtype)
+        # Explicitly handle categorical dtypes by inspecting the categories' dtype, including
+        # object-backed string categories via is_string_dtype on the categories' dtype.
+        if isinstance(dtype, pd.CategoricalDtype):
+            cat_dtype = dtype.categories.dtype
+            is_valid = is_valid or _is_int_or_str_dtype(cat_dtype) or pd.api.types.is_string_dtype(cat_dtype)
         # the string case is already covered above, the check below covers the case of dtype("O") with string dtype
         is_valid = is_valid or pd.api.types.is_string_dtype(instance_col)
 

From b4ad2189d323687686bb2cd8439a11559d778323 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Mon, 16 Feb 2026 18:04:24 +0100
Subject: [PATCH 5/6] fix test: pass region as a single string

---
 tests/models/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 611d0422..90f58b7b 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -508,7 +508,7 @@ def test_table_model(
     def test_table_instance_key_dtype_validation(self, instance_key_values, instance_key_dtype, should_pass):
         """Test that _validate_table_annotation_metadata accepts/rejects the correct dtypes for instance_key."""
         n = 5
-        region = ["sample"] * n
+        region = "sample"
         region_key = "region"
         obs = pd.DataFrame(index=list(map(str, range(n))))
         obs[region_key] = pd.Categorical(region)

From cee88e18b15e2b870d4e5286398d3fb00cac2fc8 Mon Sep 17 00:00:00 2001
From: Luca Marconato <m.lucalmer@gmail.com>
Date: Tue, 17 Feb 2026 10:47:08 +0100
Subject: [PATCH 6/6] fix tests: build region column with one value per row

---
 tests/models/test_models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 90f58b7b..c4ac3347 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -511,7 +511,7 @@ def test_table_instance_key_dtype_validation(self, instance_key_values, instance
         region = "sample"
         region_key = "region"
         obs = pd.DataFrame(index=list(map(str, range(n))))
-        obs[region_key] = pd.Categorical(region)
+        obs[region_key] = pd.Categorical([region] * n)
         if instance_key_dtype is not None:
             obs["instance_id"] = pd.array(instance_key_values, dtype=instance_key_dtype)
         else: