
Commit 8ec0f0e

Merge remote-tracking branch 'origin/main' into kevinjqliu/upgrade-pytest
2 parents 1862b80 + 78615d2

File tree

8 files changed (+124, -120 lines changed)

pyiceberg/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -15,4 +15,4 @@
 # specific language governing permissions and limitations
 # under the License.

-__version__ = "0.10.0"
+__version__ = "0.11.0"

pyiceberg/io/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -99,7 +99,6 @@
 GCS_VERSION_AWARE = "gcs.version-aware"
 HF_ENDPOINT = "hf.endpoint"
 HF_TOKEN = "hf.token"
-PYARROW_USE_LARGE_TYPES_ON_READ = "pyarrow.use-large-types-on-read"


 @runtime_checkable
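
With PYARROW_USE_LARGE_TYPES_ON_READ removed, scans no longer offer a switch that forces the result into large Arrow types. A downstream application that still wants large types can cast the scan result itself. The snippet below is only a minimal sketch of that workaround, not part of this commit; the table handle `tbl` and the single `name` column are assumptions for illustration.

import pyarrow as pa

# Hypothetical post-removal adjustment: request large Arrow types explicitly after the scan.
result = tbl.scan().to_arrow()  # `tbl` is an existing pyiceberg Table handle (assumed)
large_schema = pa.schema([pa.field("name", pa.large_string())])  # assumed single string column
result = result.cast(large_schema)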

pyiceberg/io/pyarrow.py

Lines changed: 19 additions & 26 deletions
@@ -100,7 +100,6 @@
     HDFS_KERB_TICKET,
     HDFS_PORT,
     HDFS_USER,
-    PYARROW_USE_LARGE_TYPES_ON_READ,
     S3_ACCESS_KEY_ID,
     S3_ANONYMOUS,
     S3_CONNECT_TIMEOUT,

@@ -179,7 +178,6 @@
 from pyiceberg.utils.config import Config
 from pyiceberg.utils.datetime import millis_to_datetime
 from pyiceberg.utils.decimal import unscaled_to_decimal
-from pyiceberg.utils.deprecated import deprecation_message
 from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
 from pyiceberg.utils.singleton import Singleton
 from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string

@@ -1656,6 +1654,7 @@ def _task_to_record_batches(
                 current_batch,
                 downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
                 projected_missing_fields=projected_missing_fields,
+                allow_timestamp_tz_mismatch=True,
             )


@@ -1755,14 +1754,6 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
             (pa.Table.from_batches([batch]) for batch in itertools.chain([first_batch], batches)), promote_options="permissive"
         )

-        if property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, False):
-            deprecation_message(
-                deprecated_in="0.10.0",
-                removed_in="0.11.0",
-                help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
-            )
-            result = result.cast(arrow_schema)
-
         return result

     def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:

@@ -1849,13 +1840,18 @@ def _to_requested_schema(
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
     projected_missing_fields: dict[int, Any] = EMPTY_DICT,
+    allow_timestamp_tz_mismatch: bool = False,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
         ArrowProjectionVisitor(
-            file_schema, downcast_ns_timestamp_to_us, include_field_ids, projected_missing_fields=projected_missing_fields
+            file_schema,
+            downcast_ns_timestamp_to_us,
+            include_field_ids,
+            projected_missing_fields=projected_missing_fields,
+            allow_timestamp_tz_mismatch=allow_timestamp_tz_mismatch,
         ),
         ArrowAccessor(file_schema),
     )

@@ -1866,46 +1862,44 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None]
     _file_schema: Schema
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
-    _use_large_types: bool | None
     _projected_missing_fields: dict[int, Any]
+    _allow_timestamp_tz_mismatch: bool

     def __init__(
         self,
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: bool | None = None,
         projected_missing_fields: dict[int, Any] = EMPTY_DICT,
+        allow_timestamp_tz_mismatch: bool = False,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
-        self._use_large_types = use_large_types
         self._projected_missing_fields = projected_missing_fields
-
-        if use_large_types is not None:
-            deprecation_message(
-                deprecated_in="0.10.0",
-                removed_in="0.11.0",
-                help_message="Argument `use_large_types` will be removed from ArrowProjectionVisitor",
-            )
+        # When True, allows projecting timestamptz (UTC) to timestamp (no tz).
+        # Allowed for reading (aligns with Spark); disallowed for writing to enforce Iceberg spec's strict typing.
+        self._allow_timestamp_tz_mismatch = allow_timestamp_tz_mismatch

     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)

         if field.field_type.is_primitive:
             if (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                 if field.field_type == TimestampType():
-                    # Downcasting of nanoseconds to microseconds
+                    source_tz_compatible = values.type.tz is None or (
+                        self._allow_timestamp_tz_mismatch and values.type.tz in UTC_ALIASES
+                    )
                     if (
                         pa.types.is_timestamp(target_type)
                         and not target_type.tz
                         and pa.types.is_timestamp(values.type)
-                        and not values.type.tz
+                        and source_tz_compatible
                     ):
+                        # Downcasting of nanoseconds to microseconds
                         if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us:
                             return values.cast(target_type, safe=False)
-                        elif target_type.unit == "us" and values.type.unit in {"s", "ms"}:
+                        elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:
                             return values.cast(target_type)
                     raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}")
                 elif field.field_type == TimestamptzType():

@@ -1915,6 +1909,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                         and pa.types.is_timestamp(values.type)
                         and (values.type.tz in UTC_ALIASES or values.type.tz is None)
                     ):
+                        # Downcasting of nanoseconds to microseconds
                         if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us:
                             return values.cast(target_type, safe=False)
                         elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:

@@ -1934,8 +1929,6 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                 target_schema = schema_to_pyarrow(
                     promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                 )
-                if self._use_large_types is False:
-                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
                 return values.cast(target_schema)

         return values
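
Taken together, these changes make the timestamptz-to-timestamp projection a read-only allowance: _task_to_record_batches passes allow_timestamp_tz_mismatch=True, while the write path keeps the default False and rejects the same projection. A minimal standalone sketch in plain PyArrow (not the PyIceberg API) of the cast the read path now permits:

import pyarrow as pa

# A UTC-zoned microsecond timestamp column, built from epoch-microsecond integers.
utc_col = pa.array([1_700_000_000_000_000], type=pa.timestamp("us", tz="UTC"))
# Projected to a zone-less column of the same unit: the stored values are unchanged,
# only the UTC annotation is dropped. This mirrors what ArrowProjectionVisitor allows
# when allow_timestamp_tz_mismatch is True and values.type.tz is one of the UTC aliases.
naive_col = utc_col.cast(pa.timestamp("us"))
assert naive_col.type == pa.timestamp("us")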

pyiceberg/table/snapshots.py

Lines changed: 1 addition & 46 deletions
@@ -29,7 +29,6 @@
 from pyiceberg.manifest import DataFile, DataFileContent, ManifestFile, _manifests
 from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
 from pyiceberg.schema import Schema
-from pyiceberg.utils.deprecated import deprecation_message

 if TYPE_CHECKING:
     from pyiceberg.table.metadata import TableMetadata

@@ -344,54 +343,10 @@ def _partition_summary(self, update_metrics: UpdateMetrics) -> str:
         return ",".join([f"{prop}={val}" for prop, val in update_metrics.to_dict().items()])


-def _truncate_table_summary(summary: Summary, previous_summary: Mapping[str, str]) -> Summary:
-    for prop in {
-        TOTAL_DATA_FILES,
-        TOTAL_DELETE_FILES,
-        TOTAL_RECORDS,
-        TOTAL_FILE_SIZE,
-        TOTAL_POSITION_DELETES,
-        TOTAL_EQUALITY_DELETES,
-    }:
-        summary[prop] = "0"
-
-    def get_prop(prop: str) -> int:
-        value = previous_summary.get(prop) or "0"
-        try:
-            return int(value)
-        except ValueError as e:
-            raise ValueError(f"Could not parse summary property {prop} to an int: {value}") from e
-
-    if value := get_prop(TOTAL_DATA_FILES):
-        summary[DELETED_DATA_FILES] = str(value)
-    if value := get_prop(TOTAL_DELETE_FILES):
-        summary[REMOVED_DELETE_FILES] = str(value)
-    if value := get_prop(TOTAL_RECORDS):
-        summary[DELETED_RECORDS] = str(value)
-    if value := get_prop(TOTAL_FILE_SIZE):
-        summary[REMOVED_FILE_SIZE] = str(value)
-    if value := get_prop(TOTAL_POSITION_DELETES):
-        summary[REMOVED_POSITION_DELETES] = str(value)
-    if value := get_prop(TOTAL_EQUALITY_DELETES):
-        summary[REMOVED_EQUALITY_DELETES] = str(value)
-
-    return summary
-
-
-def update_snapshot_summaries(
-    summary: Summary, previous_summary: Mapping[str, str] | None = None, truncate_full_table: bool = False
-) -> Summary:
+def update_snapshot_summaries(summary: Summary, previous_summary: Mapping[str, str] | None = None) -> Summary:
     if summary.operation not in {Operation.APPEND, Operation.OVERWRITE, Operation.DELETE}:
         raise ValueError(f"Operation not implemented: {summary.operation}")

-    if truncate_full_table and summary.operation == Operation.OVERWRITE and previous_summary is not None:
-        deprecation_message(
-            deprecated_in="0.10.0",
-            removed_in="0.11.0",
-            help_message="The truncate-full-table shouldn't be used.",
-        )
-        summary = _truncate_table_summary(summary, previous_summary)
-
     if not previous_summary:
         previous_summary = {
             TOTAL_DATA_FILES: "0",
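
With truncate_full_table and the _truncate_table_summary helper removed, update_snapshot_summaries only combines the new summary's metrics with the previous snapshot's totals. A minimal usage sketch of the simplified signature; the summary property values are made up for illustration.

from pyiceberg.table.snapshots import Operation, Summary, update_snapshot_summaries

new_summary = update_snapshot_summaries(
    summary=Summary(operation=Operation.APPEND, **{"added-data-files": "1", "added-records": "100"}),
    previous_summary={"total-data-files": "0", "total-records": "0"},
)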

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@
 # under the License.
 [project]
 name = "pyiceberg"
-version = "0.10.0"
+version = "0.11.0"
 description = "Apache Iceberg is an open table format for huge analytic datasets"
 authors = [{ name = "Apache Software Foundation", email = "dev@iceberg.apache.org" }]
 requires-python = ">=3.10.0,<4.0.0"

tests/integration/test_reads.py

Lines changed: 0 additions & 44 deletions
@@ -46,7 +46,6 @@
     NotNaN,
     NotNull,
 )
-from pyiceberg.io import PYARROW_USE_LARGE_TYPES_ON_READ
 from pyiceberg.io.pyarrow import (
     pyarrow_to_schema,
 )

@@ -1126,49 +1125,6 @@ def test_table_scan_keep_types(catalog: Catalog) -> None:
     assert result_table.schema.equals(expected_schema)


-@pytest.mark.integration
-@pytest.mark.parametrize("catalog", [lf("session_catalog_hive"), lf("session_catalog")])
-def test_table_scan_override_with_small_types(catalog: Catalog) -> None:
-    identifier = "default.test_table_scan_override_with_small_types"
-    arrow_table = pa.Table.from_arrays(
-        [
-            pa.array(["a", "b", "c"]),
-            pa.array(["a", "b", "c"]),
-            pa.array([b"a", b"b", b"c"]),
-            pa.array([["a", "b"], ["c", "d"], ["e", "f"]]),
-        ],
-        names=["string", "string-to-binary", "binary", "list"],
-    )
-
-    try:
-        catalog.drop_table(identifier)
-    except NoSuchTableError:
-        pass
-
-    tbl = catalog.create_table(
-        identifier,
-        schema=arrow_table.schema,
-    )
-
-    tbl.append(arrow_table)
-
-    with tbl.update_schema() as update_schema:
-        update_schema.update_column("string-to-binary", BinaryType())
-
-    tbl.io.properties[PYARROW_USE_LARGE_TYPES_ON_READ] = "False"
-    result_table = tbl.scan().to_arrow()
-
-    expected_schema = pa.schema(
-        [
-            pa.field("string", pa.string()),
-            pa.field("string-to-binary", pa.large_binary()),
-            pa.field("binary", pa.binary()),
-            pa.field("list", pa.list_(pa.string())),
-        ]
-    )
-    assert result_table.schema.equals(expected_schema)
-
-
 @pytest.mark.integration
 @pytest.mark.parametrize("catalog", [lf("session_catalog_hive"), lf("session_catalog")])
 def test_empty_scan_ordered_str(catalog: Catalog) -> None:

tests/io/test_pyarrow.py

Lines changed: 101 additions & 0 deletions
@@ -81,6 +81,7 @@
     expression_to_pyarrow,
     parquet_path_to_id_mapping,
     schema_to_pyarrow,
+    write_file,
 )
 from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
 from pyiceberg.partitioning import PartitionField, PartitionSpec

@@ -2725,6 +2726,106 @@ def test__to_requested_schema_timestamp_to_timestamptz_projection() -> None:
     assert expected.equals(actual_result)


+def test__to_requested_schema_timestamptz_to_timestamp_projection() -> None:
+    # file is written with timestamp with timezone
+    file_schema = Schema(NestedField(1, "ts_field", TimestamptzType(), required=False))
+    batch = pa.record_batch(
+        [
+            pa.array(
+                [
+                    datetime(2025, 8, 14, 12, 0, 0, tzinfo=timezone.utc),
+                    datetime(2025, 8, 14, 13, 0, 0, tzinfo=timezone.utc),
+                ],
+                type=pa.timestamp("us", tz="UTC"),
+            )
+        ],
+        names=["ts_field"],
+    )
+
+    # table schema expects timestamp without timezone
+    table_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False))
+
+    # allow_timestamp_tz_mismatch=True enables reading timestamptz as timestamp
+    actual_result = _to_requested_schema(
+        table_schema, file_schema, batch, downcast_ns_timestamp_to_us=True, allow_timestamp_tz_mismatch=True
+    )
+    expected = pa.record_batch(
+        [
+            pa.array(
+                [
+                    datetime(2025, 8, 14, 12, 0, 0),
+                    datetime(2025, 8, 14, 13, 0, 0),
+                ],
+                type=pa.timestamp("us"),
+            )
+        ],
+        names=["ts_field"],
+    )
+
+    # expect actual_result to have no timezone
+    assert expected.equals(actual_result)
+
+
+def test__to_requested_schema_timestamptz_to_timestamp_write_rejects() -> None:
+    """Test that the write path (default) rejects timestamptz to timestamp casting.
+
+    This ensures we enforce the Iceberg spec distinction between timestamp and timestamptz on writes,
+    while the read path can be more permissive (like Spark) via allow_timestamp_tz_mismatch=True.
+    """
+    # file is written with timestamp with timezone
+    file_schema = Schema(NestedField(1, "ts_field", TimestamptzType(), required=False))
+    batch = pa.record_batch(
+        [
+            pa.array(
+                [
+                    datetime(2025, 8, 14, 12, 0, 0, tzinfo=timezone.utc),
+                    datetime(2025, 8, 14, 13, 0, 0, tzinfo=timezone.utc),
+                ],
+                type=pa.timestamp("us", tz="UTC"),
+            )
+        ],
+        names=["ts_field"],
+    )
+
+    # table schema expects timestamp without timezone
+    table_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False))
+
+    # allow_timestamp_tz_mismatch=False (default, used in write path) should raise
+    with pytest.raises(ValueError, match="Unsupported schema projection"):
+        _to_requested_schema(
+            table_schema, file_schema, batch, downcast_ns_timestamp_to_us=True, allow_timestamp_tz_mismatch=False
+        )
+
+
+def test_write_file_rejects_timestamptz_to_timestamp(tmp_path: Path) -> None:
+    """Test that write_file rejects writing timestamptz data to a timestamp column."""
+    from pyiceberg.table import WriteTask
+
+    # Table expects timestamp (no tz), but data has timestamptz
+    table_schema = Schema(NestedField(1, "ts_field", TimestampType(), required=False))
+    task_schema = Schema(NestedField(1, "ts_field", TimestamptzType(), required=False))
+
+    arrow_data = pa.table({"ts_field": [datetime(2025, 8, 14, 12, 0, 0, tzinfo=timezone.utc)]})
+
+    table_metadata = TableMetadataV2(
+        location=f"file://{tmp_path}",
+        last_column_id=1,
+        format_version=2,
+        schemas=[table_schema],
+        partition_specs=[PartitionSpec()],
+    )
+
+    task = WriteTask(
+        write_uuid=uuid.uuid4(),
+        task_id=0,
+        record_batches=arrow_data.to_batches(),
+        schema=task_schema,
+    )
+
+    with pytest.raises(ValueError, match="Unsupported schema projection"):
+        list(write_file(io=PyArrowFileIO(), table_metadata=table_metadata, tasks=iter([task])))
+
+
 def test__to_requested_schema_timestamps(
     arrow_table_schema_with_all_timestamp_precisions: pa.Schema,
     arrow_table_with_all_timestamp_precisions: pa.Table,
