@@ -100,7 +100,6 @@
     HDFS_KERB_TICKET,
     HDFS_PORT,
     HDFS_USER,
-    PYARROW_USE_LARGE_TYPES_ON_READ,
     S3_ACCESS_KEY_ID,
     S3_ANONYMOUS,
     S3_CONNECT_TIMEOUT,
@@ -179,7 +178,6 @@
 from pyiceberg.utils.config import Config
 from pyiceberg.utils.datetime import millis_to_datetime
 from pyiceberg.utils.decimal import unscaled_to_decimal
-from pyiceberg.utils.deprecated import deprecation_message
 from pyiceberg.utils.properties import get_first_property_value, property_as_bool, property_as_int
 from pyiceberg.utils.singleton import Singleton
 from pyiceberg.utils.truncate import truncate_upper_bound_binary_string, truncate_upper_bound_text_string
@@ -1656,6 +1654,7 @@ def _task_to_record_batches(
             current_batch,
             downcast_ns_timestamp_to_us=downcast_ns_timestamp_to_us,
             projected_missing_fields=projected_missing_fields,
+            allow_timestamp_tz_mismatch=True,
         )

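On the read path, `_task_to_record_batches` now opts into `allow_timestamp_tz_mismatch=True`, so a file column stored as UTC-zoned timestamptz can be projected to a tz-naive Iceberg `timestamp` field. A minimal pyarrow sketch of the cast this permits (illustrative, not pyiceberg code):

import pyarrow as pa

# UTC-zoned microsecond column, as a Parquet file written by Spark might store it
utc_col = pa.array([1_700_000_000_000_000], type=pa.timestamp("us", tz="UTC"))

# Projection to an Iceberg `timestamp` (no tz) drops the zone but keeps the instant
naive_col = utc_col.cast(pa.timestamp("us"))
print(naive_col.type)  # timestamp[us]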
@@ -1755,14 +1754,6 @@ def to_table(self, tasks: Iterable[FileScanTask]) -> pa.Table:
             (pa.Table.from_batches([batch]) for batch in itertools.chain([first_batch], batches)), promote_options="permissive"
         )

-        if property_as_bool(self._io.properties, PYARROW_USE_LARGE_TYPES_ON_READ, False):
-            deprecation_message(
-                deprecated_in="0.10.0",
-                removed_in="0.11.0",
-                help_message=f"Property `{PYARROW_USE_LARGE_TYPES_ON_READ}` will be removed.",
-            )
-            result = result.cast(arrow_schema)
-
         return result

     def to_record_batches(self, tasks: Iterable[FileScanTask]) -> Iterator[pa.RecordBatch]:
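With the deprecated `PYARROW_USE_LARGE_TYPES_ON_READ` branch removed, `to_table` returns the permissively promoted concatenation as-is instead of re-casting it to the requested Arrow schema. A hedged illustration of what `promote_options="permissive"` does when chunks disagree on small vs. large string types:

import pyarrow as pa

t1 = pa.table({"c": pa.array(["a"], type=pa.string())})
t2 = pa.table({"c": pa.array(["b"], type=pa.large_string())})

# Permissive promotion unifies the column to a common type instead of failing
merged = pa.concat_tables([t1, t2], promote_options="permissive")
print(merged.schema)  # c: large_string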
@@ -1849,13 +1840,18 @@ def _to_requested_schema(
     downcast_ns_timestamp_to_us: bool = False,
     include_field_ids: bool = False,
     projected_missing_fields: dict[int, Any] = EMPTY_DICT,
+    allow_timestamp_tz_mismatch: bool = False,
 ) -> pa.RecordBatch:
     # We could reuse some of these visitors
     struct_array = visit_with_partner(
         requested_schema,
         batch,
         ArrowProjectionVisitor(
-            file_schema, downcast_ns_timestamp_to_us, include_field_ids, projected_missing_fields=projected_missing_fields
+            file_schema,
+            downcast_ns_timestamp_to_us,
+            include_field_ids,
+            projected_missing_fields=projected_missing_fields,
+            allow_timestamp_tz_mismatch=allow_timestamp_tz_mismatch,
         ),
         ArrowAccessor(file_schema),
     )
@@ -1866,46 +1862,44 @@ class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, pa.Array | None]):
     _file_schema: Schema
     _include_field_ids: bool
     _downcast_ns_timestamp_to_us: bool
-    _use_large_types: bool | None
     _projected_missing_fields: dict[int, Any]
+    _allow_timestamp_tz_mismatch: bool

     def __init__(
         self,
         file_schema: Schema,
         downcast_ns_timestamp_to_us: bool = False,
         include_field_ids: bool = False,
-        use_large_types: bool | None = None,
         projected_missing_fields: dict[int, Any] = EMPTY_DICT,
+        allow_timestamp_tz_mismatch: bool = False,
     ) -> None:
         self._file_schema = file_schema
         self._include_field_ids = include_field_ids
         self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
-        self._use_large_types = use_large_types
         self._projected_missing_fields = projected_missing_fields
-
-        if use_large_types is not None:
-            deprecation_message(
-                deprecated_in="0.10.0",
-                removed_in="0.11.0",
-                help_message="Argument `use_large_types` will be removed from ArrowProjectionVisitor",
-            )
+        # When True, allows projecting timestamptz (UTC) to timestamp (no tz).
+        # Allowed for reading (aligns with Spark); disallowed for writing to enforce the Iceberg spec's strict typing.
+        self._allow_timestamp_tz_mismatch = allow_timestamp_tz_mismatch

     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self._file_schema.find_field(field.field_id)

         if field.field_type.is_primitive:
             if (target_type := schema_to_pyarrow(field.field_type, include_field_ids=self._include_field_ids)) != values.type:
                 if field.field_type == TimestampType():
-                    # Downcasting of nanoseconds to microseconds
+                    source_tz_compatible = values.type.tz is None or (
+                        self._allow_timestamp_tz_mismatch and values.type.tz in UTC_ALIASES
+                    )
                     if (
                         pa.types.is_timestamp(target_type)
                         and not target_type.tz
                         and pa.types.is_timestamp(values.type)
-                        and not values.type.tz
+                        and source_tz_compatible
                     ):
+                        # Downcasting of nanoseconds to microseconds
                         if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us:
                             return values.cast(target_type, safe=False)
-                        elif target_type.unit == "us" and values.type.unit in {"s", "ms"}:
+                        elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:
                             return values.cast(target_type)
                     raise ValueError(f"Unsupported schema projection from {values.type} to {target_type}")
                 elif field.field_type == TimestamptzType():
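Spelled out as a standalone predicate (names here are illustrative, not pyiceberg API), the new `source_tz_compatible` check accepts a naive source column unconditionally, and a UTC-zoned one only when the caller opted into the mismatch:

import pyarrow as pa

UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"}  # assumed to match pyiceberg's alias set

def tz_compatible(source: pa.DataType, allow_mismatch: bool) -> bool:
    # Mirrors the condition above: naive is always fine; UTC-zoned only on opt-in
    return pa.types.is_timestamp(source) and (
        source.tz is None or (allow_mismatch and source.tz in UTC_ALIASES)
    )

assert tz_compatible(pa.timestamp("us"), allow_mismatch=False)
assert tz_compatible(pa.timestamp("us", tz="UTC"), allow_mismatch=True)
assert not tz_compatible(pa.timestamp("us", tz="UTC"), allow_mismatch=False)
assert not tz_compatible(pa.timestamp("us", tz="America/New_York"), allow_mismatch=True)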
@@ -1915,6 +1909,7 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                         and pa.types.is_timestamp(values.type)
                         and (values.type.tz in UTC_ALIASES or values.type.tz is None)
                     ):
+                        # Downcasting of nanoseconds to microseconds
                         if target_type.unit == "us" and values.type.unit == "ns" and self._downcast_ns_timestamp_to_us:
                             return values.cast(target_type, safe=False)
                         elif target_type.unit == "us" and values.type.unit in {"s", "ms", "us"}:
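The relocated comment applies to both branches: nanosecond data cannot be represented losslessly in Iceberg's microsecond timestamps, so the ns case stays behind the explicit `downcast_ns_timestamp_to_us` opt-in and uses an unsafe cast. For example (illustrative only):

import pyarrow as pa

ns_col = pa.array([1], type=pa.timestamp("ns", tz="UTC"))

# A safe cast would raise on sub-microsecond values; safe=False truncates them
us_col = ns_col.cast(pa.timestamp("us", tz="UTC"), safe=False)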
@@ -1934,8 +1929,6 @@ def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
                 target_schema = schema_to_pyarrow(
                     promote(file_field.field_type, field.field_type), include_field_ids=self._include_field_ids
                 )
-                if self._use_large_types is False:
-                    target_schema = _pyarrow_schema_ensure_small_types(target_schema)
                 return values.cast(target_schema)

         return values
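Without the `_use_large_types is False` shim, promoted fields are cast straight to the Arrow type of the promoted Iceberg type; for instance (a sketch of Iceberg's int-to-long promotion, not pyiceberg code):

import pyarrow as pa

file_col = pa.array([1, 2], type=pa.int32())

# promote(IntegerType, LongType) projects the 32-bit file column to 64-bit
promoted = file_col.cast(pa.int64())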