Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions pyiceberg/io/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2183,6 +2183,11 @@ def min_as_bytes(self) -> bytes | None:
if self.current_min is None:
return None

# The Parquet format stores column statistics as binary (see Statistics struct
# in parquet.thrift). PyArrow may return these as bytes instead of str.
if self.primitive_type == StringType() and isinstance(self.current_min, bytes):
self.current_min = self.current_min.decode("utf-8")

return self.serialize(
self.current_min
if self.trunc_length is None
Expand All @@ -2194,10 +2199,14 @@ def max_as_bytes(self) -> bytes | None:
return None

if self.primitive_type == StringType():
if not isinstance(self.current_max, str):
raise ValueError("Expected the current_max to be a string")
s_result = truncate_upper_bound_text_string(self.current_max, self.trunc_length)
return self.serialize(s_result) if s_result is not None else None
# The Parquet format stores column statistics as binary (see Statistics struct
# in parquet.thrift). PyArrow may return these as bytes instead of str.
if isinstance(self.current_max, bytes):
self.current_max = self.current_max.decode("utf-8")
if isinstance(self.current_max, str):
s_result = truncate_upper_bound_text_string(self.current_max, self.trunc_length)
return self.serialize(s_result) if s_result is not None else None
raise ValueError(f"Expected the current_max to be a str, got {type(self.current_max)}")
elif self.primitive_type == BinaryType():
if not isinstance(self.current_max, bytes):
raise ValueError("Expected the current_max to be bytes")
Expand Down
109 changes: 109 additions & 0 deletions tests/io/test_pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2232,6 +2232,115 @@ def test_stats_aggregator_physical_type_does_not_match_expected_raise_error(
StatsAggregator(iceberg_type, physical_type_string)


def test_stats_aggregator_string_type_with_bytes_value() -> None:
    """Verify StatsAggregator accepts bytes min/max values for StringType.

    The Parquet format stores column statistics as binary data (see the
    Statistics struct in parquet.thrift), so PyArrow may surface them as
    Python bytes rather than str; files written by DuckDB are a known
    producer of such values.
    """
    aggregator = StatsAggregator(StringType(), "BYTE_ARRAY")

    # Realistic bytes-typed min/max for a "source" column, as PyArrow
    # would hand them back from Parquet statistics.
    raw_min = b"/docs/readme.md"
    raw_max = b"/docs/tutorial.md"
    aggregator.update_min(raw_min)
    aggregator.update_max(raw_max)

    # The raw bytes are stored untouched until serialization time.
    assert aggregator.current_min == raw_min
    assert aggregator.current_max == raw_max

    # Serialization decodes the bytes to str and re-encodes them as UTF-8,
    # which is the identity round trip for this ASCII input.
    assert aggregator.min_as_bytes() == raw_min
    assert aggregator.max_as_bytes() == raw_max


def test_stats_aggregator_string_type_with_str_value() -> None:
    """Verify StatsAggregator handles plain str min/max values for StringType."""
    aggregator = StatsAggregator(StringType(), "BYTE_ARRAY")

    # str is the common case: PyArrow has already decoded the statistics.
    aggregator.update_min("2024-01-01")
    aggregator.update_max("2024-12-31")

    assert aggregator.current_min == "2024-01-01"
    assert aggregator.current_max == "2024-12-31"

    # Serialization UTF-8 encodes the str values to bytes.
    assert aggregator.min_as_bytes() == b"2024-01-01"
    assert aggregator.max_as_bytes() == b"2024-12-31"


def test_stats_aggregator_integer_type() -> None:
    """Verify StatsAggregator min/max serialization for IntegerType."""
    aggregator = StatsAggregator(IntegerType(), "INT32")

    lo, hi = 1, 1000
    aggregator.update_min(lo)
    aggregator.update_max(hi)

    assert aggregator.current_min == lo
    assert aggregator.current_max == hi

    # INT32 values serialize as 4-byte little-endian two's complement.
    assert aggregator.min_as_bytes() == lo.to_bytes(4, byteorder="little", signed=True)
    assert aggregator.max_as_bytes() == hi.to_bytes(4, byteorder="little", signed=True)


def test_stats_aggregator_long_type() -> None:
    """Verify StatsAggregator min/max serialization for LongType."""
    aggregator = StatsAggregator(LongType(), "INT64")

    long_min = -(2**63)  # smallest 64-bit signed value (-9223372036854775808)
    long_max = 2**63 - 1  # largest 64-bit signed value (9223372036854775807)
    aggregator.update_min(long_min)
    aggregator.update_max(long_max)

    # INT64 values serialize as 8-byte little-endian two's complement.
    assert aggregator.min_as_bytes() == long_min.to_bytes(8, byteorder="little", signed=True)
    assert aggregator.max_as_bytes() == long_max.to_bytes(8, byteorder="little", signed=True)


def test_stats_aggregator_double_type() -> None:
    """Verify StatsAggregator min/max serialization for DoubleType."""
    import struct

    aggregator = StatsAggregator(DoubleType(), "DOUBLE")

    lowest = -273.15  # absolute zero in Celsius
    highest = 1000000.0
    aggregator.update_min(lowest)
    aggregator.update_max(highest)

    # DOUBLE values serialize as 8-byte IEEE 754 little-endian.
    assert aggregator.min_as_bytes() == struct.pack("<d", lowest)
    assert aggregator.max_as_bytes() == struct.pack("<d", highest)


def test_stats_aggregator_binary_type() -> None:
    """Verify StatsAggregator min/max serialization for BinaryType."""
    aggregator = StatsAggregator(BinaryType(), "BYTE_ARRAY")

    smallest = b"\x00\x01\x02"
    largest = b"\xff\xfe\xfd"
    aggregator.update_min(smallest)
    aggregator.update_max(largest)

    # Binary statistics pass through serialization unchanged.
    assert aggregator.min_as_bytes() == smallest
    assert aggregator.max_as_bytes() == largest


def test_bin_pack_arrow_table(arrow_table_with_null: pa.Table) -> None:
# default packs to 1 bin since the table is small
bin_packed = bin_pack_arrow_table(
Expand Down
12 changes: 12 additions & 0 deletions tests/test_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,3 +603,15 @@ def test_json_single_serialization(primitive_type: PrimitiveType, value: Any, ex
)
def test_json_serialize_roundtrip(primitive_type: PrimitiveType, value: Any) -> None:
assert value == conversions.from_json(primitive_type, conversions.to_json(primitive_type, value))


def test_string_type_to_bytes_with_str() -> None:
    """Verify to_bytes UTF-8 encodes a plain ASCII str for StringType."""
    assert conversions.to_bytes(StringType(), "hello") == b"hello"


def test_string_type_to_bytes_with_unicode() -> None:
    """Verify to_bytes UTF-8 encodes a non-ASCII unicode str for StringType."""
    encoded = conversions.to_bytes(StringType(), "héllo wörld")
    assert encoded == "héllo wörld".encode()