From 8304ea80da2d89bc058acc5045b711bf50cb238a Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 25 Feb 2026 11:37:50 -0500 Subject: [PATCH] resive FSST compressor buffer to be large enough for largest string Signed-off-by: Andrew Duffy --- encodings/fsst/src/compress.rs | 51 ++++++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/encodings/fsst/src/compress.rs b/encodings/fsst/src/compress.rs index 1dbf0dcf976..4230ac34520 100644 --- a/encodings/fsst/src/compress.rs +++ b/encodings/fsst/src/compress.rs @@ -52,6 +52,10 @@ where Compressor::train(&lines) } +/// Most strings are small in practice. If we encounter a larger string, we reallocate +/// the buffer to hold enough capacity for the worst-case compressed value. +const DEFAULT_BUFFER_LEN: usize = 1024 * 1024; + /// Compress from an iterator of bytestrings using FSST. pub fn fsst_compress_iter<'a, I>( iter: I, @@ -62,8 +66,7 @@ pub fn fsst_compress_iter<'a, I>( where I: Iterator>, { - // TODO(aduffy): this might be too small. - let mut buffer = Vec::with_capacity(16 * 1024 * 1024); + let mut buffer = Vec::with_capacity(DEFAULT_BUFFER_LEN); let mut builder = VarBinBuilder::::with_capacity(len); let mut uncompressed_lengths: BufferMut = BufferMut::with_capacity(len); for string in iter { @@ -79,7 +82,14 @@ where .vortex_expect("string length must fit in i32"), ); - // SAFETY: buffer is large enough + // make sure the buffer is 2x+7 larger than the input + let target_size = 2 * s.len() + 7; + if target_size > buffer.len() { + let additional_capacity = target_size - buffer.len(); + buffer.reserve(additional_capacity); + } + + // SAFETY: buffer is always sized to be large enough unsafe { compressor.compress_into(s, &mut buffer) }; builder.append_value(&buffer); @@ -96,3 +106,38 @@ where FSSTArray::try_new(dtype, symbols, symbol_lengths, codes, uncompressed_lengths) .vortex_expect("building FSSTArray from parts") } + +#[cfg(test)] +mod tests { + use fsst::CompressorBuilder; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::scalar::Scalar; + + use crate::compress::DEFAULT_BUFFER_LEN; + use crate::fsst_compress_iter; + + #[test] + fn test_large_string() { + let big_string: String = "abc" + .chars() + .cycle() + .take(10 * DEFAULT_BUFFER_LEN) + .collect(); + + let compressor = CompressorBuilder::default().build(); + + let compressed = fsst_compress_iter( + [Some(big_string.as_bytes())].into_iter(), + 1, + DType::Utf8(Nullability::NonNullable), + &compressor, + ); + + let decoded = compressed.scalar_at(0).unwrap(); + + let expected = Scalar::utf8(big_string, Nullability::NonNullable); + + assert_eq!(decoded, expected); + } +}