diff --git a/datafusion/physical-expr-common/src/binary_map.rs b/datafusion/physical-expr-common/src/binary_map.rs index ad184d6500d56..5ebf86e3eba82 100644 --- a/datafusion/physical-expr-common/src/binary_map.rs +++ b/datafusion/physical-expr-common/src/binary_map.rs @@ -244,10 +244,12 @@ where V: Debug + PartialEq + Eq + Clone + Copy + Default, { pub fn new(output_type: OutputType) -> Self { + let map = hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY); + let map_size = map.allocation_size(); Self { output_type, - map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY), - map_size: 0, + map, + map_size, buffer: BufferBuilder::new(INITIAL_BUFFER_CAPACITY), offsets: vec![O::default()], // first offset is always 0 random_state: RandomState::default(), @@ -874,6 +876,47 @@ mod tests { assert!(size_after_values2 > total_strings1_len + total_strings2_len); } + /// Verify that `size()` accounts for the initial hash table allocation + /// from `HashTable::with_capacity(INITIAL_MAP_CAPACITY)`. + /// + /// Regression test: `map_size` was previously initialized to 0 despite + /// the hash table pre-allocating memory, causing `size()` to undercount + /// until the first hash table resize. + #[test] + fn test_size_accounts_for_initial_hash_table_allocation() { + let set = ArrowBytesSet::::new(OutputType::Utf8); + let initial_size = set.size(); + + // Compute the exact allocation for a HashTable with the same + // capacity and entry type used by ArrowBytesMap + let expected_ht_alloc = + hashbrown::hash_table::HashTable::>::with_capacity( + INITIAL_MAP_CAPACITY, + ) + .allocation_size(); + + // The hash table must allocate non-trivial memory for 128 entries + assert!( + expected_ht_alloc > 0, + "hash table should allocate memory for {INITIAL_MAP_CAPACITY} entries" + ); + + // size() = map_size + buffer_capacity + offsets + hashes_buffer + // At creation the non-map components are: + // buffer: INITIAL_BUFFER_CAPACITY bytes + // offsets: vec with 1 element + // hashes: empty vec (0 bytes) + // + // Before the fix (map_size=0), size() ≈ INITIAL_BUFFER_CAPACITY + sizeof(i32), + // missing the hash table allocation entirely. + // After the fix, size() includes expected_ht_alloc as well. + assert!( + initial_size >= expected_ht_alloc + INITIAL_BUFFER_CAPACITY, + "size() ({initial_size}) should include both hash table allocation \ + ({expected_ht_alloc}) and buffer capacity ({INITIAL_BUFFER_CAPACITY})" + ); + } + #[test] fn test_map() { let input = vec![ diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs index abc3e28f82627..88e67c3c66145 100644 --- a/datafusion/physical-expr-common/src/binary_view_map.rs +++ b/datafusion/physical-expr-common/src/binary_view_map.rs @@ -155,10 +155,12 @@ where V: Debug + PartialEq + Eq + Clone + Copy + Default, { pub fn new(output_type: OutputType) -> Self { + let map = hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY); + let map_size = map.allocation_size(); Self { output_type, - map: hashbrown::hash_table::HashTable::with_capacity(INITIAL_MAP_CAPACITY), - map_size: 0, + map, + map_size, views: Vec::new(), in_progress: Vec::new(), completed: Vec::new(), @@ -678,6 +680,43 @@ mod tests { assert_eq!(set.len(), 10); } + /// Verify that `size()` accounts for the initial hash table allocation + /// from `HashTable::with_capacity(INITIAL_MAP_CAPACITY)`. + /// + /// Regression test: `map_size` was previously initialized to 0 despite + /// the hash table pre-allocating memory, causing `size()` to undercount + /// until the first hash table resize. + #[test] + fn test_size_accounts_for_initial_hash_table_allocation() { + let set = ArrowBytesViewSet::new(OutputType::Utf8View); + let initial_size = set.size(); + + // Compute the exact allocation for a HashTable with the same + // capacity and entry type used by ArrowBytesViewMap + let expected_ht_alloc = + hashbrown::hash_table::HashTable::>::with_capacity( + INITIAL_MAP_CAPACITY, + ) + .allocation_size(); + + assert!( + expected_ht_alloc > 0, + "hash table should allocate memory for {INITIAL_MAP_CAPACITY} entries" + ); + + // For ArrowBytesViewMap, all non-map components (views, in_progress, + // completed, nulls, hashes_buffer) start empty, so: + // size() = map_size + 0 + // + // Before the fix (map_size=0), size() was 0 at creation, + // missing the hash table allocation entirely. + // After the fix, size() == expected_ht_alloc. + assert!( + initial_size >= expected_ht_alloc, + "size() ({initial_size}) should include hash table allocation ({expected_ht_alloc})" + ); + } + #[derive(Debug, PartialEq, Eq, Default, Clone, Copy)] struct TestPayload { // store the string value to check against input