Skip to content

Commit 690f6b4

Browse files
committed
perf: Optimize lpad, rpad for ASCII strings
The previous implementation incurred the overhead of Unicode machinery, even for the common case that both the input string and the fill string consist only of ASCII characters. For the ASCII-only case, we can assume that the length in bytes equals the length in characters, and avoid expensive grapheme-based segmentation. This follows similar optimizations applied elsewhere in the codebase. Benchmarks indicate this is a significant performance win for ASCII-only input (4x-10x faster) but only a mild regression for Unicode input (2-5% slower). Along the way: * Combine a few instances of `write_str(str)?` + `append_value("")` into a single `append_value(str)`, which saves a few cycles * Add a missing test case for truncating the input string * Add benchmarks for Unicode input
1 parent b790ed3 commit 690f6b4

File tree

3 files changed

+323
-49
lines changed

3 files changed

+323
-49
lines changed

datafusion/functions/benches/pad.rs

Lines changed: 181 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717

1818
extern crate criterion;
1919

20-
use arrow::array::{ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
20+
use arrow::array::{
21+
ArrowPrimitiveType, GenericStringBuilder, OffsetSizeTrait, PrimitiveArray,
22+
StringViewBuilder,
23+
};
2124
use arrow::datatypes::{DataType, Field, Int64Type};
2225
use arrow::util::bench_util::{
2326
create_string_array_with_len, create_string_view_array_with_len,
@@ -32,6 +35,51 @@ use std::hint::black_box;
3235
use std::sync::Arc;
3336
use std::time::Duration;
3437

38+
const UNICODE_STRINGS: &[&str] = &[
39+
"Ñandú",
40+
"Íslensku",
41+
"Þjóðarinnar",
42+
"Ελληνική",
43+
"Иванович",
44+
"データフュージョン",
45+
"José García",
46+
"Ölçü bïrïmï",
47+
"Ÿéšṱëṟḏàÿ",
48+
"Ährenstraße",
49+
];
50+
51+
fn create_unicode_string_array<O: OffsetSizeTrait>(
52+
size: usize,
53+
null_density: f32,
54+
) -> arrow::array::GenericStringArray<O> {
55+
let mut rng = rand::rng();
56+
let mut builder = GenericStringBuilder::<O>::new();
57+
for i in 0..size {
58+
if rng.random::<f32>() < null_density {
59+
builder.append_null();
60+
} else {
61+
builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
62+
}
63+
}
64+
builder.finish()
65+
}
66+
67+
fn create_unicode_string_view_array(
68+
size: usize,
69+
null_density: f32,
70+
) -> arrow::array::StringViewArray {
71+
let mut rng = rand::rng();
72+
let mut builder = StringViewBuilder::with_capacity(size);
73+
for i in 0..size {
74+
if rng.random::<f32>() < null_density {
75+
builder.append_null();
76+
} else {
77+
builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]);
78+
}
79+
}
80+
builder.finish()
81+
}
82+
3583
struct Filter<Dist> {
3684
dist: Dist,
3785
}
@@ -69,6 +117,34 @@ where
69117
.collect()
70118
}
71119

120+
/// Create args for pad benchmark with Unicode strings
121+
fn create_unicode_pad_args(
122+
size: usize,
123+
target_len: usize,
124+
use_string_view: bool,
125+
) -> Vec<ColumnarValue> {
126+
let length_array =
127+
Arc::new(create_primitive_array::<Int64Type>(size, 0.0, target_len));
128+
129+
if use_string_view {
130+
let string_array = create_unicode_string_view_array(size, 0.1);
131+
let fill_array = create_unicode_string_view_array(size, 0.1);
132+
vec![
133+
ColumnarValue::Array(Arc::new(string_array)),
134+
ColumnarValue::Array(length_array),
135+
ColumnarValue::Array(Arc::new(fill_array)),
136+
]
137+
} else {
138+
let string_array = create_unicode_string_array::<i32>(size, 0.1);
139+
let fill_array = create_unicode_string_array::<i32>(size, 0.1);
140+
vec![
141+
ColumnarValue::Array(Arc::new(string_array)),
142+
ColumnarValue::Array(length_array),
143+
ColumnarValue::Array(Arc::new(fill_array)),
144+
]
145+
}
146+
}
147+
72148
/// Create args for pad benchmark
73149
fn create_pad_args<O: OffsetSizeTrait>(
74150
size: usize,
@@ -210,6 +286,58 @@ fn criterion_benchmark(c: &mut Criterion) {
210286
},
211287
);
212288

289+
// Utf8 type with Unicode strings
290+
let args = create_unicode_pad_args(size, 20, false);
291+
let arg_fields = args
292+
.iter()
293+
.enumerate()
294+
.map(|(idx, arg)| {
295+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
296+
})
297+
.collect::<Vec<_>>();
298+
299+
group.bench_function(
300+
format!("lpad utf8 unicode [size={size}, target=20]"),
301+
|b| {
302+
b.iter(|| {
303+
let args_cloned = args.clone();
304+
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
305+
args: args_cloned,
306+
arg_fields: arg_fields.clone(),
307+
number_rows: size,
308+
return_field: Field::new("f", DataType::Utf8, true).into(),
309+
config_options: Arc::clone(&config_options),
310+
}))
311+
})
312+
},
313+
);
314+
315+
// StringView type with Unicode strings
316+
let args = create_unicode_pad_args(size, 20, true);
317+
let arg_fields = args
318+
.iter()
319+
.enumerate()
320+
.map(|(idx, arg)| {
321+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
322+
})
323+
.collect::<Vec<_>>();
324+
325+
group.bench_function(
326+
format!("lpad stringview unicode [size={size}, target=20]"),
327+
|b| {
328+
b.iter(|| {
329+
let args_cloned = args.clone();
330+
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
331+
args: args_cloned,
332+
arg_fields: arg_fields.clone(),
333+
number_rows: size,
334+
return_field: Field::new("f", DataType::Utf8View, true).into(),
335+
config_options: Arc::clone(&config_options),
336+
}))
337+
})
338+
},
339+
);
340+
213341
group.finish();
214342
}
215343

@@ -324,6 +452,58 @@ fn criterion_benchmark(c: &mut Criterion) {
324452
},
325453
);
326454

455+
// Utf8 type with Unicode strings
456+
let args = create_unicode_pad_args(size, 20, false);
457+
let arg_fields = args
458+
.iter()
459+
.enumerate()
460+
.map(|(idx, arg)| {
461+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
462+
})
463+
.collect::<Vec<_>>();
464+
465+
group.bench_function(
466+
format!("rpad utf8 unicode [size={size}, target=20]"),
467+
|b| {
468+
b.iter(|| {
469+
let args_cloned = args.clone();
470+
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
471+
args: args_cloned,
472+
arg_fields: arg_fields.clone(),
473+
number_rows: size,
474+
return_field: Field::new("f", DataType::Utf8, true).into(),
475+
config_options: Arc::clone(&config_options),
476+
}))
477+
})
478+
},
479+
);
480+
481+
// StringView type with Unicode strings
482+
let args = create_unicode_pad_args(size, 20, true);
483+
let arg_fields = args
484+
.iter()
485+
.enumerate()
486+
.map(|(idx, arg)| {
487+
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
488+
})
489+
.collect::<Vec<_>>();
490+
491+
group.bench_function(
492+
format!("rpad stringview unicode [size={size}, target=20]"),
493+
|b| {
494+
b.iter(|| {
495+
let args_cloned = args.clone();
496+
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
497+
args: args_cloned,
498+
arg_fields: arg_fields.clone(),
499+
number_rows: size,
500+
return_field: Field::new("f", DataType::Utf8View, true).into(),
501+
config_options: Arc::clone(&config_options),
502+
}))
503+
})
504+
},
505+
);
506+
327507
group.finish();
328508
}
329509
}

datafusion/functions/src/unicode/lpad.rs

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@ use datafusion_macros::user_doc;
4949
+---------------------------------------------+
5050
```"#,
5151
standard_argument(name = "str", prefix = "String"),
52-
argument(name = "n", description = "String length to pad to."),
52+
argument(
53+
name = "n",
54+
description = "String length to pad to. If the input string is longer than this length, it is truncated (on the right)."
55+
),
5356
argument(
5457
name = "padding_str",
5558
description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._"
@@ -225,24 +228,47 @@ where
225228
continue;
226229
}
227230

228-
// Reuse buffers by clearing and refilling
229-
graphemes_buf.clear();
230-
graphemes_buf.extend(string.graphemes(true));
231-
232-
fill_chars_buf.clear();
233-
fill_chars_buf.extend(fill.chars());
234-
235-
if length < graphemes_buf.len() {
236-
builder.append_value(graphemes_buf[..length].concat());
237-
} else if fill_chars_buf.is_empty() {
238-
builder.append_value(string);
231+
if string.is_ascii() && fill.is_ascii() {
232+
// ASCII fast path: byte length == character length,
233+
// so we skip expensive grapheme segmentation.
234+
let str_len = string.len();
235+
if length < str_len {
236+
builder.append_value(&string[..length]);
237+
} else if fill.is_empty() {
238+
builder.append_value(string);
239+
} else {
240+
let pad_len = length - str_len;
241+
let fill_len = fill.len();
242+
let full_reps = pad_len / fill_len;
243+
let remainder = pad_len % fill_len;
244+
for _ in 0..full_reps {
245+
builder.write_str(fill)?;
246+
}
247+
if remainder > 0 {
248+
builder.write_str(&fill[..remainder])?;
249+
}
250+
builder.append_value(string);
251+
}
239252
} else {
240-
for l in 0..length - graphemes_buf.len() {
241-
let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
242-
builder.write_char(c)?;
253+
// Reuse buffers by clearing and refilling
254+
graphemes_buf.clear();
255+
graphemes_buf.extend(string.graphemes(true));
256+
257+
fill_chars_buf.clear();
258+
fill_chars_buf.extend(fill.chars());
259+
260+
if length < graphemes_buf.len() {
261+
builder.append_value(graphemes_buf[..length].concat());
262+
} else if fill_chars_buf.is_empty() {
263+
builder.append_value(string);
264+
} else {
265+
for l in 0..length - graphemes_buf.len() {
266+
let c =
267+
*fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
268+
builder.write_char(c)?;
269+
}
270+
builder.append_value(string);
243271
}
244-
builder.write_str(string)?;
245-
builder.append_value("");
246272
}
247273
} else {
248274
builder.append_null();
@@ -266,17 +292,28 @@ where
266292
continue;
267293
}
268294

269-
// Reuse buffer by clearing and refilling
270-
graphemes_buf.clear();
271-
graphemes_buf.extend(string.graphemes(true));
272-
273-
if length < graphemes_buf.len() {
274-
builder.append_value(graphemes_buf[..length].concat());
295+
if string.is_ascii() {
296+
// ASCII fast path: byte length == character length
297+
let str_len = string.len();
298+
if length < str_len {
299+
builder.append_value(&string[..length]);
300+
} else {
301+
builder.write_str(" ".repeat(length - str_len).as_str())?;
302+
builder.append_value(string);
303+
}
275304
} else {
276-
builder
277-
.write_str(" ".repeat(length - graphemes_buf.len()).as_str())?;
278-
builder.write_str(string)?;
279-
builder.append_value("");
305+
// Reuse buffer by clearing and refilling
306+
graphemes_buf.clear();
307+
graphemes_buf.extend(string.graphemes(true));
308+
309+
if length < graphemes_buf.len() {
310+
builder.append_value(graphemes_buf[..length].concat());
311+
} else {
312+
builder.write_str(
313+
" ".repeat(length - graphemes_buf.len()).as_str(),
314+
)?;
315+
builder.append_value(string);
316+
}
280317
}
281318
} else {
282319
builder.append_null();
@@ -523,6 +560,11 @@ mod tests {
523560
None,
524561
Ok(None)
525562
);
563+
test_lpad!(
564+
Some("hello".into()),
565+
ScalarValue::Int64(Some(3i64)),
566+
Ok(Some("hel"))
567+
);
526568
test_lpad!(
527569
Some("josé".into()),
528570
ScalarValue::Int64(Some(10i64)),

0 commit comments

Comments
 (0)