Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/functions/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ hex = { workspace = true, optional = true }
itertools = { workspace = true }
log = { workspace = true }
md-5 = { version = "^0.10.0", optional = true }
memchr = "2.8.0"
num-traits = { workspace = true }
rand = { workspace = true }
regex = { workspace = true, optional = true }
Expand Down
9 changes: 6 additions & 3 deletions datafusion/functions/benches/strpos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ use std::hint::black_box;
use std::str::Chars;
use std::sync::Arc;

/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
/// 4096 rows, each row containing a string with 128 random characters.
/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
/// Returns a `Vec<ColumnarValue>` with two elements: a haystack array and a
/// needle array. Each haystack is a random string of `str_len_chars`
/// characters. Each needle is a random contiguous substring of its
/// corresponding haystack (i.e., the needle is always present in the haystack).
/// Around `null_density` fraction of rows are null and `utf8_density` fraction
/// contain non-ASCII characters; the remaining rows are ASCII-only.
fn gen_string_array(
n_rows: usize,
str_len_chars: usize,
Expand Down
68 changes: 41 additions & 27 deletions datafusion/functions/src/unicode/strpos.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ use datafusion_expr::{
Volatility,
};
use datafusion_macros::user_doc;
use memchr::memchr;

#[user_doc(
doc_section(label = "String Functions"),
Expand Down Expand Up @@ -179,6 +180,31 @@ fn strpos(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}

/// Find `needle` in `haystack` using `memchr` to quickly skip to positions
/// where the first byte matches, then verify the remaining bytes. Using
/// `str::find` is slower because it has significant per-call overhead that
/// `memchr` does not, and strpos is often invoked many times on short inputs.
/// Returns a 1-based position, or 0 if not found.
/// Both inputs must be ASCII-only.
fn find_ascii_substring(haystack: &[u8], needle: &[u8]) -> usize {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible to use memchr::memmem::find() directly? Based on the Complexity section of its documentation, it seems to implement the same algorithm.
https://docs.rs/memchr/latest/memchr/memmem/fn.find.html

Copy link
Contributor Author

@neilconway neilconway Feb 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion! When I tried using memmem::find(), it was substantially slower -- presumably because it incurs some per-call overhead (I'd imagine setting up lookup tables etc.) that memchr does not.

I'd like to explore optimizing the (common) case where strpos() is invoked with a constant substring; in that case we could construct a memmem::Finder once, and use it for the entire input batch. But this PR is already a significant win, so my thought was to defer that to a subsequent PR.

let needle_len = needle.len();
let first_byte = needle[0];
let mut offset = 0;

while let Some(pos) = memchr(first_byte, &haystack[offset..]) {
let start = offset + pos;
if start + needle_len > haystack.len() {
return 0;
}
if haystack[start..start + needle_len] == *needle {
return start + 1;
}
offset = start + 1;
}

0
}

/// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)
/// strpos('high', 'ig') = 2
/// The implementation uses UTF-8 code points as characters
Expand All @@ -198,37 +224,25 @@ where
.zip(substring_iter)
.map(|(string, substring)| match (string, substring) {
(Some(string), Some(substring)) => {
// If only ASCII characters are present, we can use the slide window method to find
// the sub vector in the main vector. This is faster than string.find() method.
if substring.is_empty() {
return T::Native::from_usize(1);
}

let substring_bytes = substring.as_bytes();
let string_bytes = string.as_bytes();

if substring_bytes.len() > string_bytes.len() {
return T::Native::from_usize(0);
}

if ascii_only {
// If the substring is empty, the result is 1.
if substring.is_empty() {
T::Native::from_usize(1)
} else {
T::Native::from_usize(
string
.as_bytes()
.windows(substring.len())
.position(|w| w == substring.as_bytes())
.map(|x| x + 1)
.unwrap_or(0),
)
}
T::Native::from_usize(find_ascii_substring(
string_bytes,
substring_bytes,
))
} else {
// For non-ASCII, use a single-pass search that tracks both
// byte position and character position simultaneously
if substring.is_empty() {
return T::Native::from_usize(1);
}

let substring_bytes = substring.as_bytes();
let string_bytes = string.as_bytes();

if substring_bytes.len() > string_bytes.len() {
return T::Native::from_usize(0);
}

// Single pass: find substring while counting characters
let mut char_pos = 0;
for (byte_idx, _) in string.char_indices() {
char_pos += 1;
Expand Down