Skip to content

Commit 55761ce

Browse files
committed
Add "slide" operation
1 parent b223166 commit 55761ce

File tree

20 files changed

+8110
-16
lines changed

20 files changed

+8110
-16
lines changed

fearless_simd/src/generated/avx2.rs

Lines changed: 1270 additions & 4 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/fallback.rs

Lines changed: 708 additions & 0 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/neon.rs

Lines changed: 1791 additions & 0 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/simd_trait.rs

Lines changed: 357 additions & 0 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/simd_types.rs

Lines changed: 360 additions & 0 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/sse4_2.rs

Lines changed: 1141 additions & 0 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/wasm.rs

Lines changed: 1155 additions & 0 deletions
Large diffs are not rendered by default.

fearless_simd/src/support.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,24 @@ pub(crate) fn simd_debug_impl<Element: core::fmt::Debug>(
4242
.field("simd", token)
4343
.finish()
4444
}
45+
46+
/// Selects the two input blocks to be fed to `alignr`/`vext`/etc. when computing a single output block of a
/// cross-block "slide" operation.
///
/// `a` and `b` are treated as one concatenated sequence of `2 * N` blocks, `[a : b]`, where every block is assumed to
/// be `BLOCK_BYTES` (16) bytes wide. The output block at `out_idx` starts `out_idx * 16 + shift_bytes` bytes into that
/// sequence; the block containing that byte offset and the block immediately after it are returned, in that order.
///
/// # Panics
///
/// Panics if the second returned block would lie past the end of the concatenation, i.e. if
/// `(out_idx * 16 + shift_bytes) / 16 + 1 >= 2 * N`.
#[inline(always)]
#[allow(clippy::allow_attributes, reason = "Only needed in some cfgs.")]
#[allow(dead_code, reason = "Only used in some cfgs.")]
pub(crate) fn cross_block_slide_blocks_at<const N: usize, Block: Copy>(
    a: &[Block; N],
    b: &[Block; N],
    out_idx: usize,
    shift_bytes: usize,
) -> [Block; 2] {
    const BLOCK_BYTES: usize = 16;
    // Concatenation is [a : b], so indices 0..N are from a, indices N..2N are from b.
    let block_at = |idx: usize| if idx < N { a[idx] } else { b[idx - N] };
    // Index of the block that contains the first byte of the requested output block.
    let lo_idx = (out_idx * BLOCK_BYTES + shift_bytes) / BLOCK_BYTES;
    [block_at(lo_idx), block_at(lo_idx + 1)]
}

fearless_simd_dev_macros/src/lib.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
8080
fn #sse4_name() {
8181
if std::arch::is_x86_feature_detected!("sse4.2") {
8282
let sse4 = unsafe { fearless_simd::x86::Sse4_2::new_unchecked() };
83-
#input_fn_name(sse4);
83+
sse4.vectorize(
84+
#[inline(always)]
85+
|| #input_fn_name(sse4)
86+
);
8487
}
8588
}
8689
};
@@ -94,7 +97,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
9497
&& std::arch::is_x86_feature_detected!("fma")
9598
{
9699
let avx2 = unsafe { fearless_simd::x86::Avx2::new_unchecked() };
97-
#input_fn_name(avx2);
100+
avx2.vectorize(
101+
#[inline(always)]
102+
|| #input_fn_name(avx2)
103+
);
98104
}
99105
}
100106
};
@@ -110,6 +116,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream {
110116
};
111117

112118
quote! {
119+
#[inline(always)]
113120
#input_fn
114121

115122
#fallback_snippet

fearless_simd_gen/src/generic.rs

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use proc_macro2::{Ident, Span, TokenStream};
55
use quote::{format_ident, quote};
66

77
use crate::{
8-
ops::{OpSig, RefKind},
8+
ops::{OpSig, RefKind, SlideGranularity},
99
types::{SIMD_TYPES, ScalarType, VecType},
1010
};
1111

@@ -234,6 +234,32 @@ pub(crate) fn generic_op(op: &str, sig: OpSig, ty: &VecType) -> TokenStream {
234234
}
235235
OpSig::FromBytes => generic_from_bytes(method_sig, ty),
236236
OpSig::ToBytes => generic_to_bytes(method_sig, ty),
237+
OpSig::Slide { granularity, .. } => {
238+
match (granularity, ty.n_bits()) {
239+
(SlideGranularity::WithinBlocks, 128) => {
240+
// If this operation is done on a 128-bit vector type, the "within blocks" method is identical to the
241+
// non-within-blocks one, so just defer to that.
242+
let non_blockwise = generic_op_name("slide", ty);
243+
quote! {
244+
#method_sig {
245+
self.#non_blockwise::<SHIFT>(a, b)
246+
}
247+
}
248+
}
249+
(SlideGranularity::WithinBlocks, _) => {
250+
quote! {
251+
#method_sig {
252+
let (a0, a1) = self.#split(a);
253+
let (b0, b1) = self.#split(b);
254+
self.#combine(self.#do_half::<SHIFT>(a0, b0), self.#do_half::<SHIFT>(a1, b1))
255+
}
256+
}
257+
}
258+
_ => {
259+
panic!("Item-wise shifts across blocks cannot be done via split/combine");
260+
}
261+
}
262+
}
237263
}
238264
}
239265

0 commit comments

Comments
 (0)