Commit bf99e21

Don't use load/store intrinsics (#185)
This is an interesting one! The remaining performance gap in QuState/PhastFT#58 seems to come from subpar performance when loading constants. I noticed that in Rust's `stdarch`, which defines all the SIMD intrinsics, the x86 load/store intrinsics lower to raw memory operations (`ptr::copy_nonoverlapping`). The AArch64 load/store intrinsics, on the other hand, *do* map to corresponding LLVM intrinsics! My hypothesis is that the LLVM intrinsics are not lowered until much later in the compilation pipeline, resulting in far fewer optimization opportunities and much worse codegen. If this is the case, we should just use memory operations directly. This also simplifies the generated code quite a bit.
1 parent d969e5f commit bf99e21
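
To illustrate the pattern the generator now emits (a minimal standalone sketch, not code from this crate), loading a NEON vector becomes a plain memory copy instead of a call to a load intrinsic such as `vld1q_f32`; on x86 this matches what `stdarch` already does internally for the unaligned load/store intrinsics:

// Hypothetical standalone example: load a NEON vector from an array using a
// plain memory copy instead of the `vld1q_f32` intrinsic.
#[cfg(target_arch = "aarch64")]
pub fn load_f32x4(val: &[f32; 4]) -> core::arch::aarch64::float32x4_t {
    // Safety: `float32x4_t` has the same layout as `[f32; 4]`, and
    // `transmute_copy` explicitly allows the source to have weaker alignment
    // than the destination type requires.
    unsafe { core::mem::transmute_copy(val) }
}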

File tree

9 files changed: +1218 -1388 lines changed


fearless_simd/src/generated/avx2.rs

Lines changed: 300 additions & 280 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/neon.rs

Lines changed: 324 additions & 108 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/sse4_2.rs

Lines changed: 276 additions & 444 deletions
Large diffs are not rendered by default.

fearless_simd/src/generated/wasm.rs

Lines changed: 276 additions & 444 deletions
Large diffs are not rendered by default.

fearless_simd_gen/src/arch/neon.rs

Lines changed: 0 additions & 22 deletions
@@ -98,28 +98,6 @@ pub(crate) fn simple_intrinsic(name: &str, ty: &VecType) -> Ident {
     )
 }
 
-fn memory_intrinsic(op: &str, ty: &VecType) -> Ident {
-    let (opt_q, scalar_c, size) = neon_array_type(ty);
-    let num_blocks = ty.n_bits() / 128;
-    let opt_count = if num_blocks > 1 {
-        format!("_x{num_blocks}")
-    } else {
-        String::new()
-    };
-    Ident::new(
-        &format!("{op}1{opt_q}_{scalar_c}{size}{opt_count}"),
-        Span::call_site(),
-    )
-}
-
-pub(crate) fn load_intrinsic(ty: &VecType) -> Ident {
-    memory_intrinsic("vld", ty)
-}
-
-pub(crate) fn store_intrinsic(ty: &VecType) -> Ident {
-    memory_intrinsic("vst", ty)
-}
-
 pub(crate) fn split_intrinsic(name: &str, name2: &str, ty: &VecType) -> Ident {
     let (opt_q, scalar_c, size) = neon_array_type(ty);
     Ident::new(

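For reference, the deleted `memory_intrinsic` helper composed NEON intrinsic names from the vector shape. A rough standalone sketch of the naming scheme (specialized to `f32`, for which `neon_array_type` presumably yields `("q", "f", 32)`; the real helper took a `VecType`):

// Hypothetical reconstruction of the removed naming logic for f32 vectors;
// `n_bits` is the total vector width in bits.
fn memory_intrinsic_name(op: &str, n_bits: usize) -> String {
    let num_blocks = n_bits / 128;
    let opt_count = if num_blocks > 1 {
        format!("_x{num_blocks}")
    } else {
        String::new()
    };
    format!("{op}1q_f32{opt_count}")
}

fn main() {
    assert_eq!(memory_intrinsic_name("vld", 128), "vld1q_f32");
    assert_eq!(memory_intrinsic_name("vld", 256), "vld1q_f32_x2");
    assert_eq!(memory_intrinsic_name("vst", 512), "vst1q_f32_x4");
}

These are the multi-register `vld1*_x{n}`/`vst1*_x{n}` intrinsics that the generated NEON code no longer calls.
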
fearless_simd_gen/src/generic.rs

Lines changed: 36 additions & 57 deletions
@@ -266,33 +266,31 @@ pub(crate) fn generic_block_combine(
 pub(crate) fn generic_from_array(
     method_sig: TokenStream,
     vec_ty: &VecType,
-    _kind: RefKind,
-    max_block_size: usize,
-    load_unaligned_block: impl Fn(&VecType) -> Ident,
+    kind: RefKind,
 ) -> TokenStream {
-    let block_size = max_block_size.min(vec_ty.n_bits());
-    let block_count = vec_ty.n_bits() / block_size;
-    let num_scalars_per_block = vec_ty.len / block_count;
-
-    let native_block_ty = VecType::new(
-        vec_ty.scalar,
-        vec_ty.scalar_bits,
-        block_size / vec_ty.scalar_bits,
-    );
-
-    let wrapper_ty = vec_ty.aligned_wrapper();
-    let load_unaligned = load_unaligned_block(&native_block_ty);
-    let expr = if block_count == 1 {
-        quote! {
-            unsafe { #wrapper_ty(#load_unaligned(val.as_ptr() as *const _)) }
-        }
+    let inner_ref = if kind == RefKind::Value {
+        quote! { &val }
     } else {
-        let blocks = (0..block_count).map(|n| n * num_scalars_per_block);
-        quote! {
-            unsafe { #wrapper_ty([
-                #(#load_unaligned(val.as_ptr().add(#blocks) as *const _)),*
-            ]) }
-        }
+        quote! { val }
+    };
+
+    // There are architecture-specific "load" intrinsics, but they can actually be *worse* for performance. If they
+    // lower to LLVM intrinsics, they will likely not be optimized until much later in the pipeline (if at all),
+    // resulting in substantially worse codegen. See https://github.com/linebender/fearless_simd/pull/185.
+    let expr = quote! {
+        // Safety: The native vector type backing any implementation will be:
+        // - A `#[repr(simd)]` type, which has the same layout as an array of scalars
+        // - An array of `#[repr(simd)]` types
+        // - For AArch64 specifically, a `#[repr(C)]` tuple of `#[repr(simd)]` types
+        //
+        // These all have the same layout as a flat array of the corresponding scalars. The native vector types probably
+        // have greater alignment requirements than the source array type we're copying from, but that's explicitly
+        // allowed by transmute_copy:
+        //
+        // > This function will unsafely assume the pointer src is valid for size_of::<Dst> bytes by transmuting &Src to
+        // > &Dst and then reading the &Dst **(except that this is done in a way that is correct even when &Dst has
+        // > stricter alignment requirements than &Src).**
+        unsafe { core::mem::transmute_copy(#inner_ref) }
     };
     let vec_rust = vec_ty.rust();
 
@@ -333,39 +331,20 @@ pub(crate) fn generic_as_array<T: ToTokens>(
     }
 }
 
-pub(crate) fn generic_store_array(
-    method_sig: TokenStream,
-    vec_ty: &VecType,
-    max_block_size: usize,
-    store_unaligned_block: impl Fn(&VecType) -> Ident,
-) -> TokenStream {
-    let block_size = max_block_size.min(vec_ty.n_bits());
-    let block_count = vec_ty.n_bits() / block_size;
-    let num_scalars_per_block = vec_ty.len / block_count;
-
-    let native_block_ty = VecType::new(
-        vec_ty.scalar,
-        vec_ty.scalar_bits,
-        block_size / vec_ty.scalar_bits,
-    );
+pub(crate) fn generic_store_array(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
+    let count = vec_ty.len;
 
-    let store_unaligned = store_unaligned_block(&native_block_ty);
-    let store_expr = if block_count == 1 {
-        quote! {
-            unsafe { #store_unaligned(dest.as_mut_ptr() as *mut _, a.val.0) }
-        }
-    } else {
-        let blocks = (0..block_count).map(|n| {
-            let offset = n * num_scalars_per_block;
-            let block_idx = proc_macro2::Literal::usize_unsuffixed(n);
-            quote! {
-                #store_unaligned(dest.as_mut_ptr().add(#offset) as *mut _, a.val.0[#block_idx])
-            }
-        });
-        quote! {
-            unsafe {
-                #(#blocks;)*
-            }
+    let store_expr = quote! {
+        unsafe {
+            // Copies `count` scalars from the backing type, which has the same layout as the destination array (see
+            // `generic_as_array`). The backing type is aligned to its own size, and the destination array must *by
+            // definition* be aligned to at least the alignment of the scalar.
+            core::ptr::copy_nonoverlapping(
+                (&raw const a.val.0) as *const #scalar_ty,
+                dest.as_mut_ptr(),
+                #count,
+            );
         }
     };
 
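To make the quoted fragments concrete, here is a rough sketch of what the generated `from_array`/`store_array` bodies might expand to for a 128-bit `f32` vector backed by NEON's `float32x4_t` (the wrapper type and free functions are hypothetical stand-ins, not the crate's real generated API):

#[cfg(target_arch = "aarch64")]
mod expansion_sketch {
    use core::arch::aarch64::float32x4_t;

    // Hypothetical stand-in for a generated wrapper type.
    pub struct F32x4(pub float32x4_t);

    // Roughly what the `from_array` expansion does in the by-value case.
    pub fn from_array(val: [f32; 4]) -> F32x4 {
        // Same layout as `[f32; 4]`; `transmute_copy` tolerates the weaker
        // alignment of the source array.
        F32x4(unsafe { core::mem::transmute_copy(&val) })
    }

    // Roughly what the `store_array` expansion does.
    pub fn store_array(a: &F32x4, dest: &mut [f32; 4]) {
        unsafe {
            // Copy four scalars out of the backing vector into the array.
            core::ptr::copy_nonoverlapping(
                (&raw const a.0) as *const f32,
                dest.as_mut_ptr(),
                4,
            );
        }
    }
}

Both expansions are ordinary memory operations, which (per the hypothesis in the commit message) LLVM can optimize much earlier than target-specific intrinsics.
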
fearless_simd_gen/src/mk_neon.rs

Lines changed: 2 additions & 11 deletions
@@ -4,7 +4,6 @@
 use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{ToTokens as _, format_ident, quote};
 
-use crate::arch::neon::{load_intrinsic, store_intrinsic};
 use crate::generic::{
     generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_store_array,
     generic_to_bytes,
@@ -462,21 +461,13 @@ impl Level for Neon {
                     }
                 }
             }
-            OpSig::FromArray { kind } => generic_from_array(
-                method_sig,
-                vec_ty,
-                kind,
-                self.max_block_size(),
-                load_intrinsic,
-            ),
+            OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
                     self.arch_ty(vec_ty)
                 })
             }
-            OpSig::StoreArray => {
-                generic_store_array(method_sig, vec_ty, self.max_block_size(), store_intrinsic)
-            }
+            OpSig::StoreArray => generic_store_array(method_sig, vec_ty),
             OpSig::FromBytes => generic_from_bytes(method_sig, vec_ty),
             OpSig::ToBytes => generic_to_bytes(method_sig, vec_ty),
         }

fearless_simd_gen/src/mk_wasm.rs

Lines changed: 2 additions & 10 deletions
@@ -613,21 +613,13 @@ impl Level for WasmSimd128 {
                     }
                 }
             }
-            OpSig::FromArray { kind } => {
-                generic_from_array(method_sig, vec_ty, kind, self.max_block_size(), |_| {
-                    v128_intrinsic("load")
-                })
-            }
+            OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |_| {
                     Ident::new("v128", Span::call_site())
                 })
             }
-            OpSig::StoreArray => {
-                generic_store_array(method_sig, vec_ty, self.max_block_size(), |_| {
-                    v128_intrinsic("store")
-                })
-            }
+            OpSig::StoreArray => generic_store_array(method_sig, vec_ty),
             OpSig::FromBytes => generic_from_bytes(method_sig, vec_ty),
             OpSig::ToBytes => generic_to_bytes(method_sig, vec_ty),
         }

fearless_simd_gen/src/mk_x86.rs

Lines changed: 2 additions & 12 deletions
@@ -170,23 +170,13 @@ impl Level for X86 {
                 block_size,
                 block_count,
             } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count),
-            OpSig::FromArray { kind } => generic_from_array(
-                method_sig,
-                vec_ty,
-                kind,
-                self.max_block_size(),
-                |block_ty| intrinsic_ident("loadu", coarse_type(block_ty), block_ty.n_bits()),
-            ),
+            OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
                     self.arch_ty(vec_ty)
                 })
             }
-            OpSig::StoreArray => {
-                generic_store_array(method_sig, vec_ty, self.max_block_size(), |block_ty| {
-                    intrinsic_ident("storeu", coarse_type(block_ty), block_ty.n_bits())
-                })
-            }
+            OpSig::StoreArray => generic_store_array(method_sig, vec_ty),
             OpSig::FromBytes => generic_from_bytes(method_sig, vec_ty),
             OpSig::ToBytes => generic_to_bytes(method_sig, vec_ty),
         }
