
Commit 950c4db

Add f64<->f32 widening/narrowing conversions
1 parent 3688f88 · commit 950c4db

13 files changed (+548, -195 lines)

13 files changed

+548
-195
lines changed
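
The commit adds widen_f32x4/widen_f32x8 and narrow_f64x4/narrow_f64x8 to the Simd trait and implements them for the Avx2, Fallback, and Neon backends excerpted below. A minimal caller sketch using only the new trait methods follows; the fearless_simd::{Simd, f32x4} import path is an assumption about the crate's public re-exports, not something this commit shows.

// Hedged usage sketch, not code from this commit.
use fearless_simd::{Simd, f32x4};

fn round_trip<S: Simd>(simd: S, a: f32x4<S>) -> f32x4<S> {
    // widen_f32x4 is lossless (every f32 is exactly representable as f64);
    // narrow_f64x4 rounds back, so this returns `a` unchanged.
    let wide = simd.widen_f32x4(a);
    simd.narrow_f64x4(wide)
}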

fearless_simd/src/generated/avx2.rs

Lines changed: 38 additions & 12 deletions
@@ -178,6 +178,10 @@ impl Simd for Avx2 {
         unsafe { _mm256_setr_m128(a.into(), b.into()).simd_into(self) }
     }
     #[inline(always)]
+    fn widen_f32x4(self, a: f32x4<Self>) -> f64x4<Self> {
+        unsafe { _mm256_cvtps_pd(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
         f64x2 {
             val: bytemuck::cast(a.val),
@@ -1449,6 +1453,15 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn widen_f32x8(self, a: f32x8<Self>) -> f64x8<Self> {
+        unsafe {
+            let (a0, a1) = self.split_f32x8(a);
+            let high = _mm256_cvtps_pd(a0.into()).simd_into(self);
+            let low = _mm256_cvtps_pd(a1.into()).simd_into(self);
+            self.combine_f64x4(high, low)
+        }
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
         f64x4 {
             val: bytemuck::cast(a.val),
@@ -2818,6 +2831,10 @@ impl Simd for Avx2 {
         }
     }
     #[inline(always)]
+    fn narrow_f64x4(self, a: f64x4<Self>) -> f32x4<Self> {
+        unsafe { _mm256_cvtpd_ps(a.into()).simd_into(self) }
+    }
+    #[inline(always)]
     fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
         f32x8 {
             val: bytemuck::cast(a.val),
@@ -3052,6 +3069,18 @@ impl Simd for Avx2 {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        crate::Fallback::new()
+            .load_interleaved_128_f32x16(src)
+            .val
+            .simd_into(self)
+    }
+    #[inline(always)]
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        let fb = crate::Fallback::new();
+        fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest);
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
         let (a0, a1) = self.split_f32x16(a);
         self.combine_f64x4(
@@ -3068,18 +3097,6 @@ impl Simd for Avx2 {
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        crate::Fallback::new()
-            .load_interleaved_128_f32x16(src)
-            .val
-            .simd_into(self)
-    }
-    #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        let fb = crate::Fallback::new();
-        fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest);
-    }
-    #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
         let (a0, a1) = self.split_f32x16(a);
         self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
@@ -4484,6 +4501,15 @@ impl Simd for Avx2 {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn narrow_f64x8(self, a: f64x8<Self>) -> f32x8<Self> {
+        let (a, b) = self.split_f64x8(a);
+        unsafe {
+            let lo = _mm256_cvtpd_ps(a.into());
+            let hi = _mm256_cvtpd_ps(b.into());
+            _mm256_setr_m128(lo, hi).simd_into(self)
+        }
+    }
+    #[inline(always)]
     fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
         let (a0, a1) = self.split_f64x8(a);
         self.combine_f32x8(
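
Side note on the AVX2 strategy above (context, not part of the diff): the 4-lane conversions map one-to-one onto _mm256_cvtps_pd and _mm256_cvtpd_ps, while the 8-lane variants split into halves and stitch the results back together with _mm256_setr_m128. A standalone std::arch sketch of the intrinsic pair, independent of this crate:

// Hedged illustration using plain std::arch on x86_64; not fearless_simd code.
#[cfg(target_arch = "x86_64")]
fn avx_widen_narrow_demo() {
    if is_x86_feature_detected!("avx") {
        use std::arch::x86_64::*;
        unsafe {
            let a = _mm_setr_ps(1.5, 2.5, 3.5, 4.5);
            let wide = _mm256_cvtps_pd(a);    // four f32 lanes -> four f64 lanes
            let back = _mm256_cvtpd_ps(wide); // four f64 lanes -> four f32 lanes
            let mut out = [0.0f32; 4];
            _mm_storeu_ps(out.as_mut_ptr(), back);
            assert_eq!(out, [1.5, 2.5, 3.5, 4.5]);
        }
    }
}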

fearless_simd/src/generated/fallback.rs

Lines changed: 46 additions & 16 deletions
@@ -338,6 +338,16 @@ impl Simd for Fallback {
         result.simd_into(self)
     }
     #[inline(always)]
+    fn widen_f32x4(self, a: f32x4<Self>) -> f64x4<Self> {
+        [
+            a[0usize] as f64,
+            a[1usize] as f64,
+            a[2usize] as f64,
+            a[3usize] as f64,
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
         f64x2 {
             val: bytemuck::cast(a.val),
@@ -3251,6 +3261,11 @@ impl Simd for Fallback {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn widen_f32x8(self, a: f32x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1))
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
         let (a0, a1) = self.split_f32x8(a);
         self.combine_f64x2(
@@ -4684,6 +4699,16 @@ impl Simd for Fallback {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn narrow_f64x4(self, a: f64x4<Self>) -> f32x4<Self> {
+        [
+            a[0usize] as f32,
+            a[1usize] as f32,
+            a[2usize] as f32,
+            a[3usize] as f32,
+        ]
+        .simd_into(self)
+    }
+    #[inline(always)]
     fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
         let (a0, a1) = self.split_f64x4(a);
         self.combine_f32x4(
@@ -4934,22 +4959,6 @@ impl Simd for Fallback {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_f64x4(
-            self.reinterpret_f64_f32x8(a0),
-            self.reinterpret_f64_f32x8(a1),
-        )
-    }
-    #[inline(always)]
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
-        let (a0, a1) = self.split_f32x16(a);
-        self.combine_i32x8(
-            self.reinterpret_i32_f32x8(a0),
-            self.reinterpret_i32_f32x8(a1),
-        )
-    }
-    #[inline(always)]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
         [
             src[0usize],
@@ -4980,6 +4989,22 @@ impl Simd for Fallback {
         ];
     }
     #[inline(always)]
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_f64x4(
+            self.reinterpret_f64_f32x8(a0),
+            self.reinterpret_f64_f32x8(a1),
+        )
+    }
+    #[inline(always)]
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self> {
+        let (a0, a1) = self.split_f32x16(a);
+        self.combine_i32x8(
+            self.reinterpret_i32_f32x8(a0),
+            self.reinterpret_i32_f32x8(a1),
+        )
+    }
+    #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
         let (a0, a1) = self.split_f32x16(a);
         self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
@@ -6489,6 +6514,11 @@ impl Simd for Fallback {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn narrow_f64x8(self, a: f64x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1))
+    }
+    #[inline(always)]
     fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
         let (a0, a1) = self.split_f64x8(a);
         self.combine_f32x8(
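
The scalar fallback above leans on Rust's `as` casts, which match the SIMD conversions under default rounding modes: widening is exact, narrowing rounds to nearest-even and saturates out-of-range values to infinity. A plain-Rust sketch of that behavior (illustrative values, not tests from this commit):

// Hedged sketch of the cast semantics the fallback relies on.
fn main() {
    assert_eq!((2.5_f32 as f64) as f32, 2.5_f32);        // widening is exact
    assert_eq!(1.0000000001_f64 as f32, 1.0_f32);        // narrowing rounds to nearest f32
    assert_eq!(1e300_f64 as f32, f32::INFINITY);         // overflow -> +inf
    assert_eq!((-1e300_f64) as f32, f32::NEG_INFINITY);  // overflow -> -inf
}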

fearless_simd/src/generated/neon.rs

Lines changed: 33 additions & 8 deletions
@@ -184,6 +184,14 @@ impl Simd for Neon {
         result.simd_into(self)
     }
     #[inline(always)]
+    fn widen_f32x4(self, a: f32x4<Self>) -> f64x4<Self> {
+        unsafe {
+            let low = vcvt_f64_f32(vget_low_f32(a.into()));
+            let high = vcvt_high_f64_f32(a.into());
+            float64x2x2_t(low, high).simd_into(self)
+        }
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> {
         unsafe { vreinterpretq_f64_f32(a.into()).simd_into(self) }
     }
@@ -1401,6 +1409,11 @@ impl Simd for Neon {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn widen_f32x8(self, a: f32x8<Self>) -> f64x8<Self> {
+        let (a0, a1) = self.split_f32x8(a);
+        self.combine_f64x4(self.widen_f32x4(a0), self.widen_f32x4(a1))
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self> {
         let (a0, a1) = self.split_f32x8(a);
         self.combine_f64x2(
@@ -2821,6 +2834,13 @@ impl Simd for Neon {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn narrow_f64x4(self, a: f64x4<Self>) -> f32x4<Self> {
+        unsafe {
+            let converted: float64x2x2_t = a.into();
+            vcvt_high_f32_f64(vcvt_f32_f64(converted.0), converted.1).simd_into(self)
+        }
+    }
+    #[inline(always)]
     fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self> {
         let (a0, a1) = self.split_f64x4(a);
         self.combine_f32x4(
@@ -3071,6 +3091,14 @@ impl Simd for Neon {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
+        unsafe { vld4q_f32(src.as_ptr()).simd_into(self) }
+    }
+    #[inline(always)]
+    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
+        unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) }
+    }
+    #[inline(always)]
     fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self> {
         let (a0, a1) = self.split_f32x16(a);
         self.combine_f64x4(
@@ -3087,14 +3115,6 @@ impl Simd for Neon {
         )
     }
     #[inline(always)]
-    fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        unsafe { vld4q_f32(src.as_ptr()).simd_into(self) }
-    }
-    #[inline(always)]
-    fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) }
-    }
-    #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
         let (a0, a1) = self.split_f32x16(a);
         self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1))
@@ -4465,6 +4485,11 @@ impl Simd for Neon {
         (b0.simd_into(self), b1.simd_into(self))
     }
     #[inline(always)]
+    fn narrow_f64x8(self, a: f64x8<Self>) -> f32x8<Self> {
+        let (a0, a1) = self.split_f64x8(a);
+        self.combine_f32x4(self.narrow_f64x4(a0), self.narrow_f64x4(a1))
+    }
+    #[inline(always)]
     fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self> {
         let (a0, a1) = self.split_f64x8(a);
         self.combine_f32x8(
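
On the NEON path above, vcvt_f64_f32 widens the low two f32 lanes and vcvt_high_f64_f32 the high two, yielding the float64x2x2_t pair that backs f64x4; narrowing runs the analogous intrinsics in reverse (vcvt_f32_f64 / vcvt_high_f32_f64). A standalone aarch64 sketch using plain std::arch rather than this crate's wrapper types:

// Hedged illustration; assumes an aarch64 target, where NEON is baseline.
#[cfg(target_arch = "aarch64")]
fn neon_widen_demo(a: [f32; 4]) -> [f64; 4] {
    use std::arch::aarch64::*;
    unsafe {
        let v = vld1q_f32(a.as_ptr());
        let low = vcvt_f64_f32(vget_low_f32(v)); // lanes 0..2 -> f64x2
        let high = vcvt_high_f64_f32(v);         // lanes 2..4 -> f64x2
        let mut out = [0.0f64; 4];
        vst1q_f64(out.as_mut_ptr(), low);
        vst1q_f64(out.as_mut_ptr().add(2), high);
        out
    }
}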

fearless_simd/src/generated/simd_trait.rs

Lines changed: 6 additions & 2 deletions
@@ -96,6 +96,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
     fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
     fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self>;
+    fn widen_f32x4(self, a: f32x4<Self>) -> f64x4<Self>;
     fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self>;
     fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self>;
     fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self>;
@@ -374,6 +375,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
     fn select_f32x8(self, a: mask32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
     fn combine_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x16<Self>;
     fn split_f32x8(self, a: f32x8<Self>) -> (f32x4<Self>, f32x4<Self>);
+    fn widen_f32x8(self, a: f32x8<Self>) -> f64x8<Self>;
     fn reinterpret_f64_f32x8(self, a: f32x8<Self>) -> f64x4<Self>;
     fn reinterpret_i32_f32x8(self, a: f32x8<Self>) -> i32x8<Self>;
     fn reinterpret_u8_f32x8(self, a: f32x8<Self>) -> u8x32<Self>;
@@ -619,6 +621,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
     fn select_f64x4(self, a: mask64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
     fn combine_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x8<Self>;
     fn split_f64x4(self, a: f64x4<Self>) -> (f64x2<Self>, f64x2<Self>);
+    fn narrow_f64x4(self, a: f64x4<Self>) -> f32x4<Self>;
     fn reinterpret_f32_f64x4(self, a: f64x4<Self>) -> f32x8<Self>;
     fn splat_mask64x4(self, val: i64) -> mask64x4<Self>;
     fn not_mask64x4(self, a: mask64x4<Self>) -> mask64x4<Self>;
@@ -663,10 +666,10 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
     fn trunc_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     fn select_f32x16(self, a: mask32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
     fn split_f32x16(self, a: f32x16<Self>) -> (f32x8<Self>, f32x8<Self>);
-    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self>;
-    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self>;
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self>;
     fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> ();
+    fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self>;
+    fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self>;
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self>;
     fn reinterpret_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
     fn cvt_u32_f32x16(self, a: f32x16<Self>) -> u32x16<Self>;
@@ -905,6 +908,7 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
     fn trunc_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     fn select_f64x8(self, a: mask64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
     fn split_f64x8(self, a: f64x8<Self>) -> (f64x4<Self>, f64x4<Self>);
+    fn narrow_f64x8(self, a: f64x8<Self>) -> f32x8<Self>;
     fn reinterpret_f32_f64x8(self, a: f64x8<Self>) -> f32x16<Self>;
     fn splat_mask64x8(self, val: i64) -> mask64x8<Self>;
     fn not_mask64x8(self, a: mask64x8<Self>) -> mask64x8<Self>;
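
The trait changes above add one widen and one narrow declaration per affected width, and move the two f32x16 reinterpret declarations after the interleaved load/store pair to match the order of the generated backends. For a feel of the per-backend obligation this creates, here is a stripped-down standalone trait with a scalar impl (hypothetical, mirroring only the 4-lane pair; the real trait uses the Self-parameterised vector types shown above):

// Hedged sketch, not the real fearless_simd trait.
trait WidenNarrow {
    fn widen_f32x4(self, a: [f32; 4]) -> [f64; 4];
    fn narrow_f64x4(self, a: [f64; 4]) -> [f32; 4];
}

#[derive(Clone, Copy)]
struct Scalar;

impl WidenNarrow for Scalar {
    fn widen_f32x4(self, a: [f32; 4]) -> [f64; 4] {
        [a[0] as f64, a[1] as f64, a[2] as f64, a[3] as f64]
    }
    fn narrow_f64x4(self, a: [f64; 4]) -> [f32; 4] {
        [a[0] as f32, a[1] as f32, a[2] as f32, a[3] as f32]
    }
}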
