wide/
f32x4_.rs

1use super::*;
2
3pick! {
4  if #[cfg(target_feature="sse")] {
5    #[derive(Default, Clone, Copy, PartialEq)]
6    #[repr(C, align(16))]
7    pub struct f32x4 { pub(crate) sse: m128 }
8  } else if #[cfg(target_feature="simd128")] {
9    use core::arch::wasm32::*;
10
11    #[derive(Clone, Copy)]
12    #[repr(transparent)]
13    pub struct f32x4 { pub(crate) simd: v128 }
14
15    impl Default for f32x4 {
16      fn default() -> Self {
17        Self::splat(0.0)
18      }
19    }
20
21    impl PartialEq for f32x4 {
22      fn eq(&self, other: &Self) -> bool {
23        u32x4_all_true(f32x4_eq(self.simd, other.simd))
24      }
25    }
26  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
27    use core::arch::aarch64::*;
28    #[repr(C)]
29    #[derive(Copy, Clone)]
30    pub struct f32x4 { pub(crate) neon : float32x4_t }
31
32    impl Default for f32x4 {
33      #[inline]
34      #[must_use]
35      fn default() -> Self {
36        unsafe { Self { neon: vdupq_n_f32(0.0)} }
37      }
38    }
39
40    impl PartialEq for f32x4 {
41      #[inline]
42      #[must_use]
43      fn eq(&self, other: &Self) -> bool {
44        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon))==u32::MAX }
45      }
46
47    }
48    } else {
49    #[derive(Default, Clone, Copy, PartialEq)]
50    #[repr(C, align(16))]
51    pub struct f32x4 { pub(crate) arr: [f32;4] }
52  }
53}
54
55macro_rules! const_f32_as_f32x4 {
56  ($i:ident, $f:expr) => {
57    #[allow(non_upper_case_globals)]
58    pub const $i: f32x4 = f32x4::new([$f; 4]);
59  };
60}
61
62impl f32x4 {
63  const_f32_as_f32x4!(ONE, 1.0);
64  const_f32_as_f32x4!(ZERO, 0.0);
65  const_f32_as_f32x4!(HALF, 0.5);
66  const_f32_as_f32x4!(E, core::f32::consts::E);
67  const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI);
68  const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI);
69  const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI);
70  const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2);
71  const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2);
72  const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3);
73  const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4);
74  const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6);
75  const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8);
76  const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2);
77  const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10);
78  const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E);
79  const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E);
80  const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2);
81  const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10);
82  const_f32_as_f32x4!(PI, core::f32::consts::PI);
83  const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2);
84  const_f32_as_f32x4!(TAU, core::f32::consts::TAU);
85}
86
87unsafe impl Zeroable for f32x4 {}
88unsafe impl Pod for f32x4 {}
89
90impl Add for f32x4 {
91  type Output = Self;
92  #[inline]
93  #[must_use]
94  fn add(self, rhs: Self) -> Self::Output {
95    pick! {
96      if #[cfg(target_feature="sse")] {
97        Self { sse: add_m128(self.sse, rhs.sse) }
98      } else if #[cfg(target_feature="simd128")] {
99        Self { simd: f32x4_add(self.simd, rhs.simd) }
100      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
101        unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } }
102      } else {
103        Self { arr: [
104          self.arr[0] + rhs.arr[0],
105          self.arr[1] + rhs.arr[1],
106          self.arr[2] + rhs.arr[2],
107          self.arr[3] + rhs.arr[3],
108        ]}
109      }
110    }
111  }
112}
113
114impl Sub for f32x4 {
115  type Output = Self;
116  #[inline]
117  #[must_use]
118  fn sub(self, rhs: Self) -> Self::Output {
119    pick! {
120      if #[cfg(target_feature="sse")] {
121        Self { sse: sub_m128(self.sse, rhs.sse) }
122      } else if #[cfg(target_feature="simd128")] {
123        Self { simd: f32x4_sub(self.simd, rhs.simd) }
124      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
125        unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }}
126      } else {
127        Self { arr: [
128          self.arr[0] - rhs.arr[0],
129          self.arr[1] - rhs.arr[1],
130          self.arr[2] - rhs.arr[2],
131          self.arr[3] - rhs.arr[3],
132        ]}
133      }
134    }
135  }
136}
137
138impl Mul for f32x4 {
139  type Output = Self;
140  #[inline]
141  #[must_use]
142  fn mul(self, rhs: Self) -> Self::Output {
143    pick! {
144      if #[cfg(target_feature="sse")] {
145        Self { sse: mul_m128(self.sse, rhs.sse) }
146      } else if #[cfg(target_feature="simd128")] {
147        Self { simd: f32x4_mul(self.simd, rhs.simd) }
148      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
149        unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }}
150      } else {
151        Self { arr: [
152          self.arr[0] * rhs.arr[0],
153          self.arr[1] * rhs.arr[1],
154          self.arr[2] * rhs.arr[2],
155          self.arr[3] * rhs.arr[3],
156        ]}
157      }
158    }
159  }
160}
161
162impl Div for f32x4 {
163  type Output = Self;
164  #[inline]
165  #[must_use]
166  fn div(self, rhs: Self) -> Self::Output {
167    pick! {
168      if #[cfg(target_feature="sse")] {
169        Self { sse: div_m128(self.sse, rhs.sse) }
170      } else if #[cfg(target_feature="simd128")] {
171        Self { simd: f32x4_div(self.simd, rhs.simd) }
172      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
173        unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }}
174      } else {
175        Self { arr: [
176          self.arr[0] / rhs.arr[0],
177          self.arr[1] / rhs.arr[1],
178          self.arr[2] / rhs.arr[2],
179          self.arr[3] / rhs.arr[3],
180        ]}
181      }
182    }
183  }
184}
185
186impl Add<f32> for f32x4 {
187  type Output = Self;
188  #[inline]
189  #[must_use]
190  fn add(self, rhs: f32) -> Self::Output {
191    self.add(Self::splat(rhs))
192  }
193}
194
195impl Sub<f32> for f32x4 {
196  type Output = Self;
197  #[inline]
198  #[must_use]
199  fn sub(self, rhs: f32) -> Self::Output {
200    self.sub(Self::splat(rhs))
201  }
202}
203
204impl Mul<f32> for f32x4 {
205  type Output = Self;
206  #[inline]
207  #[must_use]
208  fn mul(self, rhs: f32) -> Self::Output {
209    self.mul(Self::splat(rhs))
210  }
211}
212
213impl Div<f32> for f32x4 {
214  type Output = Self;
215  #[inline]
216  #[must_use]
217  fn div(self, rhs: f32) -> Self::Output {
218    self.div(Self::splat(rhs))
219  }
220}
221
222impl Add<f32x4> for f32 {
223  type Output = f32x4;
224  #[inline]
225  #[must_use]
226  fn add(self, rhs: f32x4) -> Self::Output {
227    f32x4::splat(self).add(rhs)
228  }
229}
230
231impl Sub<f32x4> for f32 {
232  type Output = f32x4;
233  #[inline]
234  #[must_use]
235  fn sub(self, rhs: f32x4) -> Self::Output {
236    f32x4::splat(self).sub(rhs)
237  }
238}
239
240impl Mul<f32x4> for f32 {
241  type Output = f32x4;
242  #[inline]
243  #[must_use]
244  fn mul(self, rhs: f32x4) -> Self::Output {
245    f32x4::splat(self).mul(rhs)
246  }
247}
248
249impl Div<f32x4> for f32 {
250  type Output = f32x4;
251  #[inline]
252  #[must_use]
253  fn div(self, rhs: f32x4) -> Self::Output {
254    f32x4::splat(self).div(rhs)
255  }
256}
257
258impl BitAnd for f32x4 {
259  type Output = Self;
260  #[inline]
261  #[must_use]
262  fn bitand(self, rhs: Self) -> Self::Output {
263    pick! {
264      if #[cfg(target_feature="sse")] {
265        Self { sse: bitand_m128(self.sse, rhs.sse) }
266      } else if #[cfg(target_feature="simd128")] {
267        Self { simd: v128_and(self.simd, rhs.simd) }
268      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
269        unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
270      } else {
271        Self { arr: [
272          f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()),
273          f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()),
274          f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()),
275          f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()),
276        ]}
277      }
278    }
279  }
280}
281
282impl BitOr for f32x4 {
283  type Output = Self;
284  #[inline]
285  #[must_use]
286  fn bitor(self, rhs: Self) -> Self::Output {
287    pick! {
288      if #[cfg(target_feature="sse")] {
289        Self { sse: bitor_m128(self.sse, rhs.sse) }
290      } else if #[cfg(target_feature="simd128")] {
291        Self { simd: v128_or(self.simd, rhs.simd) }
292      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
293        unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
294      } else {
295        Self { arr: [
296          f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()),
297          f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()),
298          f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()),
299          f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()),
300        ]}
301      }
302    }
303  }
304}
305
306impl BitXor for f32x4 {
307  type Output = Self;
308  #[inline]
309  #[must_use]
310  fn bitxor(self, rhs: Self) -> Self::Output {
311    pick! {
312      if #[cfg(target_feature="sse")] {
313        Self { sse: bitxor_m128(self.sse, rhs.sse) }
314      } else if #[cfg(target_feature="simd128")] {
315        Self { simd: v128_xor(self.simd, rhs.simd) }
316      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
317        unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }}
318      } else {
319        Self { arr: [
320          f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()),
321          f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()),
322          f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()),
323          f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()),
324        ]}
325      }
326    }
327  }
328}
329
330impl CmpEq for f32x4 {
331  type Output = Self;
332  #[inline]
333  #[must_use]
334  fn cmp_eq(self, rhs: Self) -> Self::Output {
335    pick! {
336      if #[cfg(target_feature="sse")] {
337        Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) }
338      } else if #[cfg(target_feature="simd128")] {
339        Self { simd: f32x4_eq(self.simd, rhs.simd) }
340      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
341        unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }}
342      } else {
343        Self { arr: [
344          if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
345          if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
346          if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
347          if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
348        ]}
349      }
350    }
351  }
352}
353
354impl CmpGe for f32x4 {
355  type Output = Self;
356  #[inline]
357  #[must_use]
358  fn cmp_ge(self, rhs: Self) -> Self::Output {
359    pick! {
360      if #[cfg(target_feature="sse")] {
361        Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) }
362      } else if #[cfg(target_feature="simd128")] {
363        Self { simd: f32x4_ge(self.simd, rhs.simd) }
364      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
365        unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }}
366      } else {
367        Self { arr: [
368          if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
369          if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
370          if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
371          if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
372        ]}
373      }
374    }
375  }
376}
377
378impl CmpGt for f32x4 {
379  type Output = Self;
380  #[inline]
381  #[must_use]
382  fn cmp_gt(self, rhs: Self) -> Self::Output {
383    pick! {
384      if #[cfg(target_feature="sse")] {
385        Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) }
386      } else if #[cfg(target_feature="simd128")] {
387        Self { simd: f32x4_gt(self.simd, rhs.simd) }
388      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
389        unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }}
390      } else {
391        Self { arr: [
392          if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
393          if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
394          if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
395          if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
396        ]}
397      }
398    }
399  }
400}
401
402impl CmpNe for f32x4 {
403  type Output = Self;
404  #[inline]
405  #[must_use]
406  fn cmp_ne(self, rhs: Self) -> Self::Output {
407    pick! {
408      if #[cfg(target_feature="sse")] {
409        Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) }
410      } else if #[cfg(target_feature="simd128")] {
411        Self { simd: f32x4_ne(self.simd, rhs.simd) }
412      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
413        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }}
414      } else {
415        Self { arr: [
416          if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
417          if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
418          if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
419          if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
420        ]}
421      }
422    }
423  }
424}
425
426impl CmpLe for f32x4 {
427  type Output = Self;
428  #[inline]
429  #[must_use]
430  fn cmp_le(self, rhs: Self) -> Self::Output {
431    pick! {
432      if #[cfg(target_feature="sse")] {
433        Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) }
434      } else if #[cfg(target_feature="simd128")] {
435        Self { simd: f32x4_le(self.simd, rhs.simd) }
436      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
437        unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }}
438      } else {
439        Self { arr: [
440          if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
441          if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
442          if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
443          if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
444        ]}
445      }
446    }
447  }
448}
449
450impl CmpLt for f32x4 {
451  type Output = Self;
452  #[inline]
453  #[must_use]
454  fn cmp_lt(self, rhs: Self) -> Self::Output {
455    pick! {
456      if #[cfg(target_feature="sse")] {
457        Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) }
458      } else if #[cfg(target_feature="simd128")] {
459        Self { simd: f32x4_lt(self.simd, rhs.simd) }
460      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
461        unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }}
462      } else {
463        Self { arr: [
464          if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 },
465          if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 },
466          if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 },
467          if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 },
468        ]}
469      }
470    }
471  }
472}
473
474impl f32x4 {
475  #[inline]
476  #[must_use]
477  pub const fn new(array: [f32; 4]) -> Self {
478    #[allow(non_upper_case_globals)]
479    unsafe {
480      core::mem::transmute(array)
481    }
482  }
483
484  #[inline]
485  #[must_use]
486  pub fn blend(self, t: Self, f: Self) -> Self {
487    pick! {
488      if #[cfg(target_feature="sse4.1")] {
489        Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) }
490      } else if #[cfg(target_feature="simd128")] {
491        Self { simd: v128_bitselect(t.simd, f.simd, self.simd) }
492      } else {
493        generic_bit_blend(self, t, f)
494      }
495    }
496  }
497  #[inline]
498  #[must_use]
499  pub fn abs(self) -> Self {
500    pick! {
501      if #[cfg(target_feature="simd128")] {
502        Self { simd: f32x4_abs(self.simd) }
503      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
504        unsafe {Self { neon: vabsq_f32(self.neon) }}
505      } else {
506        let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32));
507        self & non_sign_bits
508      }
509    }
510  }
511  #[inline]
512  #[must_use]
513  pub fn floor(self) -> Self {
514    pick! {
515      if #[cfg(target_feature="simd128")] {
516        Self { simd: f32x4_floor(self.simd) }
517      } else if #[cfg(target_feature="sse4.1")] {
518        Self { sse: floor_m128(self.sse) }
519      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
520        unsafe {Self { neon: vrndmq_f32(self.neon) }}
521      } else if #[cfg(feature="std")] {
522        let base: [f32; 4] = cast(self);
523        cast(base.map(|val| val.floor()))
524      } else {
525        let base: [f32; 4] = cast(self);
526        let rounded: [f32; 4] = cast(self.round());
527        cast([
528          if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] },
529          if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] },
530          if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] },
531          if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] },
532        ])
533      }
534    }
535  }
536  #[inline]
537  #[must_use]
538  pub fn ceil(self) -> Self {
539    pick! {
540      if #[cfg(target_feature="simd128")] {
541        Self { simd: f32x4_ceil(self.simd) }
542      } else if #[cfg(target_feature="sse4.1")] {
543        Self { sse: ceil_m128(self.sse) }
544      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
545        unsafe {Self { neon: vrndpq_f32(self.neon) }}
546      } else if #[cfg(feature="std")] {
547        let base: [f32; 4] = cast(self);
548        cast(base.map(|val| val.ceil()))
549      } else {
550        let base: [f32; 4] = cast(self);
551        let rounded: [f32; 4] = cast(self.round());
552        cast([
553          if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] },
554          if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] },
555          if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] },
556          if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] },
557        ])
558      }
559    }
560  }
561
562  /// Calculates the lanewise maximum of both vectors. This is a faster
563  /// implementation than `max`, but it doesn't specify any behavior if NaNs are
564  /// involved.
565  #[inline]
566  #[must_use]
567  pub fn fast_max(self, rhs: Self) -> Self {
568    pick! {
569      if #[cfg(target_feature="sse")] {
570        Self { sse: max_m128(self.sse, rhs.sse) }
571      } else if #[cfg(target_feature="simd128")] {
572        Self {
573          simd: f32x4_pmax(self.simd, rhs.simd),
574        }
575      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
576        unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }}
577      } else {
578        Self { arr: [
579          if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] },
580          if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] },
581          if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] },
582          if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] },
583        ]}
584      }
585    }
586  }
587
588  /// Calculates the lanewise maximum of both vectors. If either lane is NaN,
589  /// the other lane gets chosen. Use `fast_max` for a faster implementation
590  /// that doesn't handle NaNs.
591  #[inline]
592  #[must_use]
593  pub fn max(self, rhs: Self) -> Self {
594    pick! {
595      if #[cfg(target_feature="sse")] {
596        // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN
597        // involved, it chooses rhs, so we need to specifically check rhs for
598        // NaN.
599        rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) })
600      } else if #[cfg(target_feature="simd128")] {
601        // WASM has two max intrinsics:
602        // - max: This propagates NaN, that's the opposite of what we need.
603        // - pmax: This is defined as self < rhs ? rhs : self, which basically
604        //   chooses self if either is NaN.
605        //
606        // pmax is what we want, but we need to specifically check self for NaN.
607        Self {
608          simd: v128_bitselect(
609            rhs.simd,
610            f32x4_pmax(self.simd, rhs.simd),
611            f32x4_ne(self.simd, self.simd), // NaN check
612          )
613        }
614      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
615        unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }}
616      } else {
617        Self { arr: [
618          self.arr[0].max(rhs.arr[0]),
619          self.arr[1].max(rhs.arr[1]),
620          self.arr[2].max(rhs.arr[2]),
621          self.arr[3].max(rhs.arr[3]),
622        ]}
623      }
624    }
625  }
626
627  /// Calculates the lanewise minimum of both vectors. This is a faster
628  /// implementation than `min`, but it doesn't specify any behavior if NaNs are
629  /// involved.
630  #[inline]
631  #[must_use]
632  pub fn fast_min(self, rhs: Self) -> Self {
633    pick! {
634      if #[cfg(target_feature="sse")] {
635        Self { sse: min_m128(self.sse, rhs.sse) }
636      } else if #[cfg(target_feature="simd128")] {
637        Self {
638          simd: f32x4_pmin(self.simd, rhs.simd),
639        }
640      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
641        unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }}
642      } else {
643        Self { arr: [
644          if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] },
645          if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] },
646          if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] },
647          if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] },
648        ]}
649      }
650    }
651  }
652
653  /// Calculates the lanewise minimum of both vectors. If either lane is NaN,
654  /// the other lane gets chosen. Use `fast_min` for a faster implementation
655  /// that doesn't handle NaNs.
656  #[inline]
657  #[must_use]
658  pub fn min(self, rhs: Self) -> Self {
659    pick! {
660      if #[cfg(target_feature="sse")] {
661        // min_m128 seems to do self < rhs ? self : rhs. So if there's any NaN
662        // involved, it chooses rhs, so we need to specifically check rhs for
663        // NaN.
664        rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) })
665      } else if #[cfg(target_feature="simd128")] {
666        // WASM has two min intrinsics:
667        // - min: This propagates NaN, that's the opposite of what we need.
668        // - pmin: This is defined as rhs < self ? rhs : self, which basically
669        //   chooses self if either is NaN.
670        //
671        // pmin is what we want, but we need to specifically check self for NaN.
672        Self {
673          simd: v128_bitselect(
674            rhs.simd,
675            f32x4_pmin(self.simd, rhs.simd),
676            f32x4_ne(self.simd, self.simd), // NaN check
677          )
678        }
679      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
680        unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }}
681      } else {
682        Self { arr: [
683          self.arr[0].min(rhs.arr[0]),
684          self.arr[1].min(rhs.arr[1]),
685          self.arr[2].min(rhs.arr[2]),
686          self.arr[3].min(rhs.arr[3]),
687        ]}
688      }
689    }
690  }
691  #[inline]
692  #[must_use]
693  pub fn is_nan(self) -> Self {
694    pick! {
695      if #[cfg(target_feature="sse")] {
696        Self { sse: cmp_unord_mask_m128(self.sse, self.sse) }
697      } else if #[cfg(target_feature="simd128")] {
698        Self { simd: f32x4_ne(self.simd, self.simd) }
699      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
700        unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }}
701      } else {
702        Self { arr: [
703          if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
704          if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
705          if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
706          if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 },
707        ]}
708      }
709    }
710  }
711  #[inline]
712  #[must_use]
713  pub fn is_finite(self) -> Self {
714    let shifted_exp_mask = u32x4::from(0xFF000000);
715    let u: u32x4 = cast(self);
716    let shift_u = u << 1_u64;
717    let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask);
718    cast(out)
719  }
720  #[inline]
721  #[must_use]
722  pub fn is_inf(self) -> Self {
723    let shifted_inf = u32x4::from(0xFF000000);
724    let u: u32x4 = cast(self);
725    let shift_u = u << 1_u64;
726    let out = (shift_u).cmp_eq(shifted_inf);
727    cast(out)
728  }
729
730  #[inline]
731  #[must_use]
732  pub fn round(self) -> Self {
733    pick! {
734      if #[cfg(target_feature="sse4.1")] {
735        Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) }
736      } else if #[cfg(target_feature="sse2")] {
737        let mi: m128i = convert_to_i32_m128i_from_m128(self.sse);
738        let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) };
739        let i: i32x4 = cast(mi);
740        let mask: f32x4 = cast(i.cmp_eq(i32x4::from(0x80000000_u32 as i32)));
741        mask.blend(self, f)
742      } else if #[cfg(target_feature="simd128")] {
743        Self { simd: f32x4_nearest(self.simd) }
744      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
745        unsafe {Self { neon: vrndnq_f32(self.neon) }}
746      } else {
747        // Note(Lokathor): This software fallback is probably very slow compared
748        // to having a hardware option available, even just the sse2 version is
749        // better than this. Oh well.
750        let to_int = f32x4::from(1.0 / f32::EPSILON);
751        let u: u32x4 = cast(self);
752        let e: i32x4 = cast((u >> 23) & u32x4::from(0xff));
753        let mut y: f32x4;
754
755        let no_op_magic = i32x4::from(0x7f + 23);
756        let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic));
757        let no_op_val: f32x4 = self;
758
759        let zero_magic = i32x4::from(0x7f - 1);
760        let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic));
761        let zero_val: f32x4 = self * f32x4::from(0.0);
762
763        let neg_bit: f32x4 = cast(cast::<u32x4, i32x4>(u).cmp_lt(i32x4::default()));
764        let x: f32x4 = neg_bit.blend(-self, self);
765        y = x + to_int - to_int - x;
766        y = y.cmp_gt(f32x4::from(0.5)).blend(
767          y + x - f32x4::from(-1.0),
768          y.cmp_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x),
769        );
770        y = neg_bit.blend(-y, y);
771
772        no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y))
773      }
774    }
775  }
776
777  /// Rounds each lane into an integer. This is a faster implementation than
778  /// `round_int`, but it doesn't handle out of range values or NaNs. For those
779  /// values you get implementation defined behavior.
780  #[inline]
781  #[must_use]
782  pub fn fast_round_int(self) -> i32x4 {
783    pick! {
784      if #[cfg(target_feature="sse2")] {
785        cast(convert_to_i32_m128i_from_m128(self.sse))
786      } else {
787        self.round_int()
788      }
789    }
790  }
791
792  /// Rounds each lane into an integer. This saturates out of range values and
793  /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that
794  /// doesn't handle out of range values or NaNs.
795  #[inline]
796  #[must_use]
797  pub fn round_int(self) -> i32x4 {
798    pick! {
799      if #[cfg(target_feature="sse2")] {
800        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
801        let non_nan_mask = self.cmp_eq(self);
802        let non_nan = self & non_nan_mask;
803        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
804        let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse));
805        flip_to_max ^ cast
806      } else if #[cfg(target_feature="simd128")] {
807        cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) })
808      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
809        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }})
810      } else {
811        let rounded: [f32; 4] = cast(self.round());
812        cast([
813          rounded[0] as i32,
814          rounded[1] as i32,
815          rounded[2] as i32,
816          rounded[3] as i32,
817        ])
818      }
819    }
820  }
821
822  /// Truncates each lane into an integer. This is a faster implementation than
823  /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those
824  /// values you get implementation defined behavior.
825  #[inline]
826  #[must_use]
827  pub fn fast_trunc_int(self) -> i32x4 {
828    pick! {
829      if #[cfg(target_feature="sse2")] {
830        cast(truncate_m128_to_m128i(self.sse))
831      } else {
832        self.trunc_int()
833      }
834    }
835  }
836
837  /// Truncates each lane into an integer. This saturates out of range values
838  /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation
839  /// that doesn't handle out of range values or NaNs.
840  #[inline]
841  #[must_use]
842  pub fn trunc_int(self) -> i32x4 {
843    pick! {
844      if #[cfg(target_feature="sse2")] {
845        // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504
846        let non_nan_mask = self.cmp_eq(self);
847        let non_nan = self & non_nan_mask;
848        let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0)));
849        let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse));
850        flip_to_max ^ cast
851      } else if #[cfg(target_feature="simd128")] {
852        cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) })
853      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
854        cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }})
855      } else {
856        let n: [f32;4] = cast(self);
857        cast([
858          n[0] as i32,
859          n[1] as i32,
860          n[2] as i32,
861          n[3] as i32,
862        ])
863      }
864    }
865  }
866  #[inline]
867  #[must_use]
868  pub fn mul_add(self, m: Self, a: Self) -> Self {
869    pick! {
870      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
871        Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) }
872      } else {
873        (self * m) + a
874      }
875    }
876  }
877
878  #[inline]
879  #[must_use]
880  pub fn mul_sub(self, m: Self, s: Self) -> Self {
881    pick! {
882      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
883        Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) }
884      } else {
885        (self * m) - s
886      }
887    }
888  }
889
890  #[inline]
891  #[must_use]
892  pub fn mul_neg_add(self, m: Self, a: Self) -> Self {
893    pick! {
894      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
895        Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) }
896      } else {
897        a - (self * m)
898      }
899    }
900  }
901
902  #[inline]
903  #[must_use]
904  pub fn mul_neg_sub(self, m: Self, a: Self) -> Self {
905    pick! {
906      if #[cfg(all(target_feature="sse2",target_feature="fma"))] {
907        Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, a.sse) }
908      } else {
909        -(self * m) - a
910      }
911    }
912  }
913
914  #[inline]
915  #[must_use]
916  pub fn flip_signs(self, signs: Self) -> Self {
917    self ^ (signs & Self::from(-0.0))
918  }
919
920  #[inline]
921  #[must_use]
922  pub fn copysign(self, sign: Self) -> Self {
923    let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1));
924    (self & magnitude_mask) | (sign & Self::from(-0.0))
925  }
926
927  #[inline]
928  pub fn asin_acos(self) -> (Self, Self) {
929    // Based on the Agner Fog "vector class library":
930    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
931    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
932    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
933    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
934    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
935    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
936
937    let xa = self.abs();
938    let big = xa.cmp_ge(f32x4::splat(0.5));
939
940    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
941    let x2 = xa * xa;
942    let x3 = big.blend(x1, x2);
943
944    let xb = x1.sqrt();
945
946    let x4 = big.blend(xb, xa);
947
948    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
949    let z = z.mul_add(x3 * x4, x4);
950
951    let z1 = z + z;
952
953    // acos
954    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
955    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
956    let acos = big.blend(z3, z4);
957
958    // asin
959    let z3 = f32x4::FRAC_PI_2 - z1;
960    let asin = big.blend(z3, z);
961    let asin = asin.flip_signs(self);
962
963    (asin, acos)
964  }
965
966  #[inline]
967  pub fn asin(self) -> Self {
968    // Based on the Agner Fog "vector class library":
969    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
970    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
971    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
972    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
973    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
974    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
975
976    let xa = self.abs();
977    let big = xa.cmp_ge(f32x4::splat(0.5));
978
979    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
980    let x2 = xa * xa;
981    let x3 = big.blend(x1, x2);
982
983    let xb = x1.sqrt();
984
985    let x4 = big.blend(xb, xa);
986
987    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
988    let z = z.mul_add(x3 * x4, x4);
989
990    let z1 = z + z;
991
992    // asin
993    let z3 = f32x4::FRAC_PI_2 - z1;
994    let asin = big.blend(z3, z);
995    let asin = asin.flip_signs(self);
996
997    asin
998  }
999
1000  #[inline]
1001  #[must_use]
1002  pub fn acos(self) -> Self {
1003    // Based on the Agner Fog "vector class library":
1004    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1005    const_f32_as_f32x4!(P4asinf, 4.2163199048E-2);
1006    const_f32_as_f32x4!(P3asinf, 2.4181311049E-2);
1007    const_f32_as_f32x4!(P2asinf, 4.5470025998E-2);
1008    const_f32_as_f32x4!(P1asinf, 7.4953002686E-2);
1009    const_f32_as_f32x4!(P0asinf, 1.6666752422E-1);
1010
1011    let xa = self.abs();
1012    let big = xa.cmp_ge(f32x4::splat(0.5));
1013
1014    let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa);
1015    let x2 = xa * xa;
1016    let x3 = big.blend(x1, x2);
1017
1018    let xb = x1.sqrt();
1019
1020    let x4 = big.blend(xb, xa);
1021
1022    let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf);
1023    let z = z.mul_add(x3 * x4, x4);
1024
1025    let z1 = z + z;
1026
1027    // acos
1028    let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1);
1029    let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self);
1030    let acos = big.blend(z3, z4);
1031
1032    acos
1033  }
1034
1035  #[inline]
1036  pub fn atan(self) -> Self {
1037    // Based on the Agner Fog "vector class library":
1038    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1039    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1040    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1041    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1042    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1043
1044    let t = self.abs();
1045
1046    // small:  z = t / 1.0;
1047    // medium: z = (t-1.0) / (t+1.0);
1048    // big:    z = -1.0 / t;
1049    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1050    let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE);
1051
1052    let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2);
1053    s = notsmal & s;
1054
1055    let mut a = notbig & t;
1056    a = notsmal.blend(a - Self::ONE, a);
1057    let mut b = notbig & Self::ONE;
1058    b = notsmal.blend(b + t, b);
1059    let z = a / b;
1060
1061    let zz = z * z;
1062
1063    // Taylor expansion
1064    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1065    re = re.mul_add(zz * z, z) + s;
1066
1067    // get sign bit
1068    re = (self.sign_bit()).blend(-re, re);
1069
1070    re
1071  }
1072
1073  #[inline]
1074  pub fn atan2(self, x: Self) -> Self {
1075    // Based on the Agner Fog "vector class library":
1076    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1077    const_f32_as_f32x4!(P3atanf, 8.05374449538E-2);
1078    const_f32_as_f32x4!(P2atanf, -1.38776856032E-1);
1079    const_f32_as_f32x4!(P1atanf, 1.99777106478E-1);
1080    const_f32_as_f32x4!(P0atanf, -3.33329491539E-1);
1081
1082    let y = self;
1083
1084    // move in first octant
1085    let x1 = x.abs();
1086    let y1 = y.abs();
1087    let swapxy = y1.cmp_gt(x1);
1088    // swap x and y if y1 > x1
1089    let mut x2 = swapxy.blend(y1, x1);
1090    let mut y2 = swapxy.blend(x1, y1);
1091
1092    // check for special case: x and y are both +/- INF
1093    let both_infinite = x.is_inf() & y.is_inf();
1094    if both_infinite.any() {
1095      let minus_one = -Self::ONE;
1096      x2 = both_infinite.blend(x2 & minus_one, x2);
1097      y2 = both_infinite.blend(y2 & minus_one, y2);
1098    }
1099
1100    // x = y = 0 will produce NAN. No problem, fixed below
1101    let t = y2 / x2;
1102
1103    // small:  z = t / 1.0;
1104    // medium: z = (t-1.0) / (t+1.0);
1105    let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE);
1106
1107    let a = notsmal.blend(t - Self::ONE, t);
1108    let b = notsmal.blend(t + Self::ONE, Self::ONE);
1109    let s = notsmal & Self::FRAC_PI_4;
1110    let z = a / b;
1111
1112    let zz = z * z;
1113
1114    // Taylor expansion
1115    let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf);
1116    re = re.mul_add(zz * z, z) + s;
1117
1118    // move back in place
1119    re = swapxy.blend(Self::FRAC_PI_2 - re, re);
1120    re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re);
1121    re = (x.sign_bit()).blend(Self::PI - re, re);
1122
1123    // get sign bit
1124    re = (y.sign_bit()).blend(-re, re);
1125
1126    re
1127  }
1128
1129  #[inline]
1130  #[must_use]
1131  pub fn sin_cos(self) -> (Self, Self) {
1132    // Based on the Agner Fog "vector class library":
1133    // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h
1134
1135    const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0);
1136    const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0);
1137    const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0);
1138
1139    const_f32_as_f32x4!(P0sinf, -1.6666654611E-1);
1140    const_f32_as_f32x4!(P1sinf, 8.3321608736E-3);
1141    const_f32_as_f32x4!(P2sinf, -1.9515295891E-4);
1142
1143    const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2);
1144    const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3);
1145    const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5);
1146
1147    const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI);
1148
1149    let xa = self.abs();
1150
1151    // Find quadrant
1152    let y = (xa * TWO_OVER_PI).round();
1153    let q: i32x4 = y.round_int();
1154
1155    let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa)));
1156
1157    let x2 = x * x;
1158    let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x;
1159    let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2)
1160      + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0));
1161
1162    let swap = !(q & i32x4::from(1)).cmp_eq(i32x4::from(0));
1163
1164    let mut overflow: f32x4 = cast(q.cmp_gt(i32x4::from(0x2000000)));
1165    overflow &= xa.is_finite();
1166    s = overflow.blend(f32x4::from(0.0), s);
1167    c = overflow.blend(f32x4::from(1.0), c);
1168
1169    // calc sin
1170    let mut sin1 = cast::<_, f32x4>(swap).blend(c, s);
1171    let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self);
1172    sin1 = sin1.flip_signs(cast(sign_sin));
1173
1174    // calc cos
1175    let mut cos1 = cast::<_, f32x4>(swap).blend(s, c);
1176    let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30;
1177    cos1 ^= cast::<_, f32x4>(sign_cos);
1178
1179    (sin1, cos1)
1180  }
1181
1182  #[inline]
1183  #[must_use]
1184  pub fn sin(self) -> Self {
1185    let (s, _) = self.sin_cos();
1186    s
1187  }
1188  #[inline]
1189  #[must_use]
1190  pub fn cos(self) -> Self {
1191    let (_, c) = self.sin_cos();
1192    c
1193  }
1194  #[inline]
1195  #[must_use]
1196  pub fn tan(self) -> Self {
1197    let (s, c) = self.sin_cos();
1198    s / c
1199  }
1200  #[inline]
1201  #[must_use]
1202  pub fn to_degrees(self) -> Self {
1203    const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI);
1204    self * RAD_TO_DEG_RATIO
1205  }
1206  #[inline]
1207  #[must_use]
1208  pub fn to_radians(self) -> Self {
1209    const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32);
1210    self * DEG_TO_RAD_RATIO
1211  }
1212  #[inline]
1213  #[must_use]
1214  pub fn recip(self) -> Self {
1215    pick! {
1216      if #[cfg(target_feature="sse")] {
1217        Self { sse: reciprocal_m128(self.sse) }
1218      } else if #[cfg(target_feature="simd128")] {
1219        Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) }
1220      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1221        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }}
1222      } else {
1223        Self { arr: [
1224          1.0 / self.arr[0],
1225          1.0 / self.arr[1],
1226          1.0 / self.arr[2],
1227          1.0 / self.arr[3],
1228        ]}
1229      }
1230    }
1231  }
1232  #[inline]
1233  #[must_use]
1234  pub fn recip_sqrt(self) -> Self {
1235    pick! {
1236      if #[cfg(target_feature="sse")] {
1237        Self { sse: reciprocal_sqrt_m128(self.sse) }
1238      } else if #[cfg(target_feature="simd128")] {
1239        Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) }
1240      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1241        unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }}
1242      } else if #[cfg(feature="std")] {
1243        Self { arr: [
1244          1.0 / self.arr[0].sqrt(),
1245          1.0 / self.arr[1].sqrt(),
1246          1.0 / self.arr[2].sqrt(),
1247          1.0 / self.arr[3].sqrt(),
1248        ]}
1249      } else {
1250        Self { arr: [
1251          1.0 / software_sqrt(self.arr[0] as f64) as f32,
1252          1.0 / software_sqrt(self.arr[1] as f64) as f32,
1253          1.0 / software_sqrt(self.arr[2] as f64) as f32,
1254          1.0 / software_sqrt(self.arr[3] as f64) as f32,
1255        ]}
1256      }
1257    }
1258  }
1259  #[inline]
1260  #[must_use]
1261  pub fn sqrt(self) -> Self {
1262    pick! {
1263      if #[cfg(target_feature="sse")] {
1264        Self { sse: sqrt_m128(self.sse) }
1265      } else if #[cfg(target_feature="simd128")] {
1266        Self { simd: f32x4_sqrt(self.simd) }
1267      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1268        unsafe {Self { neon: vsqrtq_f32(self.neon) }}
1269      } else if #[cfg(feature="std")] {
1270        Self { arr: [
1271          self.arr[0].sqrt(),
1272          self.arr[1].sqrt(),
1273          self.arr[2].sqrt(),
1274          self.arr[3].sqrt(),
1275        ]}
1276      } else {
1277        Self { arr: [
1278          software_sqrt(self.arr[0] as f64) as f32,
1279          software_sqrt(self.arr[1] as f64) as f32,
1280          software_sqrt(self.arr[2] as f64) as f32,
1281          software_sqrt(self.arr[3] as f64) as f32,
1282        ]}
1283      }
1284    }
1285  }
1286
1287  #[inline]
1288  #[must_use]
1289  pub fn move_mask(self) -> i32 {
1290    pick! {
1291      if #[cfg(target_feature="sse")] {
1292        move_mask_m128(self.sse)
1293      } else if #[cfg(target_feature="simd128")] {
1294        u32x4_bitmask(self.simd) as i32
1295      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{
1296        unsafe
1297        {
1298          // set all to 1 if top bit is set, else 0
1299          let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0));
1300
1301          // select the right bit out of each lane
1302          let selectbit : uint32x4_t = core::mem::transmute([1u32, 2, 4, 8]);
1303          let r = vandq_u32(masked, selectbit);
1304
1305          // horizontally add the 16-bit lanes
1306          vaddvq_u32(r) as i32
1307        }
1308      } else {
1309        (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 |
1310        (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 |
1311        (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 |
1312        (((self.arr[3].to_bits() as i32) < 0) as i32) << 3
1313      }
1314    }
1315  }
1316  #[inline]
1317  #[must_use]
1318  pub fn any(self) -> bool {
1319    pick! {
1320      if #[cfg(target_feature="simd128")] {
1321        v128_any_true(self.simd)
1322      } else {
1323        self.move_mask() != 0
1324      }
1325    }
1326  }
1327  #[inline]
1328  #[must_use]
1329  pub fn all(self) -> bool {
1330    pick! {
1331      if #[cfg(target_feature="simd128")] {
1332        u32x4_all_true(self.simd)
1333      } else {
1334        // four lanes
1335        self.move_mask() == 0b1111
1336      }
1337    }
1338  }
1339  #[inline]
1340  #[must_use]
1341  pub fn none(self) -> bool {
1342    !self.any()
1343  }
1344
1345  #[inline]
1346  fn vm_pow2n(self) -> Self {
1347    const_f32_as_f32x4!(pow2_23, 8388608.0);
1348    const_f32_as_f32x4!(bias, 127.0);
1349    let a = self + (bias + pow2_23);
1350    let c = cast::<_, i32x4>(a) << 23;
1351    cast::<_, f32x4>(c)
1352  }
1353
1354  /// Calculate the exponent of a packed `f32x4`
1355  #[inline]
1356  #[must_use]
1357  pub fn exp(self) -> Self {
1358    const_f32_as_f32x4!(P0, 1.0 / 2.0);
1359    const_f32_as_f32x4!(P1, 1.0 / 6.0);
1360    const_f32_as_f32x4!(P2, 1. / 24.);
1361    const_f32_as_f32x4!(P3, 1. / 120.);
1362    const_f32_as_f32x4!(P4, 1. / 720.);
1363    const_f32_as_f32x4!(P5, 1. / 5040.);
1364    const_f32_as_f32x4!(LN2D_HI, 0.693359375);
1365    const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4);
1366    let max_x = f32x4::from(87.3);
1367    let r = (self * Self::LOG2_E).round();
1368    let x = r.mul_neg_add(LN2D_HI, self);
1369    let x = r.mul_neg_add(LN2D_LO, x);
1370    let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5);
1371    let x2 = x * x;
1372    let z = z.mul_add(x2, x);
1373    let n2 = Self::vm_pow2n(r);
1374    let z = (z + Self::ONE) * n2;
1375    // check for overflow
1376    let in_range = self.abs().cmp_lt(max_x);
1377    let in_range = in_range & self.is_finite();
1378    in_range.blend(z, Self::ZERO)
1379  }
1380
1381  #[inline]
1382  fn exponent(self) -> f32x4 {
1383    const_f32_as_f32x4!(pow2_23, 8388608.0);
1384    const_f32_as_f32x4!(bias, 127.0);
1385    let a = cast::<_, u32x4>(self);
1386    let b = a >> 23;
1387    let c = b | cast::<_, u32x4>(pow2_23);
1388    let d = cast::<_, f32x4>(c);
1389    let e = d - (pow2_23 + bias);
1390    e
1391  }
1392
1393  #[inline]
1394  fn fraction_2(self) -> Self {
1395    let t1 = cast::<_, u32x4>(self);
1396    let t2 = cast::<_, u32x4>(
1397      (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000),
1398    );
1399    cast::<_, f32x4>(t2)
1400  }
1401  #[inline]
1402  fn is_zero_or_subnormal(self) -> Self {
1403    let t = cast::<_, i32x4>(self);
1404    let t = t & i32x4::splat(0x7F800000);
1405    i32x4::round_float(t.cmp_eq(i32x4::splat(0)))
1406  }
1407  #[inline]
1408  fn infinity() -> Self {
1409    cast::<_, f32x4>(i32x4::splat(0x7F800000))
1410  }
1411  #[inline]
1412  fn nan_log() -> Self {
1413    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1414  }
1415  #[inline]
1416  fn nan_pow() -> Self {
1417    cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF))
1418  }
1419  #[inline]
1420  pub fn sign_bit(self) -> Self {
1421    let t1 = cast::<_, i32x4>(self);
1422    let t2 = t1 >> 31;
1423    !cast::<_, f32x4>(t2).cmp_eq(f32x4::ZERO)
1424  }
1425
1426  /// horizontal add of all the elements of the vector
1427  #[inline]
1428  #[must_use]
1429  pub fn reduce_add(self) -> f32 {
1430    let arr: [f32; 4] = cast(self);
1431    arr.iter().sum()
1432  }
1433
1434  /// Natural log (ln(x))
1435  #[inline]
1436  #[must_use]
1437  pub fn ln(self) -> Self {
1438    const_f32_as_f32x4!(HALF, 0.5);
1439    const_f32_as_f32x4!(P0, 3.3333331174E-1);
1440    const_f32_as_f32x4!(P1, -2.4999993993E-1);
1441    const_f32_as_f32x4!(P2, 2.0000714765E-1);
1442    const_f32_as_f32x4!(P3, -1.6668057665E-1);
1443    const_f32_as_f32x4!(P4, 1.4249322787E-1);
1444    const_f32_as_f32x4!(P5, -1.2420140846E-1);
1445    const_f32_as_f32x4!(P6, 1.1676998740E-1);
1446    const_f32_as_f32x4!(P7, -1.1514610310E-1);
1447    const_f32_as_f32x4!(P8, 7.0376836292E-2);
1448    const_f32_as_f32x4!(LN2F_HI, 0.693359375);
1449    const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4);
1450    const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38);
1451
1452    let x1 = self;
1453    let x = Self::fraction_2(x1);
1454    let e = Self::exponent(x1);
1455    let mask = x.cmp_gt(Self::SQRT_2 * HALF);
1456    let x = (!mask).blend(x + x, x);
1457    let fe = mask.blend(e + Self::ONE, e);
1458    let x = x - Self::ONE;
1459    let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8);
1460    let x2 = x * x;
1461    let res = x2 * x * res;
1462    let res = fe.mul_add(LN2F_LO, res);
1463    let res = res + x2.mul_neg_add(HALF, x);
1464    let res = fe.mul_add(LN2F_HI, res);
1465    let overflow = !self.is_finite();
1466    let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL);
1467    let mask = overflow | underflow;
1468    if !mask.any() {
1469      res
1470    } else {
1471      let is_zero = self.is_zero_or_subnormal();
1472      let res = underflow.blend(Self::nan_log(), res);
1473      let res = is_zero.blend(Self::infinity(), res);
1474      let res = overflow.blend(self, res);
1475      res
1476    }
1477  }
1478
1479  #[inline]
1480  #[must_use]
1481  pub fn log2(self) -> Self {
1482    Self::ln(self) * Self::LOG2_E
1483  }
1484  #[inline]
1485  #[must_use]
1486  pub fn log10(self) -> Self {
1487    Self::ln(self) * Self::LOG10_E
1488  }
1489
1490  #[inline]
1491  #[must_use]
1492  pub fn pow_f32x4(self, y: f32x4) -> Self {
1493    const_f32_as_f32x4!(ln2f_hi, 0.693359375);
1494    const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4);
1495    const_f32_as_f32x4!(P0logf, 3.3333331174E-1);
1496    const_f32_as_f32x4!(P1logf, -2.4999993993E-1);
1497    const_f32_as_f32x4!(P2logf, 2.0000714765E-1);
1498    const_f32_as_f32x4!(P3logf, -1.6668057665E-1);
1499    const_f32_as_f32x4!(P4logf, 1.4249322787E-1);
1500    const_f32_as_f32x4!(P5logf, -1.2420140846E-1);
1501    const_f32_as_f32x4!(P6logf, 1.1676998740E-1);
1502    const_f32_as_f32x4!(P7logf, -1.1514610310E-1);
1503    const_f32_as_f32x4!(P8logf, 7.0376836292E-2);
1504
1505    const_f32_as_f32x4!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp
1506    const_f32_as_f32x4!(p3expf, 1.0 / 6.0);
1507    const_f32_as_f32x4!(p4expf, 1.0 / 24.0);
1508    const_f32_as_f32x4!(p5expf, 1.0 / 120.0);
1509    const_f32_as_f32x4!(p6expf, 1.0 / 720.0);
1510    const_f32_as_f32x4!(p7expf, 1.0 / 5040.0);
1511
1512    let x1 = self.abs();
1513    let x = x1.fraction_2();
1514
1515    let mask = x.cmp_gt(f32x4::SQRT_2 * f32x4::HALF);
1516    let x = (!mask).blend(x + x, x);
1517
1518    let x = x - f32x4::ONE;
1519    let x2 = x * x;
1520    let lg1 = polynomial_8!(
1521      x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf
1522    );
1523    let lg1 = lg1 * x2 * x;
1524
1525    let ef = x1.exponent();
1526    let ef = mask.blend(ef + f32x4::ONE, ef);
1527
1528    let e1 = (ef * y).round();
1529    let yr = ef.mul_sub(y, e1);
1530
1531    let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1;
1532    let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2);
1533    let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1;
1534
1535    let e2 = (lg * y * f32x4::LOG2_E).round();
1536    let v = lg.mul_sub(y, e2 * ln2f_hi);
1537    let v = e2.mul_neg_add(ln2f_lo, v);
1538    let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2);
1539
1540    let x = v;
1541    let e3 = (x * f32x4::LOG2_E).round();
1542    let x = e3.mul_neg_add(f32x4::LN_2, x);
1543    let x2 = x * x;
1544    let z = x2.mul_add(
1545      polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf),
1546      x + f32x4::ONE,
1547    );
1548
1549    let ee = e1 + e2 + e3;
1550    let ei = cast::<_, i32x4>(ee.round_int());
1551    let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23));
1552
1553    let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF)))
1554      | (ee.cmp_gt(f32x4::splat(300.0)));
1555    let underflow = cast::<_, f32x4>(ej.cmp_lt(i32x4::splat(0x000)))
1556      | (ee.cmp_lt(f32x4::splat(-300.0)));
1557
1558    // Add exponent by integer addition
1559    let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23));
1560
1561    // Check for overflow/underflow
1562    let z = if (overflow | underflow).any() {
1563      let z = underflow.blend(f32x4::ZERO, z);
1564      overflow.blend(Self::infinity(), z)
1565    } else {
1566      z
1567    };
1568
1569    // Check for self == 0
1570    let x_zero = self.is_zero_or_subnormal();
1571    let z = x_zero.blend(
1572      y.cmp_lt(f32x4::ZERO).blend(
1573        Self::infinity(),
1574        y.cmp_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO),
1575      ),
1576      z,
1577    );
1578
1579    let x_sign = self.sign_bit();
1580    let z = if x_sign.any() {
1581      // Y into an integer
1582      let yi = y.cmp_eq(y.round());
1583      // Is y odd?
1584      let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float();
1585
1586      let z1 =
1587        yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow()));
1588      x_sign.blend(z1, z)
1589    } else {
1590      z
1591    };
1592
1593    let x_finite = self.is_finite();
1594    let y_finite = y.is_finite();
1595    let e_finite = ee.is_finite();
1596    if (x_finite & y_finite & (e_finite | x_zero)).all() {
1597      return z;
1598    }
1599
1600    (self.is_nan() | y.is_nan()).blend(self + y, z)
1601  }
1602
1603  #[inline]
1604  pub fn powf(self, y: f32) -> Self {
1605    Self::pow_f32x4(self, f32x4::splat(y))
1606  }
1607
1608  /// Transpose matrix of 4x4 `f32` matrix. Currently only accelerated on SSE.
1609  #[must_use]
1610  #[inline]
1611  pub fn transpose(data: [f32x4; 4]) -> [f32x4; 4] {
1612    pick! {
1613      if #[cfg(target_feature="sse")] {
1614        let mut e0 = data[0];
1615        let mut e1 = data[1];
1616        let mut e2 = data[2];
1617        let mut e3 = data[3];
1618
1619        transpose_four_m128(&mut e0.sse, &mut e1.sse, &mut e2.sse, &mut e3.sse);
1620
1621        [e0, e1, e2, e3]
1622      } else {
1623        #[inline(always)]
1624        fn transpose_column(data: &[f32x4; 4], index: usize) -> f32x4 {
1625          f32x4::new([
1626            data[0].as_array_ref()[index],
1627            data[1].as_array_ref()[index],
1628            data[2].as_array_ref()[index],
1629            data[3].as_array_ref()[index],
1630          ])
1631        }
1632
1633        [
1634          transpose_column(&data, 0),
1635          transpose_column(&data, 1),
1636          transpose_column(&data, 2),
1637          transpose_column(&data, 3),
1638        ]
1639      }
1640    }
1641  }
1642
1643  #[inline]
1644  pub fn to_array(self) -> [f32; 4] {
1645    cast(self)
1646  }
1647
1648  #[inline]
1649  pub fn as_array_ref(&self) -> &[f32; 4] {
1650    cast_ref(self)
1651  }
1652
1653  #[inline]
1654  pub fn as_array_mut(&mut self) -> &mut [f32; 4] {
1655    cast_mut(self)
1656  }
1657
1658  #[inline]
1659  pub fn from_i32x4(v: i32x4) -> Self {
1660    pick! {
1661      if #[cfg(target_feature="sse2")] {
1662        Self { sse: convert_to_m128_from_i32_m128i(v.sse) }
1663      } else if #[cfg(target_feature="simd128")] {
1664        Self { simd: f32x4_convert_i32x4(v.simd) }
1665      } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
1666        Self { neon: unsafe { vcvtq_f32_s32(v.neon) }}
1667      } else {
1668        Self { arr: [
1669            v.as_array_ref()[0] as f32,
1670            v.as_array_ref()[1] as f32,
1671            v.as_array_ref()[2] as f32,
1672            v.as_array_ref()[3] as f32,
1673          ] }
1674      }
1675    }
1676  }
1677}
wide/f32x4_.rs

wide/
f32x4_.rs