Created
November 12, 2025 04:43
-
-
Save raphlinus/9cad030a74deed64e7b05a620807ecac to your computer and use it in GitHub Desktop.
prototype SIMD implementation of new flatten algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4}; | |
| #[repr(C)] | |
| #[derive(Clone, Copy, Debug)] | |
| struct Point { | |
| x: f32, | |
| y: f32, | |
| } | |
| impl Point { | |
| fn new(x: f32, y: f32) -> Self { | |
| Self { x, y } | |
| } | |
| fn hypot2(self) -> f32 { | |
| self.x * self.x + self.y * self.y | |
| } | |
| fn cross(self, other: Self) -> f32 { | |
| self.x * other.y - self.y * other.x | |
| } | |
| } | |
| impl core::ops::Sub for Point { | |
| type Output = Self; | |
| fn sub(self, rhs: Self) -> Self { | |
| Point::new(self.x - rhs.x, self.y - rhs.y) | |
| } | |
| } | |
| fn prefix_sum_inclusive<S: Simd>(simd: S, x: f32x4<S>) -> f32x4<S> { | |
| use core::arch::aarch64::*; | |
| if let Some(_neon) = simd.level().as_neon() { | |
| unsafe { | |
| let x2 = vextq_f32::<2>(vdupq_n_f32(0.0), x.into()); | |
| let y2 = vaddq_f32(x.into(), x2); | |
| let x3 = vextq_f32::<3>(vdupq_n_f32(0.0), y2); | |
| vaddq_f32(y2, x3).simd_into(simd) | |
| } | |
| } else { | |
| todo!("scalar fallback") | |
| } | |
| } | |
| fn vext3<S: Simd>(simd: S, x: f32x4<S>, y: f32x4<S>) -> f32x4<S> { | |
| use core::arch::aarch64::*; | |
| if let Some(_neon) = simd.level().as_neon() { | |
| unsafe { | |
| vextq_f32::<3>(x.into(), y.into()).simd_into(simd) | |
| } | |
| } else { | |
| todo!("scalar fallback") | |
| } | |
| } | |
| #[inline(always)] | |
| fn flatten_impl<S: Simd>(simd: S, c: [Point; 4], tol: f32, out: &mut Vec<Point>) { | |
| let d0 = c[1] - c[0]; | |
| let d1 = c[2] - c[1]; | |
| let d2 = c[3] - c[2]; | |
| let dd0 = d1 - d0; | |
| let dd1 = d2 - d1; | |
| let dd2 = dd0.hypot2().max(dd1.hypot2()); | |
| // Number of subdivisions as per Wang's method | |
| let n = (0.75 * dd2.sqrt() / tol).sqrt().ceil(); | |
| let n_int = n as usize; | |
| //println!("{n_int}"); | |
| let iota: f32x4<S> = [0., 1., 2., 3.].simd_into(simd); | |
| let dt = 1.0 / n; | |
| let p1xp0 = d1.cross(d0); | |
| let p2xp0 = d2.cross(d0); | |
| let p2xp1 = d2.cross(d1); | |
| let c0 = p1xp0; | |
| let c1 = p2xp0 - 2. * p1xp0; | |
| let c2 = p2xp1 - p2xp0 + p1xp0; | |
| let ddd = dd1 - dd0; | |
| let mut last_sum = simd.splat_f32x4(0.0); | |
| let mut ts = [0f32; 32]; // TODO: deal with variable sizing | |
| let mut j = 0; | |
| for i in 0..n_int.div_ceil(4) { | |
| let t = (iota + (i as f32 * 4.0 + 0.5)) * dt; | |
| let t2 = t * t; | |
| let dx = d0.x + 2. * dd0.x * t + ddd.x * t2; | |
| let dy = d0.y + 2. * dd0.y * t + ddd.y * t2; | |
| let cross = c0 + t * c1 + t2 * c2; | |
| let dens = (cross.abs() / (tol * (dx * dx + dy * dy).sqrt())).sqrt(); | |
| let dens_sum = last_sum[3] + prefix_sum_inclusive(simd, dens * dt); | |
| let mask = dens_sum.floor().simd_gt(vext3(simd, last_sum, dens_sum).floor()); | |
| let mask2 = mask & t.simd_lt(1.0); | |
| let sample_t = t + 0.5 * dt - (dens_sum - dens_sum.floor()) / dens; | |
| let masked_sample = mask2.select(sample_t, simd.splat_f32x4(0.0)); | |
| for t_sample in Into::<[f32; 4]>::into(masked_sample) { | |
| if t_sample != 0.0 { | |
| ts[j] = t_sample; | |
| j += 1; | |
| } | |
| } | |
| last_sum = dens_sum; | |
| } | |
| let target = j.next_multiple_of(4) + 2; | |
| if target > out.capacity() { | |
| out.reserve(target - out.capacity()); | |
| } | |
| // Evaluate cubic Bézier for computed t values | |
| for i in 0..j.div_ceil(4) { | |
| let t_arr: [f32; 4] = ts[i * 4..][..4].try_into().unwrap(); | |
| let t: f32x4<S> = t_arr.simd_into(simd); | |
| let t2 = t * t; | |
| let mt = 1.0 - t; | |
| let x = mt * (mt * mt * c[0].x + 3.0 * mt * t * c[1].x + 3.0 * t2 * c[2].x) + t * t2 * c[3].x; | |
| let y = mt * (mt * mt * c[0].y + 3.0 * mt * t * c[1].y + 3.0 * t2 * c[2].y) + t * t2 * c[3].y; | |
| if simd.level().as_neon().is_some() { | |
| use core::arch::aarch64::*; | |
| unsafe { | |
| let xy = float32x4x2_t(x.into(), y.into()); | |
| vst2q_f32((out.as_mut_ptr() as *mut f32).add(8 * i + 2), xy); | |
| } | |
| } else { | |
| todo!() | |
| } | |
| } | |
| unsafe { | |
| out.as_mut_ptr().write(c[0]); | |
| out.as_mut_ptr().add(j + 1).write(c[3]); | |
| out.set_len(j + 2); | |
| } | |
| } | |
| #[inline(never)] | |
| fn flatten(level: Level, c: [Point; 4], tol: f32, out: &mut Vec<Point>) { | |
| dispatch!(level, simd => flatten_impl(simd, c, tol, out)); | |
| } | |
| fn main() { | |
| let level = Level::new(); | |
| let c = [Point::new(0., 0.), Point::new(0., 120.), Point::new(100., 100.), Point::new(100., 0.)]; | |
| let mut out = vec![]; | |
| const TOL: f32 = 0.2; | |
| flatten(level, c, TOL, &mut out); | |
| println!("{out:?}"); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment