raphlinus · November 12, 2025 04:43
diff --git a/neo_flatten.rs b/neo_flatten.rs
 use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4};

 #[repr(C)]
 #[derive(Clone, Copy, Debug)]
 struct Point {
    x: f32,
    y: f32,
 }

 impl Point {
    fn new(x: f32, y: f32) -> Self {
        Self { x, y }
    }

    fn hypot2(self) -> f32 {
        self.x * self.x + self.y * self.y
    }

    fn cross(self, other: Self) -> f32 {
        self.x * other.y - self.y * other.x
    }
 }

 impl core::ops::Sub for Point {
    type Output = Self;

    fn sub(self, rhs: Self) -> Self {
        Point::new(self.x - rhs.x, self.y - rhs.y)
    }
 }

 fn prefix_sum_inclusive<S: Simd>(simd: S, x: f32x4<S>) -> f32x4<S> {
    use core::arch::aarch64::*;
    if let Some(_neon) = simd.level().as_neon() {
        unsafe {
            let x2 = vextq_f32::<2>(vdupq_n_f32(0.0), x.into());
            let y2 = vaddq_f32(x.into(), x2);
            let x3 = vextq_f32::<3>(vdupq_n_f32(0.0), y2);
            vaddq_f32(y2, x3).simd_into(simd)
        }
    } else {
        todo!("scalar fallback")
    }
 }

 fn vext3<S: Simd>(simd: S, x: f32x4<S>, y: f32x4<S>) -> f32x4<S> {
    use core::arch::aarch64::*;
    if let Some(_neon) = simd.level().as_neon() {
        unsafe {
            vextq_f32::<3>(x.into(), y.into()).simd_into(simd)
        }
    } else {
        todo!("scalar fallback")
    }
 }

 #[inline(always)]
 fn flatten_impl<S: Simd>(simd: S, c: [Point; 4], tol: f32, out: &mut Vec<Point>) {
    let d0 = c[1] - c[0];
    let d1 = c[2] - c[1];
    let d2 = c[3] - c[2];
    let dd0 = d1 - d0;
    let dd1 = d2 - d1;
    let dd2 = dd0.hypot2().max(dd1.hypot2());
    // Number of subdivisions as per Wang's method
    let n = (0.75 * dd2.sqrt() / tol).sqrt().ceil();
    let n_int = n as usize;
    //println!("{n_int}");
    let iota: f32x4<S> = [0., 1., 2., 3.].simd_into(simd);
    let dt = 1.0 / n;
    let p1xp0 = d1.cross(d0);
    let p2xp0 = d2.cross(d0);
    let p2xp1 = d2.cross(d1);
    let c0 = p1xp0;
    let c1 = p2xp0 - 2. * p1xp0;
    let c2 = p2xp1 - p2xp0 + p1xp0;
    let ddd = dd1 - dd0;
    let mut last_sum = simd.splat_f32x4(0.0);
    let mut ts = [0f32; 32]; // TODO: deal with variable sizing
    let mut j = 0;
    for i in 0..n_int.div_ceil(4) {
        let t = (iota + (i as f32 * 4.0 + 0.5)) * dt;
        let t2 = t * t;
        let dx = d0.x + 2. * dd0.x * t + ddd.x * t2;
        let dy = d0.y + 2. * dd0.y * t + ddd.y * t2;
        let cross = c0 + t * c1 + t2 * c2;
        let dens = (cross.abs() / (tol * (dx * dx + dy * dy).sqrt())).sqrt();
        let dens_sum = last_sum[3] + prefix_sum_inclusive(simd, dens * dt);
        let mask = dens_sum.floor().simd_gt(vext3(simd, last_sum, dens_sum).floor());
        let mask2 = mask & t.simd_lt(1.0);
        let sample_t = t + 0.5 * dt - (dens_sum - dens_sum.floor()) / dens;
        let masked_sample = mask2.select(sample_t, simd.splat_f32x4(0.0));
        for t_sample in Into::<[f32; 4]>::into(masked_sample) {
            if t_sample != 0.0 {
                ts[j] = t_sample;
                j += 1;
            }
        }
        last_sum = dens_sum;
    }
    let target = j.next_multiple_of(4) + 2;
    if target > out.capacity() {
        out.reserve(target - out.capacity());
    }
    // Evaluate cubic Bézier for computed t values
    for i in 0..j.div_ceil(4) {
        let t_arr: [f32; 4] = ts[i * 4..][..4].try_into().unwrap();
        let t: f32x4<S> = t_arr.simd_into(simd);
        let t2 = t * t;
        let mt = 1.0 - t;
        let x = mt * (mt * mt * c[0].x + 3.0 * mt * t * c[1].x + 3.0 * t2 * c[2].x) + t * t2 * c[3].x;
        let y = mt * (mt * mt * c[0].y + 3.0 * mt * t * c[1].y + 3.0 * t2 * c[2].y) + t * t2 * c[3].y;
        if simd.level().as_neon().is_some() {
            use core::arch::aarch64::*;
            unsafe {
                let xy = float32x4x2_t(x.into(), y.into());
                vst2q_f32((out.as_mut_ptr() as *mut f32).add(8 * i + 2), xy);
            }
        } else {
            todo!()
        }
    }
    unsafe {
        out.as_mut_ptr().write(c[0]);
        out.as_mut_ptr().add(j + 1).write(c[3]);
        out.set_len(j + 2);
    }
 }

 #[inline(never)]
 fn flatten(level: Level, c: [Point; 4], tol: f32, out: &mut Vec<Point>) {
    dispatch!(level, simd => flatten_impl(simd, c, tol, out));
 }

 fn main() {
    let level = Level::new();
    let c = [Point::new(0., 0.), Point::new(0., 120.), Point::new(100., 100.), Point::new(100., 0.)];
    let mut out = vec![];
    const TOL: f32 = 0.2;
    flatten(level, c, TOL, &mut out);
    println!("{out:?}");
 }
	use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4};

	#[repr(C)]
	#[derive(Clone, Copy, Debug)]
	struct Point {
	x: f32,
	y: f32,
	}

	impl Point {
	fn new(x: f32, y: f32) -> Self {
	Self { x, y }
	}

	fn hypot2(self) -> f32 {
	self.x * self.x + self.y * self.y
	}

	fn cross(self, other: Self) -> f32 {
	self.x * other.y - self.y * other.x
	}
	}

	impl core::ops::Sub for Point {
	type Output = Self;

	fn sub(self, rhs: Self) -> Self {
	Point::new(self.x - rhs.x, self.y - rhs.y)
	}
	}

	fn prefix_sum_inclusive<S: Simd>(simd: S, x: f32x4<S>) -> f32x4<S> {
	use core::arch::aarch64::*;
	if let Some(_neon) = simd.level().as_neon() {
	unsafe {
	let x2 = vextq_f32::<2>(vdupq_n_f32(0.0), x.into());
	let y2 = vaddq_f32(x.into(), x2);
	let x3 = vextq_f32::<3>(vdupq_n_f32(0.0), y2);
	vaddq_f32(y2, x3).simd_into(simd)
	}
	} else {
	todo!("scalar fallback")
	}
	}

	fn vext3<S: Simd>(simd: S, x: f32x4<S>, y: f32x4<S>) -> f32x4<S> {
	use core::arch::aarch64::*;
	if let Some(_neon) = simd.level().as_neon() {
	unsafe {
	vextq_f32::<3>(x.into(), y.into()).simd_into(simd)
	}
	} else {
	todo!("scalar fallback")
	}
	}

	#[inline(always)]
	fn flatten_impl<S: Simd>(simd: S, c: [Point; 4], tol: f32, out: &mut Vec<Point>) {
	let d0 = c[1] - c[0];
	let d1 = c[2] - c[1];
	let d2 = c[3] - c[2];
	let dd0 = d1 - d0;
	let dd1 = d2 - d1;
	let dd2 = dd0.hypot2().max(dd1.hypot2());
	// Number of subdivisions as per Wang's method
	let n = (0.75 * dd2.sqrt() / tol).sqrt().ceil();
	let n_int = n as usize;
	//println!("{n_int}");
	let iota: f32x4<S> = [0., 1., 2., 3.].simd_into(simd);
	let dt = 1.0 / n;
	let p1xp0 = d1.cross(d0);
	let p2xp0 = d2.cross(d0);
	let p2xp1 = d2.cross(d1);
	let c0 = p1xp0;
	let c1 = p2xp0 - 2. * p1xp0;
	let c2 = p2xp1 - p2xp0 + p1xp0;
	let ddd = dd1 - dd0;
	let mut last_sum = simd.splat_f32x4(0.0);
	let mut ts = [0f32; 32]; // TODO: deal with variable sizing
	let mut j = 0;
	for i in 0..n_int.div_ceil(4) {
	let t = (iota + (i as f32 * 4.0 + 0.5)) * dt;
	let t2 = t * t;
	let dx = d0.x + 2. * dd0.x * t + ddd.x * t2;
	let dy = d0.y + 2. * dd0.y * t + ddd.y * t2;
	let cross = c0 + t * c1 + t2 * c2;
	let dens = (cross.abs() / (tol * (dx * dx + dy * dy).sqrt())).sqrt();
	let dens_sum = last_sum[3] + prefix_sum_inclusive(simd, dens * dt);
	let mask = dens_sum.floor().simd_gt(vext3(simd, last_sum, dens_sum).floor());
	let mask2 = mask & t.simd_lt(1.0);
	let sample_t = t + 0.5 * dt - (dens_sum - dens_sum.floor()) / dens;
	let masked_sample = mask2.select(sample_t, simd.splat_f32x4(0.0));
	for t_sample in Into::<[f32; 4]>::into(masked_sample) {
	if t_sample != 0.0 {
	ts[j] = t_sample;
	j += 1;
	}
	}
	last_sum = dens_sum;
	}
	let target = j.next_multiple_of(4) + 2;
	if target > out.capacity() {
	out.reserve(target - out.capacity());
	}
	// Evaluate cubic Bézier for computed t values
	for i in 0..j.div_ceil(4) {
	let t_arr: [f32; 4] = ts[i * 4..][..4].try_into().unwrap();
	let t: f32x4<S> = t_arr.simd_into(simd);
	let t2 = t * t;
	let mt = 1.0 - t;
	let x = mt * (mt * mt * c[0].x + 3.0 * mt * t * c[1].x + 3.0 * t2 * c[2].x) + t * t2 * c[3].x;
	let y = mt * (mt * mt * c[0].y + 3.0 * mt * t * c[1].y + 3.0 * t2 * c[2].y) + t * t2 * c[3].y;
	if simd.level().as_neon().is_some() {
	use core::arch::aarch64::*;
	unsafe {
	let xy = float32x4x2_t(x.into(), y.into());
	vst2q_f32((out.as_mut_ptr() as mut f32).add(8 i + 2), xy);
	}
	} else {
	todo!()
	}
	}
	unsafe {
	out.as_mut_ptr().write(c[0]);
	out.as_mut_ptr().add(j + 1).write(c[3]);
	out.set_len(j + 2);
	}
	}

	#[inline(never)]
	fn flatten(level: Level, c: [Point; 4], tol: f32, out: &mut Vec<Point>) {
	dispatch!(level, simd => flatten_impl(simd, c, tol, out));
	}

	fn main() {
	let level = Level::new();
	let c = [Point::new(0., 0.), Point::new(0., 120.), Point::new(100., 100.), Point::new(100., 0.)];
	let mut out = vec![];
	const TOL: f32 = 0.2;
	flatten(level, c, TOL, &mut out);
	println!("{out:?}");
	}
No results found