Skip to content

Instantly share code, notes, and snippets.

@raphlinus
Created November 12, 2025 04:43
Show Gist options
  • Select an option

  • Save raphlinus/9cad030a74deed64e7b05a620807ecac to your computer and use it in GitHub Desktop.

Select an option

Save raphlinus/9cad030a74deed64e7b05a620807ecac to your computer and use it in GitHub Desktop.
prototype SIMD implementation of new flatten algorithm
use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4};
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct Point {
x: f32,
y: f32,
}
impl Point {
fn new(x: f32, y: f32) -> Self {
Self { x, y }
}
fn hypot2(self) -> f32 {
self.x * self.x + self.y * self.y
}
fn cross(self, other: Self) -> f32 {
self.x * other.y - self.y * other.x
}
}
impl core::ops::Sub for Point {
type Output = Self;
fn sub(self, rhs: Self) -> Self {
Point::new(self.x - rhs.x, self.y - rhs.y)
}
}
fn prefix_sum_inclusive<S: Simd>(simd: S, x: f32x4<S>) -> f32x4<S> {
use core::arch::aarch64::*;
if let Some(_neon) = simd.level().as_neon() {
unsafe {
let x2 = vextq_f32::<2>(vdupq_n_f32(0.0), x.into());
let y2 = vaddq_f32(x.into(), x2);
let x3 = vextq_f32::<3>(vdupq_n_f32(0.0), y2);
vaddq_f32(y2, x3).simd_into(simd)
}
} else {
todo!("scalar fallback")
}
}
fn vext3<S: Simd>(simd: S, x: f32x4<S>, y: f32x4<S>) -> f32x4<S> {
use core::arch::aarch64::*;
if let Some(_neon) = simd.level().as_neon() {
unsafe {
vextq_f32::<3>(x.into(), y.into()).simd_into(simd)
}
} else {
todo!("scalar fallback")
}
}
#[inline(always)]
fn flatten_impl<S: Simd>(simd: S, c: [Point; 4], tol: f32, out: &mut Vec<Point>) {
let d0 = c[1] - c[0];
let d1 = c[2] - c[1];
let d2 = c[3] - c[2];
let dd0 = d1 - d0;
let dd1 = d2 - d1;
let dd2 = dd0.hypot2().max(dd1.hypot2());
// Number of subdivisions as per Wang's method
let n = (0.75 * dd2.sqrt() / tol).sqrt().ceil();
let n_int = n as usize;
//println!("{n_int}");
let iota: f32x4<S> = [0., 1., 2., 3.].simd_into(simd);
let dt = 1.0 / n;
let p1xp0 = d1.cross(d0);
let p2xp0 = d2.cross(d0);
let p2xp1 = d2.cross(d1);
let c0 = p1xp0;
let c1 = p2xp0 - 2. * p1xp0;
let c2 = p2xp1 - p2xp0 + p1xp0;
let ddd = dd1 - dd0;
let mut last_sum = simd.splat_f32x4(0.0);
let mut ts = [0f32; 32]; // TODO: deal with variable sizing
let mut j = 0;
for i in 0..n_int.div_ceil(4) {
let t = (iota + (i as f32 * 4.0 + 0.5)) * dt;
let t2 = t * t;
let dx = d0.x + 2. * dd0.x * t + ddd.x * t2;
let dy = d0.y + 2. * dd0.y * t + ddd.y * t2;
let cross = c0 + t * c1 + t2 * c2;
let dens = (cross.abs() / (tol * (dx * dx + dy * dy).sqrt())).sqrt();
let dens_sum = last_sum[3] + prefix_sum_inclusive(simd, dens * dt);
let mask = dens_sum.floor().simd_gt(vext3(simd, last_sum, dens_sum).floor());
let mask2 = mask & t.simd_lt(1.0);
let sample_t = t + 0.5 * dt - (dens_sum - dens_sum.floor()) / dens;
let masked_sample = mask2.select(sample_t, simd.splat_f32x4(0.0));
for t_sample in Into::<[f32; 4]>::into(masked_sample) {
if t_sample != 0.0 {
ts[j] = t_sample;
j += 1;
}
}
last_sum = dens_sum;
}
let target = j.next_multiple_of(4) + 2;
if target > out.capacity() {
out.reserve(target - out.capacity());
}
// Evaluate cubic Bézier for computed t values
for i in 0..j.div_ceil(4) {
let t_arr: [f32; 4] = ts[i * 4..][..4].try_into().unwrap();
let t: f32x4<S> = t_arr.simd_into(simd);
let t2 = t * t;
let mt = 1.0 - t;
let x = mt * (mt * mt * c[0].x + 3.0 * mt * t * c[1].x + 3.0 * t2 * c[2].x) + t * t2 * c[3].x;
let y = mt * (mt * mt * c[0].y + 3.0 * mt * t * c[1].y + 3.0 * t2 * c[2].y) + t * t2 * c[3].y;
if simd.level().as_neon().is_some() {
use core::arch::aarch64::*;
unsafe {
let xy = float32x4x2_t(x.into(), y.into());
vst2q_f32((out.as_mut_ptr() as *mut f32).add(8 * i + 2), xy);
}
} else {
todo!()
}
}
unsafe {
out.as_mut_ptr().write(c[0]);
out.as_mut_ptr().add(j + 1).write(c[3]);
out.set_len(j + 2);
}
}
#[inline(never)]
fn flatten(level: Level, c: [Point; 4], tol: f32, out: &mut Vec<Point>) {
dispatch!(level, simd => flatten_impl(simd, c, tol, out));
}
fn main() {
let level = Level::new();
let c = [Point::new(0., 0.), Point::new(0., 120.), Point::new(100., 100.), Point::new(100., 0.)];
let mut out = vec![];
const TOL: f32 = 0.2;
flatten(level, c, TOL, &mut out);
println!("{out:?}");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment