Skip to content

Instantly share code, notes, and snippets.

@stillwwater
Last active November 27, 2025 04:27
Show Gist options
  • Select an option

  • Save stillwwater/9bf5a917f5ac25397a9a48e334edf1c5 to your computer and use it in GitHub Desktop.

Select an option

Save stillwwater/9bf5a917f5ac25397a9a48e334edf1c5 to your computer and use it in GitHub Desktop.
IR convolution (for reverbs)
#define dsp_rfft_size(fftsize) ((fftsize) / 2 + 8)
#define dsp_overlap_size(fftsize) ((fftsize) / 2)
#define dsp_complex_array_count(fftsize) ((fftsize) * 2)
enum {
FFT_MIN_SIZE = 16,
FFT_MAX_SIZE = 4096,
};
struct fir_filter {
uint32 fftsize;
uint32 blocksize;
uint32 nblocks;
float wet;
uint32 t;
uint32 block;
// RFFT
float *output;
// Real (zero padded to fftsize)
float *input_real;
float *overlap_real;
// RFFT blocks
// see dsproom for memory layout
float *transferfunc_blocks;
float *input_blocks;
};
#include "module.h"
bool fft_isinit;
uint32 fft_bitrev[FFT_MAX_SIZE];
// The largest twiddle array has `1 << (log2(N) - 1)` elements. Each array is stored
// sequentially and so `2 * (1 << (log2(N) - 1)) == N` storage is used.
attr(aligned(32)) float fft_twiddles_r[FFT_MAX_SIZE];
attr(aligned(32)) float fft_twiddles_i[FFT_MAX_SIZE];
void attr(cold, noinline)
dsp_fft_init(void)
{
profile();
unsigned i, j;
unsigned s;
unsigned log2n = log2_int(FFT_MAX_SIZE);
// twiddle factors
for (s = 0; s < log2n; ++s) {
unsigned half_m = 1u << s;
for (j = 0; j < half_m; ++j) {
double tw = -M_PI * j / half_m;
// complex exp
fft_twiddles_r[half_m - 1 + j] = cos(tw);
fft_twiddles_i[half_m - 1 + j] = sin(tw);
}
}
// x86 does not have a bitreverse instruction and so a lot of ops are needed.
// it could be parallelized somewhat if the bitreverse step is mixed in with the first
// fft stage, mixing the integer bitreverse instructions with the float ops.
for (i = 0; i < FFT_MAX_SIZE; ++i) {
fft_bitrev[i] = __builtin_bitreverse32(i);
}
fft_isinit = true;
}
// Inplace Fast Fourier Transform.
//
// X:
// Complex array spectral input and output. The memory layout is planar (as
// opposed to the more common interleaved format). N real floats are followed
// by N imaginary floats.
//
// X_real = X
// X_imag = X + n
//
// ex. n=4: [RRRRIIII]
//
// n:
// FFT size
void
dsp_fft(float *X, unsigned n)
{
float Y[dsp_complex_array_count(FFT_MAX_SIZE)];
unsigned i, j, k;
unsigned s;
if (__builtin_expect(!fft_isinit, 0))
dsp_fft_init();
profile();
assert(n >= FFT_MIN_SIZE);
assert(n <= FFT_MAX_SIZE);
unsigned log2n = log2_int(n);
float *X_r = X;
float *X_i = X + n;
// bitreverse for DIT
//
// could also be done in place, but this is faster since it removes the
// branch in the loop
float *Y_r = Y;
float *Y_i = Y + n;
for (i = 0; i < n; ++i) {
#ifdef __aarch64__
j = __builtin_bitreverse32(i) >> (32 - log2n);
#else
j = fft_bitrev[i] >> (32 - log2n);
#endif
Y_r[i] = X_r[j];
Y_i[i] = X_i[j];
}
memcpy(X, Y, dsp_complex_array_count(n) * sizeof *X);
// DIT radix-2 butterflies
for (s = 0; s < log2_int(FLOATN); ++s) {
unsigned half_m = 1u << s;
unsigned m = half_m << 1;
float *w_r = &fft_twiddles_r[half_m - 1];
float *w_i = &fft_twiddles_i[half_m - 1];
for (k = 0; k < n; k += m) {
for (j = 0; j < half_m; ++j) {
float t_r, t_i;
float u_r, u_i;
unsigned i0 = k + j;
unsigned i1 = i0 + half_m;
t_r = w_r[j] * X_r[i1] - w_i[j] * X_i[i1];
t_i = w_r[j] * X_i[i1] + w_i[j] * X_r[i1];
u_r = X_r[i0];
u_i = X_i[i0];
X_r[i0] = u_r + t_r;
X_i[i0] = u_i + t_i;
X_r[i1] = u_r - t_r;
X_i[i1] = u_i - t_i;
}
}
}
for (; s < log2n; ++s) {
unsigned half_m = 1u << s;
unsigned m = half_m << 1;
float *w_r = &fft_twiddles_r[half_m - 1];
float *w_i = &fft_twiddles_i[half_m - 1];
for (k = 0; k < n; k += m) {
for (j = 0; j < half_m; j += FLOATN) {
floatN t_r, t_i;
floatN u_r, u_i;
unsigned i0 = k + j;
unsigned i1 = i0 + half_m;
t_r = float_load(&w_r[j]) * float_load(&X_r[i1])
- float_load(&w_i[j]) * float_load(&X_i[i1]);
t_i = float_load(&w_r[j]) * float_load(&X_i[i1])
+ float_load(&w_i[j]) * float_load(&X_r[i1]);
u_r = float_load(&X_r[i0]);
u_i = float_load(&X_i[i0]);
float_store(&X_r[i0], u_r + t_r);
float_store(&X_i[i0], u_i + t_i);
float_store(&X_r[i1], u_r - t_r);
float_store(&X_i[i1], u_i - t_i);
}
}
}
}
// Fast Fourier Transform of a real signal. This is an optimization for signals
// where imag=0 taking advange of its Hermitian symmetry. The input signal is
// compressed by storing even samples in the real part and odd samples in the
// imaginary part, effectively reducing the FFT computation in half.
//
// The inverse FFT must use `dsp_rifft` to restore the real signal.
//
// x:
// Real input
//
// Y:
// Spectral output: complex array of size `dsp_rfft_size(n)`.
//
// n:
// Number of real input samples. Must be even.
void
dsp_rfft(float *Y, float *x, unsigned n)
{
if (__builtin_expect(!fft_isinit, 0))
dsp_fft_init();
profile();
static attr(aligned(32)) float X[dsp_complex_array_count(FFT_MAX_SIZE / 2)];
unsigned k;
unsigned half_size = n / 2;
assert(n <= FFT_MAX_SIZE);
assert(half_size >= FFT_MIN_SIZE);
float *X_r = X;
float *X_i = X + half_size;
float *Y_r = Y;
float *Y_i = Y + dsp_rfft_size(n);
for (k = 0; k < half_size; ++k) {
X_r[k] = x[2 * k];
X_i[k] = x[2 * k + 1];
}
dsp_fft(X_r, half_size);
// DC
Y_r[0] = X_r[0] + X_i[0];
Y_i[0] = 0.0;
// Nyquist
Y_r[half_size] = X_r[0] - X_i[0];
Y_i[half_size] = 0.0;
for (k = 1; k < 8; ++k)
Y_r[half_size + k] = Y_i[half_size + k] = 0.0;
float *w_r = &fft_twiddles_r[half_size - 1];
float *w_i = &fft_twiddles_i[half_size - 1];
for (k = 1; k < half_size; ++k) {
unsigned kn = half_size - k;
float u_r = 0.5f * (X_r[k] + X_r[kn]);
float u_i = 0.5f * (X_i[k] - X_i[kn]);
float v_r = 0.5f * (X_i[k] + X_i[kn]);
float v_i = -0.5f * (X_r[k] - X_r[kn]);
float t_r = w_r[k] * v_r - w_i[k] * v_i;
float t_i = w_r[k] * v_i + w_i[k] * v_r;
// Add u + t
Y_r[k] = u_r + t_r;
Y_i[k] = u_i + t_i;
// symmetry since N-k is already needed above allowing a k <= half_size/2 loop
// instead. it's usually faster to compute the entire array though due to the
// extra shuffles this requires.
// Y_r[kn] = u_r - t_r;
// Y_i[kn] = t_i - u_i;
}
}
// Inverse Fast Fourier Transform of a real signal.
//
// y:
// Real output of size `n`.
//
// X:
// Spectral input: complex array of size `dsp_rfft_size(n)`.
//
// n:
// Number of real output samples. Must be even.
void
dsp_rifft(float *y, float *X, unsigned n)
{
if (__builtin_expect(!fft_isinit, 0))
dsp_fft_init();
profile();
static attr(aligned(32)) float Y[dsp_complex_array_count(FFT_MAX_SIZE / 2)];
unsigned k;
unsigned half_size = n / 2;
assert(n <= FFT_MAX_SIZE);
assert(half_size >= FFT_MIN_SIZE);
float *X_r = X;
float *X_i = X + dsp_rfft_size(n);
float *Y_r = Y;
float *Y_i = Y + half_size;
// DC and Nyquist conjugate
Y_r[0] = 0.5f * (X_r[0] + X_r[half_size]);
Y_i[0] = -0.5f * (X_r[0] - X_r[half_size]);
float *w_r = &fft_twiddles_r[half_size - 1];
float *w_i = &fft_twiddles_i[half_size - 1];
for (k = 1; k < half_size; ++k) {
// conjugate
unsigned kn = half_size - k;
float u_r = 0.5f * (X_r[k] + X_r[kn]);
float u_i = 0.5f * (X_i[k] - X_i[kn]);
float v_r = 0.5f * (X_i[k] + X_i[kn]);
float v_i = -0.5f * (X_r[k] - X_r[kn]);
float t_r = w_r[k] * v_r - (-w_i[k]) * v_i;
float t_i = w_r[k] * v_i + (-w_i[k]) * v_r;
// A[k] = conj(A[k])
Y_r[k] = u_r - t_r;
Y_i[k] = -(u_i - t_i);
// symmetry since N-k is already needed above
// Y_r[kn] = u_r + t_r;
// Y_i[kn] = u_i + t_i;
}
dsp_fft(Y_r, half_size);
// conjugate, de-interleave, and normalize output
float scale = 1.0f / half_size;
for (k = 0; k < half_size; ++k) {
y[2 * k] = Y_r[k] * scale;
y[2 * k + 1] = Y_i[k] * -scale;
}
}
void
dsp_complex_mul(float *Y, float *X, float *H, unsigned n)
{
unsigned i;
assert(Y != X);
assert(Y != H);
assert((n & (FLOATN - 1)) == 0);
float *Y_r = Y;
float *Y_i = Y_r + n;
float *X_r = X;
float *X_i = X_r + n;
float *H_r = H;
float *H_i = H_r + n;
for (i = 0; i < n; i += FLOATN) {
floatN y_r = float_load(&Y_r[i]);
floatN y_i = float_load(&Y_i[i]);
floatN x_r = float_load(&X_r[i]);
floatN x_i = float_load(&X_i[i]);
floatN h_r = float_load(&H_r[i]);
floatN h_i = float_load(&H_i[i]);
float_store(&Y_r[i], y_r + x_r * h_r - x_i * h_i);
float_store(&Y_i[i], y_i + x_r * h_i + x_i * h_r);
}
}
// Linear convolution applying a transfer function (IR) to the input buffer.
//
// This is done with FFT(X) * FFT(IR). Since this method would produce circular
// convolution, the inputs are padded to 2N. The second half is saved / added
// to the next call to fir_filter, allowing continuous output.
//
// This approach introduces delay equal to the size of the overlap buffer. It
// is generally fine for reverbs, but faster responding IRs require smaller
// block sizes (which is also slower). If this becomes a problem, consider
// splitting the IR into a short head and long tail for faster response of
// early reflections.
void
dsp_fir_filter(struct fir_filter *f, int chan, struct audio_buffer *buffer, uint64 starttime, uint64 endtime)
{
uint64 t;
unsigned i, k;
attr(aligned(32)) float output[dsp_complex_array_count(dsp_rfft_size(FFT_MAX_SIZE))];
attr(aligned(32)) float output_real[FFT_MAX_SIZE];
float dry = 1.0f - f->wet;
if (!f->nblocks)
return;
profile();
unsigned processed = 0;
unsigned nsamples = endtime - starttime;
unsigned overlapsize = dsp_overlap_size(f->fftsize);
while (processed < nsamples) {
unsigned count = min(nsamples - processed, overlapsize - f->t);
// FFT input buffer
for (i = 0; i < count; ++i) {
t = starttime + processed + i;
float *frame = (float *)&buffer->samples[t & buffer->mask];
float in = frame[chan]; // frame->left + frame->right
f->input_real[i + f->t] = 0.5f * in;
}
dsp_rfft(&f->input_blocks[f->block * f->blocksize], f->input_real, f->fftsize);
// multiply input blocks with all IR blocks
// use the cached convolution if we don't have a full block yet
if (f->t == 0) {
profile_value("init_block", f->nblocks);
memset(f->output, 0, f->blocksize * sizeof *f->output);
for (k = 1; k < f->nblocks; ++k) {
unsigned block = (f->block - k + f->nblocks) % f->nblocks;
float *Xk = &f->input_blocks[block * f->blocksize];
float *Hk = &f->transferfunc_blocks[k * f->blocksize];
dsp_complex_mul(f->output, Xk, Hk, dsp_rfft_size(f->fftsize));
}
}
// apply the most recent (potentially incomplete) input block with
// the first IR block
memcpy(output, f->output, f->blocksize * sizeof *f->output);
dsp_complex_mul(output,
&f->input_blocks[f->block * f->blocksize],
&f->transferfunc_blocks[0],
dsp_rfft_size(f->fftsize));
dsp_rifft(output_real, output, f->fftsize);
// overlapp add
for (i = 0; i < count; ++i) {
t = starttime + processed + i;
float *frame = (float *)&buffer->samples[t & buffer->mask];
// 0.5 tweak
float out = 0.5f * (output_real[i + f->t] + f->overlap_real[f->t + i]);
frame[chan] = dry * frame[chan] + f->wet * out;
}
f->t += count;
if (f->t == overlapsize) {
// update overlap buffer
memcpy(f->overlap_real, output_real + overlapsize, overlapsize * sizeof(float));
// starting a new block
memset(f->input_real, 0, f->fftsize * sizeof *f->input_real);
f->block = (f->block + 1) % f->nblocks;
f->t = 0;
}
processed += count;
}
}
void *
dsp_fir_filter_alloc(void *arena, usize *arenasize, struct dsproom *rir)
{
usize fftmem = rir->blocksize * rir->nchannels * sizeof(float);
usize realmem = rir->fftsize * rir->nchannels * sizeof(float);
usize newsize
= fftmem * rir->nblocks // input
+ fftmem // output
+ realmem // input_real
+ realmem; // overlap_real
if (!arena || newsize > *arenasize) {
sys_free(arena);
arena = sys_alloc(H_OS, newsize);
*arenasize = newsize;
}
return arena;
}
unsigned
dsp_fir_filter_init(struct fir_filter *filters, unsigned nfilters, void *arena, struct dsproom *rir)
{
unsigned nchannels = min(nfilters, rir->nchannels);
usize fftmem = rir->blocksize * sizeof(float);
usize realmem = rir->fftsize * sizeof(float);
assert((fftmem & 31) == 0);
assert((realmem & 31) == 0);
for (unsigned chan = 0; chan < nchannels; ++chan) {
struct fir_filter *f = &filters[chan];
f->fftsize = rir->fftsize;
f->blocksize = rir->blocksize;
f->nblocks = rir->nblocks;
f->wet = rir->wet;
f->output = arena;
arena += fftmem;
f->input_real = arena;
arena += realmem;
f->overlap_real = arena;
arena += realmem;
f->transferfunc_blocks = &cl_base->fftdata[rir->data];
f->input_blocks = arena;
arena += fftmem * rir->nblocks;
}
return nchannels;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment