stillwwater · November 27, 2025 04:27
diff --git a/dsp.c b/dsp.c
 #define dsp_rfft_size(fftsize) ((fftsize) / 2 + 8)
 #define dsp_overlap_size(fftsize) ((fftsize) / 2)
 #define dsp_complex_array_count(fftsize) ((fftsize) * 2)

 enum {
    FFT_MIN_SIZE = 16,
    FFT_MAX_SIZE = 4096,
 };

 struct fir_filter {
    uint32 fftsize;
    uint32 blocksize;
    uint32 nblocks;
    float  wet;

    uint32 t;
    uint32 block;

    // RFFT
    float *output;

    // Real (zero padded to fftsize)
    float *input_real;
    float *overlap_real;

    // RFFT blocks
    // see dsproom for memory layout
    float *transferfunc_blocks;
    float *input_blocks;
 };

 #include "module.h"

 bool   fft_isinit;
 uint32 fft_bitrev[FFT_MAX_SIZE];

 // The largest twiddle array has `1 << (log2(N) - 1)` elements. Each array is stored
 // sequentially and so `2 * (1 << (log2(N) - 1)) == N` storage is used.
 attr(aligned(32)) float fft_twiddles_r[FFT_MAX_SIZE];
 attr(aligned(32)) float fft_twiddles_i[FFT_MAX_SIZE];

 void attr(cold, noinline)
 dsp_fft_init(void)
 {
    profile();

    unsigned i, j;
    unsigned s;
    unsigned log2n = log2_int(FFT_MAX_SIZE);

    // twiddle factors

    for (s = 0; s < log2n; ++s) {
        unsigned half_m = 1u << s;

        for (j = 0; j < half_m; ++j) {
            double tw = -M_PI * j / half_m;

            // complex exp
            fft_twiddles_r[half_m - 1 + j] = cos(tw);
            fft_twiddles_i[half_m - 1 + j] = sin(tw);
        }
    }

    // x86 does not have a bitreverse instruction and so a lot of ops are needed.
    // it could be parallelized somewhat if the bitreverse step is mixed in with the first
    // fft stage, mixing the integer bitreverse instructions with the float ops.

    for (i = 0; i < FFT_MAX_SIZE; ++i) {
        fft_bitrev[i] = __builtin_bitreverse32(i);
    }

    fft_isinit = true;
 }

 // Inplace Fast Fourier Transform.
 //
 // X:
 //   Complex array spectral input and output. The memory layout is planar (as
 //   opposed to the more common interleaved format). N real floats are followed
 //   by N imaginary floats.
 //
 //      X_real = X
 //      X_imag = X + n
 //
 //   ex. n=4: [RRRRIIII]
 //
 // n:
 //   FFT size
 void
 dsp_fft(float *X, unsigned n)
 {
    float Y[dsp_complex_array_count(FFT_MAX_SIZE)];
    unsigned i, j, k;
    unsigned s;

    if (__builtin_expect(!fft_isinit, 0))
        dsp_fft_init();

    profile();

    assert(n >= FFT_MIN_SIZE);
    assert(n <= FFT_MAX_SIZE);
    unsigned log2n = log2_int(n);

    float *X_r = X;
    float *X_i = X + n;

    // bitreverse for DIT
    //
    // could also be done in place, but this is faster since it removes the
    // branch in the loop

    float *Y_r = Y;
    float *Y_i = Y + n;

    for (i = 0; i < n; ++i) {
 #ifdef __aarch64__
        j = __builtin_bitreverse32(i) >> (32 - log2n);
 #else
        j = fft_bitrev[i] >> (32 - log2n);
 #endif
        Y_r[i] = X_r[j];
        Y_i[i] = X_i[j];
    }

    memcpy(X, Y, dsp_complex_array_count(n) * sizeof *X);

    // DIT radix-2 butterflies

    for (s = 0; s < log2_int(FLOATN); ++s) {
        unsigned half_m = 1u << s;
        unsigned m = half_m << 1;

        float *w_r = &fft_twiddles_r[half_m - 1];
        float *w_i = &fft_twiddles_i[half_m - 1];

        for (k = 0; k < n; k += m) {
            for (j = 0; j < half_m; ++j) {
                float t_r, t_i;
                float u_r, u_i;

                unsigned i0 = k + j;
                unsigned i1 = i0 + half_m;

                t_r = w_r[j] * X_r[i1] - w_i[j] * X_i[i1];
                t_i = w_r[j] * X_i[i1] + w_i[j] * X_r[i1];

                u_r = X_r[i0];
                u_i = X_i[i0];

                X_r[i0] = u_r + t_r;
                X_i[i0] = u_i + t_i;
                X_r[i1] = u_r - t_r;
                X_i[i1] = u_i - t_i;
            }
        }
    }

    for (; s < log2n; ++s) {
        unsigned half_m = 1u << s;
        unsigned m = half_m << 1;

        float *w_r = &fft_twiddles_r[half_m - 1];
        float *w_i = &fft_twiddles_i[half_m - 1];

        for (k = 0; k < n; k += m) {
            for (j = 0; j < half_m; j += FLOATN) {
                floatN t_r, t_i;
                floatN u_r, u_i;

                unsigned i0 = k + j;
                unsigned i1 = i0 + half_m;

                t_r = float_load(&w_r[j]) * float_load(&X_r[i1])
                    - float_load(&w_i[j]) * float_load(&X_i[i1]);

                t_i = float_load(&w_r[j]) * float_load(&X_i[i1])
                    + float_load(&w_i[j]) * float_load(&X_r[i1]);

                u_r = float_load(&X_r[i0]);
                u_i = float_load(&X_i[i0]);

                float_store(&X_r[i0], u_r + t_r);
                float_store(&X_i[i0], u_i + t_i);
                float_store(&X_r[i1], u_r - t_r);
                float_store(&X_i[i1], u_i - t_i);
            }
        }
    }
 }

 // Fast Fourier Transform of a real signal. This is an optimization for signals
 // where imag=0 taking advange of its Hermitian symmetry. The input signal is
 // compressed by storing even samples in the real part and odd samples in the
 // imaginary part, effectively reducing the FFT computation in half.
 //
 // The inverse FFT must use `dsp_rifft` to restore the real signal.
 //
 // x:
 //   Real input
 //
 // Y:
 //   Spectral output: complex array of size `dsp_rfft_size(n)`.
 //
 // n:
 //   Number of real input samples. Must be even.
 void
 dsp_rfft(float *Y, float *x, unsigned n)
 {
    if (__builtin_expect(!fft_isinit, 0))
        dsp_fft_init();

    profile();

    static attr(aligned(32)) float X[dsp_complex_array_count(FFT_MAX_SIZE / 2)];

    unsigned k;
    unsigned half_size = n / 2;

    assert(n <= FFT_MAX_SIZE);
    assert(half_size >= FFT_MIN_SIZE);

    float *X_r = X;
    float *X_i = X + half_size;
    float *Y_r = Y;
    float *Y_i = Y + dsp_rfft_size(n);

    for (k = 0; k < half_size; ++k) {
        X_r[k] = x[2 * k];
        X_i[k] = x[2 * k + 1];
    }

    dsp_fft(X_r, half_size);

    // DC
    Y_r[0] = X_r[0] + X_i[0];
    Y_i[0] = 0.0;

    // Nyquist
    Y_r[half_size] = X_r[0] - X_i[0];
    Y_i[half_size] = 0.0;

    for (k = 1; k < 8; ++k)
        Y_r[half_size + k] = Y_i[half_size + k] = 0.0;

    float *w_r = &fft_twiddles_r[half_size - 1];
    float *w_i = &fft_twiddles_i[half_size - 1];

    for (k = 1; k < half_size; ++k) {
        unsigned kn = half_size - k;

        float u_r =  0.5f * (X_r[k] + X_r[kn]);
        float u_i =  0.5f * (X_i[k] - X_i[kn]);
        float v_r =  0.5f * (X_i[k] + X_i[kn]);
        float v_i = -0.5f * (X_r[k] - X_r[kn]);

        float t_r = w_r[k] * v_r - w_i[k] * v_i;
        float t_i = w_r[k] * v_i + w_i[k] * v_r;

        // Add u + t
        Y_r[k] = u_r + t_r;
        Y_i[k] = u_i + t_i;

        // symmetry since N-k is already needed above allowing a k <= half_size/2 loop
        // instead. it's usually faster to compute the entire array though due to the
        // extra shuffles this requires.
        // Y_r[kn] = u_r - t_r;
        // Y_i[kn] = t_i - u_i;
    }
 }

 // Inverse Fast Fourier Transform of a real signal.
 //
 // y:
 //   Real output of size `n`.
 //
 // X:
 //   Spectral input: complex array of size `dsp_rfft_size(n)`.
 //
 // n:
 //   Number of real output samples. Must be even.
 void
 dsp_rifft(float *y, float *X, unsigned n)
 {
    if (__builtin_expect(!fft_isinit, 0))
        dsp_fft_init();

    profile();

    static attr(aligned(32)) float Y[dsp_complex_array_count(FFT_MAX_SIZE / 2)];

    unsigned k;
    unsigned half_size = n / 2;

    assert(n <= FFT_MAX_SIZE);
    assert(half_size >= FFT_MIN_SIZE);

    float *X_r = X;
    float *X_i = X + dsp_rfft_size(n);
    float *Y_r = Y;
    float *Y_i = Y + half_size;

    // DC and Nyquist conjugate
    Y_r[0] =  0.5f * (X_r[0] + X_r[half_size]);
    Y_i[0] = -0.5f * (X_r[0] - X_r[half_size]);

    float *w_r = &fft_twiddles_r[half_size - 1];
    float *w_i = &fft_twiddles_i[half_size - 1];

    for (k = 1; k < half_size; ++k) {
        // conjugate
        unsigned kn = half_size - k;

        float u_r =  0.5f * (X_r[k] + X_r[kn]);
        float u_i =  0.5f * (X_i[k] - X_i[kn]);
        float v_r =  0.5f * (X_i[k] + X_i[kn]);
        float v_i = -0.5f * (X_r[k] - X_r[kn]);

        float t_r = w_r[k] * v_r - (-w_i[k]) * v_i;
        float t_i = w_r[k] * v_i + (-w_i[k]) * v_r;

        // A[k] = conj(A[k])
        Y_r[k] = u_r - t_r;
        Y_i[k] = -(u_i - t_i);

        // symmetry since N-k is already needed above
        // Y_r[kn] = u_r + t_r;
        // Y_i[kn] = u_i + t_i;
    }

    dsp_fft(Y_r, half_size);

    // conjugate, de-interleave, and normalize output
    float scale = 1.0f / half_size;

    for (k = 0; k < half_size; ++k) {
        y[2 * k]     = Y_r[k] * scale;
        y[2 * k + 1] = Y_i[k] * -scale;
    }
 }

 void
 dsp_complex_mul(float *Y, float *X, float *H, unsigned n)
 {
    unsigned i;

    assert(Y != X);
    assert(Y != H);

    assert((n & (FLOATN - 1)) == 0);

    float *Y_r = Y;
    float *Y_i = Y_r + n;
    float *X_r = X;
    float *X_i = X_r + n;
    float *H_r = H;
    float *H_i = H_r + n;

    for (i = 0; i < n; i += FLOATN) {
        floatN y_r = float_load(&Y_r[i]);
        floatN y_i = float_load(&Y_i[i]);
        floatN x_r = float_load(&X_r[i]);
        floatN x_i = float_load(&X_i[i]);
        floatN h_r = float_load(&H_r[i]);
        floatN h_i = float_load(&H_i[i]);

        float_store(&Y_r[i], y_r + x_r * h_r - x_i * h_i);
        float_store(&Y_i[i], y_i + x_r * h_i + x_i * h_r);
    }
 }

 // Linear convolution applying a transfer function (IR) to the input buffer.
 //
 // This is done with FFT(X) * FFT(IR). Since this method would produce circular
 // convolution, the inputs are padded to 2N. The second half is saved / added
 // to the next call to fir_filter, allowing continuous output.
 //
 // This approach introduces delay equal to the size of the overlap buffer. It
 // is generally fine for reverbs, but faster responding IRs require smaller
 // block sizes (which is also slower). If this becomes a problem, consider
 // splitting the IR into a short head and long tail for faster response of
 // early reflections.
 void
 dsp_fir_filter(struct fir_filter *f, int chan, struct audio_buffer *buffer, uint64 starttime, uint64 endtime)
 {
    uint64 t;
    unsigned i, k;
    attr(aligned(32)) float output[dsp_complex_array_count(dsp_rfft_size(FFT_MAX_SIZE))];
    attr(aligned(32)) float output_real[FFT_MAX_SIZE];

    float dry = 1.0f - f->wet;
    if (!f->nblocks)
        return;

    profile();

    unsigned processed = 0;
    unsigned nsamples = endtime - starttime;
    unsigned overlapsize = dsp_overlap_size(f->fftsize);

    while (processed < nsamples) {
        unsigned count = min(nsamples - processed, overlapsize - f->t);

        // FFT input buffer

        for (i = 0; i < count; ++i) {
            t = starttime + processed + i;
            float *frame = (float *)&buffer->samples[t & buffer->mask];
            float in = frame[chan]; // frame->left + frame->right

            f->input_real[i + f->t] = 0.5f * in;
        }

        dsp_rfft(&f->input_blocks[f->block * f->blocksize], f->input_real, f->fftsize);

        // multiply input blocks with all IR blocks
        // use the cached convolution if we don't have a full block yet

        if (f->t == 0) {
            profile_value("init_block", f->nblocks);
            memset(f->output, 0, f->blocksize * sizeof *f->output);

            for (k = 1; k < f->nblocks; ++k) {
                unsigned block = (f->block - k + f->nblocks) % f->nblocks;
                float *Xk = &f->input_blocks[block * f->blocksize];
                float *Hk = &f->transferfunc_blocks[k * f->blocksize];

                dsp_complex_mul(f->output, Xk, Hk, dsp_rfft_size(f->fftsize));
            }
        }

        // apply the most recent (potentially incomplete) input block with
        // the first IR block
        memcpy(output, f->output, f->blocksize * sizeof *f->output);

        dsp_complex_mul(output,
                &f->input_blocks[f->block * f->blocksize],
                &f->transferfunc_blocks[0],
                dsp_rfft_size(f->fftsize));

        dsp_rifft(output_real, output, f->fftsize);

        // overlapp add

        for (i = 0; i < count; ++i) {
            t = starttime + processed + i;
            float *frame = (float *)&buffer->samples[t & buffer->mask];

            // 0.5 tweak
            float out = 0.5f * (output_real[i + f->t] + f->overlap_real[f->t + i]);
            frame[chan] = dry * frame[chan] + f->wet * out;
        }

        f->t += count;

        if (f->t == overlapsize) {
            // update overlap buffer
            memcpy(f->overlap_real, output_real + overlapsize, overlapsize * sizeof(float));

            // starting a new block
            memset(f->input_real, 0, f->fftsize * sizeof *f->input_real);
            f->block = (f->block + 1) % f->nblocks;
            f->t = 0;
        }

        processed += count;
    }
 }

 void *
 dsp_fir_filter_alloc(void *arena, usize *arenasize, struct dsproom *rir)
 {
    usize fftmem  = rir->blocksize * rir->nchannels * sizeof(float);
    usize realmem = rir->fftsize * rir->nchannels * sizeof(float);

    usize newsize
        = fftmem * rir->nblocks // input
        + fftmem                // output
        + realmem               // input_real
        + realmem;              // overlap_real

    if (!arena || newsize > *arenasize) {
        sys_free(arena);

        arena = sys_alloc(H_OS, newsize);
        *arenasize = newsize;
    }

    return arena;
 }

 unsigned
 dsp_fir_filter_init(struct fir_filter *filters, unsigned nfilters, void *arena, struct dsproom *rir)
 {
    unsigned nchannels = min(nfilters, rir->nchannels);

    usize fftmem  = rir->blocksize * sizeof(float);
    usize realmem = rir->fftsize * sizeof(float);

    assert((fftmem & 31) == 0);
    assert((realmem & 31) == 0);

    for (unsigned chan = 0; chan < nchannels; ++chan) {
        struct fir_filter *f = &filters[chan];

        f->fftsize     = rir->fftsize;
        f->blocksize   = rir->blocksize;
        f->nblocks     = rir->nblocks;
        f->wet         = rir->wet;

        f->output = arena;
        arena += fftmem;

        f->input_real = arena;
        arena += realmem;

        f->overlap_real = arena;
        arena += realmem;

        f->transferfunc_blocks = &cl_base->fftdata[rir->data];
        f->input_blocks = arena;
        arena += fftmem * rir->nblocks;
    }

    return nchannels;
 }
	#define dsp_rfft_size(fftsize) ((fftsize) / 2 + 8)
	#define dsp_overlap_size(fftsize) ((fftsize) / 2)
	#define dsp_complex_array_count(fftsize) ((fftsize) * 2)

	enum {
	FFT_MIN_SIZE = 16,
	FFT_MAX_SIZE = 4096,
	};

	struct fir_filter {
	uint32 fftsize;
	uint32 blocksize;
	uint32 nblocks;
	float wet;

	uint32 t;
	uint32 block;

	// RFFT
	float *output;

	// Real (zero padded to fftsize)
	float *input_real;
	float *overlap_real;

	// RFFT blocks
	// see dsproom for memory layout
	float *transferfunc_blocks;
	float *input_blocks;
	};

	#include "module.h"

	bool fft_isinit;
	uint32 fft_bitrev[FFT_MAX_SIZE];

	// The largest twiddle array has `1 << (log2(N) - 1)` elements. Each array is stored
	// sequentially and so `2 * (1 << (log2(N) - 1)) == N` storage is used.
	attr(aligned(32)) float fft_twiddles_r[FFT_MAX_SIZE];
	attr(aligned(32)) float fft_twiddles_i[FFT_MAX_SIZE];

	void attr(cold, noinline)
	dsp_fft_init(void)
	{
	profile();

	unsigned i, j;
	unsigned s;
	unsigned log2n = log2_int(FFT_MAX_SIZE);

	// twiddle factors

	for (s = 0; s < log2n; ++s) {
	unsigned half_m = 1u << s;

	for (j = 0; j < half_m; ++j) {
	double tw = -M_PI * j / half_m;

	// complex exp
	fft_twiddles_r[half_m - 1 + j] = cos(tw);
	fft_twiddles_i[half_m - 1 + j] = sin(tw);
	}
	}

	// x86 does not have a bitreverse instruction and so a lot of ops are needed.
	// it could be parallelized somewhat if the bitreverse step is mixed in with the first
	// fft stage, mixing the integer bitreverse instructions with the float ops.

	for (i = 0; i < FFT_MAX_SIZE; ++i) {
	fft_bitrev[i] = __builtin_bitreverse32(i);
	}

	fft_isinit = true;
	}

	// Inplace Fast Fourier Transform.
	//
	// X:
	// Complex array spectral input and output. The memory layout is planar (as
	// opposed to the more common interleaved format). N real floats are followed
	// by N imaginary floats.
	//
	// X_real = X
	// X_imag = X + n
	//
	// ex. n=4: [RRRRIIII]
	//
	// n:
	// FFT size
	void
	dsp_fft(float *X, unsigned n)
	{
	float Y[dsp_complex_array_count(FFT_MAX_SIZE)];
	unsigned i, j, k;
	unsigned s;

	if (__builtin_expect(!fft_isinit, 0))
	dsp_fft_init();

	profile();

	assert(n >= FFT_MIN_SIZE);
	assert(n <= FFT_MAX_SIZE);
	unsigned log2n = log2_int(n);

	float *X_r = X;
	float *X_i = X + n;

	// bitreverse for DIT
	//
	// could also be done in place, but this is faster since it removes the
	// branch in the loop

	float *Y_r = Y;
	float *Y_i = Y + n;

	for (i = 0; i < n; ++i) {
	#ifdef __aarch64__
	j = __builtin_bitreverse32(i) >> (32 - log2n);
	#else
	j = fft_bitrev[i] >> (32 - log2n);
	#endif
	Y_r[i] = X_r[j];
	Y_i[i] = X_i[j];
	}

	memcpy(X, Y, dsp_complex_array_count(n) * sizeof *X);

	// DIT radix-2 butterflies

	for (s = 0; s < log2_int(FLOATN); ++s) {
	unsigned half_m = 1u << s;
	unsigned m = half_m << 1;

	float *w_r = &fft_twiddles_r[half_m - 1];
	float *w_i = &fft_twiddles_i[half_m - 1];

	for (k = 0; k < n; k += m) {
	for (j = 0; j < half_m; ++j) {
	float t_r, t_i;
	float u_r, u_i;

	unsigned i0 = k + j;
	unsigned i1 = i0 + half_m;

	t_r = w_r[j] * X_r[i1] - w_i[j] * X_i[i1];
	t_i = w_r[j] * X_i[i1] + w_i[j] * X_r[i1];

	u_r = X_r[i0];
	u_i = X_i[i0];

	X_r[i0] = u_r + t_r;
	X_i[i0] = u_i + t_i;
	X_r[i1] = u_r - t_r;
	X_i[i1] = u_i - t_i;
	}
	}
	}

	for (; s < log2n; ++s) {
	unsigned half_m = 1u << s;
	unsigned m = half_m << 1;

	float *w_r = &fft_twiddles_r[half_m - 1];
	float *w_i = &fft_twiddles_i[half_m - 1];

	for (k = 0; k < n; k += m) {
	for (j = 0; j < half_m; j += FLOATN) {
	floatN t_r, t_i;
	floatN u_r, u_i;

	unsigned i0 = k + j;
	unsigned i1 = i0 + half_m;

	t_r = float_load(&w_r[j]) * float_load(&X_r[i1])
	- float_load(&w_i[j]) * float_load(&X_i[i1]);

	t_i = float_load(&w_r[j]) * float_load(&X_i[i1])
	+ float_load(&w_i[j]) * float_load(&X_r[i1]);

	u_r = float_load(&X_r[i0]);
	u_i = float_load(&X_i[i0]);

	float_store(&X_r[i0], u_r + t_r);
	float_store(&X_i[i0], u_i + t_i);
	float_store(&X_r[i1], u_r - t_r);
	float_store(&X_i[i1], u_i - t_i);
	}
	}
	}
	}

	// Fast Fourier Transform of a real signal. This is an optimization for signals
	// where imag=0 taking advange of its Hermitian symmetry. The input signal is
	// compressed by storing even samples in the real part and odd samples in the
	// imaginary part, effectively reducing the FFT computation in half.
	//
	// The inverse FFT must use `dsp_rifft` to restore the real signal.
	//
	// x:
	// Real input
	//
	// Y:
	// Spectral output: complex array of size `dsp_rfft_size(n)`.
	//
	// n:
	// Number of real input samples. Must be even.
	void
	dsp_rfft(float Y, float x, unsigned n)
	{
	if (__builtin_expect(!fft_isinit, 0))
	dsp_fft_init();

	profile();

	static attr(aligned(32)) float X[dsp_complex_array_count(FFT_MAX_SIZE / 2)];

	unsigned k;
	unsigned half_size = n / 2;

	assert(n <= FFT_MAX_SIZE);
	assert(half_size >= FFT_MIN_SIZE);

	float *X_r = X;
	float *X_i = X + half_size;
	float *Y_r = Y;
	float *Y_i = Y + dsp_rfft_size(n);

	for (k = 0; k < half_size; ++k) {
	X_r[k] = x[2 * k];
	X_i[k] = x[2 * k + 1];
	}

	dsp_fft(X_r, half_size);

	// DC
	Y_r[0] = X_r[0] + X_i[0];
	Y_i[0] = 0.0;

	// Nyquist
	Y_r[half_size] = X_r[0] - X_i[0];
	Y_i[half_size] = 0.0;

	for (k = 1; k < 8; ++k)
	Y_r[half_size + k] = Y_i[half_size + k] = 0.0;

	float *w_r = &fft_twiddles_r[half_size - 1];
	float *w_i = &fft_twiddles_i[half_size - 1];

	for (k = 1; k < half_size; ++k) {
	unsigned kn = half_size - k;

	float u_r = 0.5f * (X_r[k] + X_r[kn]);
	float u_i = 0.5f * (X_i[k] - X_i[kn]);
	float v_r = 0.5f * (X_i[k] + X_i[kn]);
	float v_i = -0.5f * (X_r[k] - X_r[kn]);

	float t_r = w_r[k] * v_r - w_i[k] * v_i;
	float t_i = w_r[k] * v_i + w_i[k] * v_r;

	// Add u + t
	Y_r[k] = u_r + t_r;
	Y_i[k] = u_i + t_i;

	// symmetry since N-k is already needed above allowing a k <= half_size/2 loop
	// instead. it's usually faster to compute the entire array though due to the
	// extra shuffles this requires.
	// Y_r[kn] = u_r - t_r;
	// Y_i[kn] = t_i - u_i;
	}
	}

	// Inverse Fast Fourier Transform of a real signal.
	//
	// y:
	// Real output of size `n`.
	//
	// X:
	// Spectral input: complex array of size `dsp_rfft_size(n)`.
	//
	// n:
	// Number of real output samples. Must be even.
	void
	dsp_rifft(float y, float X, unsigned n)
	{
	if (__builtin_expect(!fft_isinit, 0))
	dsp_fft_init();

	profile();

	static attr(aligned(32)) float Y[dsp_complex_array_count(FFT_MAX_SIZE / 2)];

	unsigned k;
	unsigned half_size = n / 2;

	assert(n <= FFT_MAX_SIZE);
	assert(half_size >= FFT_MIN_SIZE);

	float *X_r = X;
	float *X_i = X + dsp_rfft_size(n);
	float *Y_r = Y;
	float *Y_i = Y + half_size;

	// DC and Nyquist conjugate
	Y_r[0] = 0.5f * (X_r[0] + X_r[half_size]);
	Y_i[0] = -0.5f * (X_r[0] - X_r[half_size]);

	float *w_r = &fft_twiddles_r[half_size - 1];
	float *w_i = &fft_twiddles_i[half_size - 1];

	for (k = 1; k < half_size; ++k) {
	// conjugate
	unsigned kn = half_size - k;

	float u_r = 0.5f * (X_r[k] + X_r[kn]);
	float u_i = 0.5f * (X_i[k] - X_i[kn]);
	float v_r = 0.5f * (X_i[k] + X_i[kn]);
	float v_i = -0.5f * (X_r[k] - X_r[kn]);

	float t_r = w_r[k] * v_r - (-w_i[k]) * v_i;
	float t_i = w_r[k] * v_i + (-w_i[k]) * v_r;

	// A[k] = conj(A[k])
	Y_r[k] = u_r - t_r;
	Y_i[k] = -(u_i - t_i);

	// symmetry since N-k is already needed above
	// Y_r[kn] = u_r + t_r;
	// Y_i[kn] = u_i + t_i;
	}

	dsp_fft(Y_r, half_size);

	// conjugate, de-interleave, and normalize output
	float scale = 1.0f / half_size;

	for (k = 0; k < half_size; ++k) {
	y[2 * k] = Y_r[k] * scale;
	y[2 * k + 1] = Y_i[k] * -scale;
	}
	}

	void
	dsp_complex_mul(float Y, float X, float *H, unsigned n)
	{
	unsigned i;

	assert(Y != X);
	assert(Y != H);

	assert((n & (FLOATN - 1)) == 0);

	float *Y_r = Y;
	float *Y_i = Y_r + n;
	float *X_r = X;
	float *X_i = X_r + n;
	float *H_r = H;
	float *H_i = H_r + n;

	for (i = 0; i < n; i += FLOATN) {
	floatN y_r = float_load(&Y_r[i]);
	floatN y_i = float_load(&Y_i[i]);
	floatN x_r = float_load(&X_r[i]);
	floatN x_i = float_load(&X_i[i]);
	floatN h_r = float_load(&H_r[i]);
	floatN h_i = float_load(&H_i[i]);

	float_store(&Y_r[i], y_r + x_r * h_r - x_i * h_i);
	float_store(&Y_i[i], y_i + x_r * h_i + x_i * h_r);
	}
	}

	// Linear convolution applying a transfer function (IR) to the input buffer.
	//
	// This is done with FFT(X) * FFT(IR). Since this method would produce circular
	// convolution, the inputs are padded to 2N. The second half is saved / added
	// to the next call to fir_filter, allowing continuous output.
	//
	// This approach introduces delay equal to the size of the overlap buffer. It
	// is generally fine for reverbs, but faster responding IRs require smaller
	// block sizes (which is also slower). If this becomes a problem, consider
	// splitting the IR into a short head and long tail for faster response of
	// early reflections.
	void
	dsp_fir_filter(struct fir_filter f, int chan, struct audio_buffer buffer, uint64 starttime, uint64 endtime)
	{
	uint64 t;
	unsigned i, k;
	attr(aligned(32)) float output[dsp_complex_array_count(dsp_rfft_size(FFT_MAX_SIZE))];
	attr(aligned(32)) float output_real[FFT_MAX_SIZE];

	float dry = 1.0f - f->wet;
	if (!f->nblocks)
	return;

	profile();

	unsigned processed = 0;
	unsigned nsamples = endtime - starttime;
	unsigned overlapsize = dsp_overlap_size(f->fftsize);

	while (processed < nsamples) {
	unsigned count = min(nsamples - processed, overlapsize - f->t);

	// FFT input buffer

	for (i = 0; i < count; ++i) {
	t = starttime + processed + i;
	float frame = (float )&buffer->samples[t & buffer->mask];
	float in = frame[chan]; // frame->left + frame->right

	f->input_real[i + f->t] = 0.5f * in;
	}

	dsp_rfft(&f->input_blocks[f->block * f->blocksize], f->input_real, f->fftsize);

	// multiply input blocks with all IR blocks
	// use the cached convolution if we don't have a full block yet

	if (f->t == 0) {
	profile_value("init_block", f->nblocks);
	memset(f->output, 0, f->blocksize * sizeof *f->output);

	for (k = 1; k < f->nblocks; ++k) {
	unsigned block = (f->block - k + f->nblocks) % f->nblocks;
	float Xk = &f->input_blocks[block f->blocksize];
	float Hk = &f->transferfunc_blocks[k f->blocksize];

	dsp_complex_mul(f->output, Xk, Hk, dsp_rfft_size(f->fftsize));
	}
	}

	// apply the most recent (potentially incomplete) input block with
	// the first IR block
	memcpy(output, f->output, f->blocksize * sizeof *f->output);

	dsp_complex_mul(output,
	&f->input_blocks[f->block * f->blocksize],
	&f->transferfunc_blocks[0],
	dsp_rfft_size(f->fftsize));

	dsp_rifft(output_real, output, f->fftsize);

	// overlapp add

	for (i = 0; i < count; ++i) {
	t = starttime + processed + i;
	float frame = (float )&buffer->samples[t & buffer->mask];

	// 0.5 tweak
	float out = 0.5f * (output_real[i + f->t] + f->overlap_real[f->t + i]);
	frame[chan] = dry * frame[chan] + f->wet * out;
	}

	f->t += count;

	if (f->t == overlapsize) {
	// update overlap buffer
	memcpy(f->overlap_real, output_real + overlapsize, overlapsize * sizeof(float));

	// starting a new block
	memset(f->input_real, 0, f->fftsize * sizeof *f->input_real);
	f->block = (f->block + 1) % f->nblocks;
	f->t = 0;
	}

	processed += count;
	}
	}

	void *
	dsp_fir_filter_alloc(void arena, usize arenasize, struct dsproom *rir)
	{
	usize fftmem = rir->blocksize * rir->nchannels * sizeof(float);
	usize realmem = rir->fftsize * rir->nchannels * sizeof(float);

	usize newsize
	= fftmem * rir->nblocks // input
	+ fftmem // output
	+ realmem // input_real
	+ realmem; // overlap_real

	if (!arena \|\| newsize > *arenasize) {
	sys_free(arena);

	arena = sys_alloc(H_OS, newsize);
	*arenasize = newsize;
	}

	return arena;
	}

	unsigned
	dsp_fir_filter_init(struct fir_filter filters, unsigned nfilters, void arena, struct dsproom *rir)
	{
	unsigned nchannels = min(nfilters, rir->nchannels);

	usize fftmem = rir->blocksize * sizeof(float);
	usize realmem = rir->fftsize * sizeof(float);

	assert((fftmem & 31) == 0);
	assert((realmem & 31) == 0);

	for (unsigned chan = 0; chan < nchannels; ++chan) {
	struct fir_filter *f = &filters[chan];

	f->fftsize = rir->fftsize;
	f->blocksize = rir->blocksize;
	f->nblocks = rir->nblocks;
	f->wet = rir->wet;

	f->output = arena;
	arena += fftmem;

	f->input_real = arena;
	arena += realmem;

	f->overlap_real = arena;
	arena += realmem;

	f->transferfunc_blocks = &cl_base->fftdata[rir->data];
	f->input_blocks = arena;
	arena += fftmem * rir->nblocks;
	}

	return nchannels;
	}
No results found