Last active
November 27, 2025 04:27
-
-
Save stillwwater/9bf5a917f5ac25397a9a48e334edf1c5 to your computer and use it in GitHub Desktop.
IR convolution (for reverbs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #define dsp_rfft_size(fftsize) ((fftsize) / 2 + 8) | |
| #define dsp_overlap_size(fftsize) ((fftsize) / 2) | |
| #define dsp_complex_array_count(fftsize) ((fftsize) * 2) | |
| enum { | |
| FFT_MIN_SIZE = 16, | |
| FFT_MAX_SIZE = 4096, | |
| }; | |
| struct fir_filter { | |
| uint32 fftsize; | |
| uint32 blocksize; | |
| uint32 nblocks; | |
| float wet; | |
| uint32 t; | |
| uint32 block; | |
| // RFFT | |
| float *output; | |
| // Real (zero padded to fftsize) | |
| float *input_real; | |
| float *overlap_real; | |
| // RFFT blocks | |
| // see dsproom for memory layout | |
| float *transferfunc_blocks; | |
| float *input_blocks; | |
| }; | |
| #include "module.h" | |
| bool fft_isinit; | |
| uint32 fft_bitrev[FFT_MAX_SIZE]; | |
| // The largest twiddle array has `1 << (log2(N) - 1)` elements. Each array is stored | |
| // sequentially and so `2 * (1 << (log2(N) - 1)) == N` storage is used. | |
| attr(aligned(32)) float fft_twiddles_r[FFT_MAX_SIZE]; | |
| attr(aligned(32)) float fft_twiddles_i[FFT_MAX_SIZE]; | |
| void attr(cold, noinline) | |
| dsp_fft_init(void) | |
| { | |
| profile(); | |
| unsigned i, j; | |
| unsigned s; | |
| unsigned log2n = log2_int(FFT_MAX_SIZE); | |
| // twiddle factors | |
| for (s = 0; s < log2n; ++s) { | |
| unsigned half_m = 1u << s; | |
| for (j = 0; j < half_m; ++j) { | |
| double tw = -M_PI * j / half_m; | |
| // complex exp | |
| fft_twiddles_r[half_m - 1 + j] = cos(tw); | |
| fft_twiddles_i[half_m - 1 + j] = sin(tw); | |
| } | |
| } | |
| // x86 does not have a bitreverse instruction and so a lot of ops are needed. | |
| // it could be parallelized somewhat if the bitreverse step is mixed in with the first | |
| // fft stage, mixing the integer bitreverse instructions with the float ops. | |
| for (i = 0; i < FFT_MAX_SIZE; ++i) { | |
| fft_bitrev[i] = __builtin_bitreverse32(i); | |
| } | |
| fft_isinit = true; | |
| } | |
| // Inplace Fast Fourier Transform. | |
| // | |
| // X: | |
| // Complex array spectral input and output. The memory layout is planar (as | |
| // opposed to the more common interleaved format). N real floats are followed | |
| // by N imaginary floats. | |
| // | |
| // X_real = X | |
| // X_imag = X + n | |
| // | |
| // ex. n=4: [RRRRIIII] | |
| // | |
| // n: | |
| // FFT size | |
| void | |
| dsp_fft(float *X, unsigned n) | |
| { | |
| float Y[dsp_complex_array_count(FFT_MAX_SIZE)]; | |
| unsigned i, j, k; | |
| unsigned s; | |
| if (__builtin_expect(!fft_isinit, 0)) | |
| dsp_fft_init(); | |
| profile(); | |
| assert(n >= FFT_MIN_SIZE); | |
| assert(n <= FFT_MAX_SIZE); | |
| unsigned log2n = log2_int(n); | |
| float *X_r = X; | |
| float *X_i = X + n; | |
| // bitreverse for DIT | |
| // | |
| // could also be done in place, but this is faster since it removes the | |
| // branch in the loop | |
| float *Y_r = Y; | |
| float *Y_i = Y + n; | |
| for (i = 0; i < n; ++i) { | |
| #ifdef __aarch64__ | |
| j = __builtin_bitreverse32(i) >> (32 - log2n); | |
| #else | |
| j = fft_bitrev[i] >> (32 - log2n); | |
| #endif | |
| Y_r[i] = X_r[j]; | |
| Y_i[i] = X_i[j]; | |
| } | |
| memcpy(X, Y, dsp_complex_array_count(n) * sizeof *X); | |
| // DIT radix-2 butterflies | |
| for (s = 0; s < log2_int(FLOATN); ++s) { | |
| unsigned half_m = 1u << s; | |
| unsigned m = half_m << 1; | |
| float *w_r = &fft_twiddles_r[half_m - 1]; | |
| float *w_i = &fft_twiddles_i[half_m - 1]; | |
| for (k = 0; k < n; k += m) { | |
| for (j = 0; j < half_m; ++j) { | |
| float t_r, t_i; | |
| float u_r, u_i; | |
| unsigned i0 = k + j; | |
| unsigned i1 = i0 + half_m; | |
| t_r = w_r[j] * X_r[i1] - w_i[j] * X_i[i1]; | |
| t_i = w_r[j] * X_i[i1] + w_i[j] * X_r[i1]; | |
| u_r = X_r[i0]; | |
| u_i = X_i[i0]; | |
| X_r[i0] = u_r + t_r; | |
| X_i[i0] = u_i + t_i; | |
| X_r[i1] = u_r - t_r; | |
| X_i[i1] = u_i - t_i; | |
| } | |
| } | |
| } | |
| for (; s < log2n; ++s) { | |
| unsigned half_m = 1u << s; | |
| unsigned m = half_m << 1; | |
| float *w_r = &fft_twiddles_r[half_m - 1]; | |
| float *w_i = &fft_twiddles_i[half_m - 1]; | |
| for (k = 0; k < n; k += m) { | |
| for (j = 0; j < half_m; j += FLOATN) { | |
| floatN t_r, t_i; | |
| floatN u_r, u_i; | |
| unsigned i0 = k + j; | |
| unsigned i1 = i0 + half_m; | |
| t_r = float_load(&w_r[j]) * float_load(&X_r[i1]) | |
| - float_load(&w_i[j]) * float_load(&X_i[i1]); | |
| t_i = float_load(&w_r[j]) * float_load(&X_i[i1]) | |
| + float_load(&w_i[j]) * float_load(&X_r[i1]); | |
| u_r = float_load(&X_r[i0]); | |
| u_i = float_load(&X_i[i0]); | |
| float_store(&X_r[i0], u_r + t_r); | |
| float_store(&X_i[i0], u_i + t_i); | |
| float_store(&X_r[i1], u_r - t_r); | |
| float_store(&X_i[i1], u_i - t_i); | |
| } | |
| } | |
| } | |
| } | |
| // Fast Fourier Transform of a real signal. This is an optimization for signals | |
| // where imag=0 taking advange of its Hermitian symmetry. The input signal is | |
| // compressed by storing even samples in the real part and odd samples in the | |
| // imaginary part, effectively reducing the FFT computation in half. | |
| // | |
| // The inverse FFT must use `dsp_rifft` to restore the real signal. | |
| // | |
| // x: | |
| // Real input | |
| // | |
| // Y: | |
| // Spectral output: complex array of size `dsp_rfft_size(n)`. | |
| // | |
| // n: | |
| // Number of real input samples. Must be even. | |
| void | |
| dsp_rfft(float *Y, float *x, unsigned n) | |
| { | |
| if (__builtin_expect(!fft_isinit, 0)) | |
| dsp_fft_init(); | |
| profile(); | |
| static attr(aligned(32)) float X[dsp_complex_array_count(FFT_MAX_SIZE / 2)]; | |
| unsigned k; | |
| unsigned half_size = n / 2; | |
| assert(n <= FFT_MAX_SIZE); | |
| assert(half_size >= FFT_MIN_SIZE); | |
| float *X_r = X; | |
| float *X_i = X + half_size; | |
| float *Y_r = Y; | |
| float *Y_i = Y + dsp_rfft_size(n); | |
| for (k = 0; k < half_size; ++k) { | |
| X_r[k] = x[2 * k]; | |
| X_i[k] = x[2 * k + 1]; | |
| } | |
| dsp_fft(X_r, half_size); | |
| // DC | |
| Y_r[0] = X_r[0] + X_i[0]; | |
| Y_i[0] = 0.0; | |
| // Nyquist | |
| Y_r[half_size] = X_r[0] - X_i[0]; | |
| Y_i[half_size] = 0.0; | |
| for (k = 1; k < 8; ++k) | |
| Y_r[half_size + k] = Y_i[half_size + k] = 0.0; | |
| float *w_r = &fft_twiddles_r[half_size - 1]; | |
| float *w_i = &fft_twiddles_i[half_size - 1]; | |
| for (k = 1; k < half_size; ++k) { | |
| unsigned kn = half_size - k; | |
| float u_r = 0.5f * (X_r[k] + X_r[kn]); | |
| float u_i = 0.5f * (X_i[k] - X_i[kn]); | |
| float v_r = 0.5f * (X_i[k] + X_i[kn]); | |
| float v_i = -0.5f * (X_r[k] - X_r[kn]); | |
| float t_r = w_r[k] * v_r - w_i[k] * v_i; | |
| float t_i = w_r[k] * v_i + w_i[k] * v_r; | |
| // Add u + t | |
| Y_r[k] = u_r + t_r; | |
| Y_i[k] = u_i + t_i; | |
| // symmetry since N-k is already needed above allowing a k <= half_size/2 loop | |
| // instead. it's usually faster to compute the entire array though due to the | |
| // extra shuffles this requires. | |
| // Y_r[kn] = u_r - t_r; | |
| // Y_i[kn] = t_i - u_i; | |
| } | |
| } | |
| // Inverse Fast Fourier Transform of a real signal. | |
| // | |
| // y: | |
| // Real output of size `n`. | |
| // | |
| // X: | |
| // Spectral input: complex array of size `dsp_rfft_size(n)`. | |
| // | |
| // n: | |
| // Number of real output samples. Must be even. | |
| void | |
| dsp_rifft(float *y, float *X, unsigned n) | |
| { | |
| if (__builtin_expect(!fft_isinit, 0)) | |
| dsp_fft_init(); | |
| profile(); | |
| static attr(aligned(32)) float Y[dsp_complex_array_count(FFT_MAX_SIZE / 2)]; | |
| unsigned k; | |
| unsigned half_size = n / 2; | |
| assert(n <= FFT_MAX_SIZE); | |
| assert(half_size >= FFT_MIN_SIZE); | |
| float *X_r = X; | |
| float *X_i = X + dsp_rfft_size(n); | |
| float *Y_r = Y; | |
| float *Y_i = Y + half_size; | |
| // DC and Nyquist conjugate | |
| Y_r[0] = 0.5f * (X_r[0] + X_r[half_size]); | |
| Y_i[0] = -0.5f * (X_r[0] - X_r[half_size]); | |
| float *w_r = &fft_twiddles_r[half_size - 1]; | |
| float *w_i = &fft_twiddles_i[half_size - 1]; | |
| for (k = 1; k < half_size; ++k) { | |
| // conjugate | |
| unsigned kn = half_size - k; | |
| float u_r = 0.5f * (X_r[k] + X_r[kn]); | |
| float u_i = 0.5f * (X_i[k] - X_i[kn]); | |
| float v_r = 0.5f * (X_i[k] + X_i[kn]); | |
| float v_i = -0.5f * (X_r[k] - X_r[kn]); | |
| float t_r = w_r[k] * v_r - (-w_i[k]) * v_i; | |
| float t_i = w_r[k] * v_i + (-w_i[k]) * v_r; | |
| // A[k] = conj(A[k]) | |
| Y_r[k] = u_r - t_r; | |
| Y_i[k] = -(u_i - t_i); | |
| // symmetry since N-k is already needed above | |
| // Y_r[kn] = u_r + t_r; | |
| // Y_i[kn] = u_i + t_i; | |
| } | |
| dsp_fft(Y_r, half_size); | |
| // conjugate, de-interleave, and normalize output | |
| float scale = 1.0f / half_size; | |
| for (k = 0; k < half_size; ++k) { | |
| y[2 * k] = Y_r[k] * scale; | |
| y[2 * k + 1] = Y_i[k] * -scale; | |
| } | |
| } | |
| void | |
| dsp_complex_mul(float *Y, float *X, float *H, unsigned n) | |
| { | |
| unsigned i; | |
| assert(Y != X); | |
| assert(Y != H); | |
| assert((n & (FLOATN - 1)) == 0); | |
| float *Y_r = Y; | |
| float *Y_i = Y_r + n; | |
| float *X_r = X; | |
| float *X_i = X_r + n; | |
| float *H_r = H; | |
| float *H_i = H_r + n; | |
| for (i = 0; i < n; i += FLOATN) { | |
| floatN y_r = float_load(&Y_r[i]); | |
| floatN y_i = float_load(&Y_i[i]); | |
| floatN x_r = float_load(&X_r[i]); | |
| floatN x_i = float_load(&X_i[i]); | |
| floatN h_r = float_load(&H_r[i]); | |
| floatN h_i = float_load(&H_i[i]); | |
| float_store(&Y_r[i], y_r + x_r * h_r - x_i * h_i); | |
| float_store(&Y_i[i], y_i + x_r * h_i + x_i * h_r); | |
| } | |
| } | |
| // Linear convolution applying a transfer function (IR) to the input buffer. | |
| // | |
| // This is done with FFT(X) * FFT(IR). Since this method would produce circular | |
| // convolution, the inputs are padded to 2N. The second half is saved / added | |
| // to the next call to fir_filter, allowing continuous output. | |
| // | |
| // This approach introduces delay equal to the size of the overlap buffer. It | |
| // is generally fine for reverbs, but faster responding IRs require smaller | |
| // block sizes (which is also slower). If this becomes a problem, consider | |
| // splitting the IR into a short head and long tail for faster response of | |
| // early reflections. | |
| void | |
| dsp_fir_filter(struct fir_filter *f, int chan, struct audio_buffer *buffer, uint64 starttime, uint64 endtime) | |
| { | |
| uint64 t; | |
| unsigned i, k; | |
| attr(aligned(32)) float output[dsp_complex_array_count(dsp_rfft_size(FFT_MAX_SIZE))]; | |
| attr(aligned(32)) float output_real[FFT_MAX_SIZE]; | |
| float dry = 1.0f - f->wet; | |
| if (!f->nblocks) | |
| return; | |
| profile(); | |
| unsigned processed = 0; | |
| unsigned nsamples = endtime - starttime; | |
| unsigned overlapsize = dsp_overlap_size(f->fftsize); | |
| while (processed < nsamples) { | |
| unsigned count = min(nsamples - processed, overlapsize - f->t); | |
| // FFT input buffer | |
| for (i = 0; i < count; ++i) { | |
| t = starttime + processed + i; | |
| float *frame = (float *)&buffer->samples[t & buffer->mask]; | |
| float in = frame[chan]; // frame->left + frame->right | |
| f->input_real[i + f->t] = 0.5f * in; | |
| } | |
| dsp_rfft(&f->input_blocks[f->block * f->blocksize], f->input_real, f->fftsize); | |
| // multiply input blocks with all IR blocks | |
| // use the cached convolution if we don't have a full block yet | |
| if (f->t == 0) { | |
| profile_value("init_block", f->nblocks); | |
| memset(f->output, 0, f->blocksize * sizeof *f->output); | |
| for (k = 1; k < f->nblocks; ++k) { | |
| unsigned block = (f->block - k + f->nblocks) % f->nblocks; | |
| float *Xk = &f->input_blocks[block * f->blocksize]; | |
| float *Hk = &f->transferfunc_blocks[k * f->blocksize]; | |
| dsp_complex_mul(f->output, Xk, Hk, dsp_rfft_size(f->fftsize)); | |
| } | |
| } | |
| // apply the most recent (potentially incomplete) input block with | |
| // the first IR block | |
| memcpy(output, f->output, f->blocksize * sizeof *f->output); | |
| dsp_complex_mul(output, | |
| &f->input_blocks[f->block * f->blocksize], | |
| &f->transferfunc_blocks[0], | |
| dsp_rfft_size(f->fftsize)); | |
| dsp_rifft(output_real, output, f->fftsize); | |
| // overlapp add | |
| for (i = 0; i < count; ++i) { | |
| t = starttime + processed + i; | |
| float *frame = (float *)&buffer->samples[t & buffer->mask]; | |
| // 0.5 tweak | |
| float out = 0.5f * (output_real[i + f->t] + f->overlap_real[f->t + i]); | |
| frame[chan] = dry * frame[chan] + f->wet * out; | |
| } | |
| f->t += count; | |
| if (f->t == overlapsize) { | |
| // update overlap buffer | |
| memcpy(f->overlap_real, output_real + overlapsize, overlapsize * sizeof(float)); | |
| // starting a new block | |
| memset(f->input_real, 0, f->fftsize * sizeof *f->input_real); | |
| f->block = (f->block + 1) % f->nblocks; | |
| f->t = 0; | |
| } | |
| processed += count; | |
| } | |
| } | |
| void * | |
| dsp_fir_filter_alloc(void *arena, usize *arenasize, struct dsproom *rir) | |
| { | |
| usize fftmem = rir->blocksize * rir->nchannels * sizeof(float); | |
| usize realmem = rir->fftsize * rir->nchannels * sizeof(float); | |
| usize newsize | |
| = fftmem * rir->nblocks // input | |
| + fftmem // output | |
| + realmem // input_real | |
| + realmem; // overlap_real | |
| if (!arena || newsize > *arenasize) { | |
| sys_free(arena); | |
| arena = sys_alloc(H_OS, newsize); | |
| *arenasize = newsize; | |
| } | |
| return arena; | |
| } | |
| unsigned | |
| dsp_fir_filter_init(struct fir_filter *filters, unsigned nfilters, void *arena, struct dsproom *rir) | |
| { | |
| unsigned nchannels = min(nfilters, rir->nchannels); | |
| usize fftmem = rir->blocksize * sizeof(float); | |
| usize realmem = rir->fftsize * sizeof(float); | |
| assert((fftmem & 31) == 0); | |
| assert((realmem & 31) == 0); | |
| for (unsigned chan = 0; chan < nchannels; ++chan) { | |
| struct fir_filter *f = &filters[chan]; | |
| f->fftsize = rir->fftsize; | |
| f->blocksize = rir->blocksize; | |
| f->nblocks = rir->nblocks; | |
| f->wet = rir->wet; | |
| f->output = arena; | |
| arena += fftmem; | |
| f->input_real = arena; | |
| arena += realmem; | |
| f->overlap_real = arena; | |
| arena += realmem; | |
| f->transferfunc_blocks = &cl_base->fftdata[rir->data]; | |
| f->input_blocks = arena; | |
| arena += fftmem * rir->nblocks; | |
| } | |
| return nchannels; | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment