Skip to content

Instantly share code, notes, and snippets.

@p0nce
Created February 19, 2019 01:57
Show Gist options
  • Select an option

  • Save p0nce/027324584b79d26acc5304defe235059 to your computer and use it in GitHub Desktop.

Select an option

Save p0nce/027324584b79d26acc5304defe235059 to your computer and use it in GitHub Desktop.
Another failure to optimize table lookup
/**
Color-correction post-effect.
This is a widget intended to correct colors before display, at the Raw level.
Copyright: Guillaume Piolat 2018.
License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module dplug.pbrwidgets.colorcorrection;
import gfm.math.matrix;
import dplug.core.math;
import dplug.gui.element;
version = oldColorCorrection;
version(oldColorCorrection)
{
/// FlatBackgroundGUI provides a background that is loaded from a PNG or JPEG
/// image. The string for backgroundPath should be in "stringImportPaths"
/// specified in dub.json
class UIColorCorrection : UIElement
{
public:
nothrow:
@nogc:
this(UIContext context)
{
super(context, flagRaw);
_tableArea.reallocBuffer(256 * 3, 32);
_redTransferTable = _tableArea.ptr;
_greenTransferTable = _tableArea.ptr + 256;
_blueTransferTable = _tableArea.ptr + 512;
// Set identity tables
foreach(i; 0..256)
{
_redTransferTable[i] = cast(ubyte)i;
_greenTransferTable[i] = cast(ubyte)i;
_blueTransferTable[i] = cast(ubyte)i;
}
}
~this()
{
_tableArea.reallocBuffer(0, 32);
}
/// Calling this setup color correction table, with the well
/// known lift-gamma-gain formula.
void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f)
{
setLiftGammaGainContrastRGB(lift, gamma, gain, contrast,
lift, gamma, gain, contrast,
lift, gamma, gain, contrast);
}
/// Calling this setup color correction table, with the well
/// known lift-gamma-gain formula, per channel.
void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f,
float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f,
float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f)
{
setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f,
gLift, gGamma, gGain, 0.0f,
bLift, bGamma, bGain, 0.0f);
}
/// Calling this setup color correction table, with the less
/// known lift-gamma-gain formula + contrast addition, per channel.
void setLiftGammaGainContrastRGB(
float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f,
float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f,
float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f)
{
static float safePow(float a, float b) nothrow @nogc
{
if (a < 0)
a = 0;
if (a > 1)
a = 1;
return a ^^ b;
}
for (int b = 0; b < 256; ++b)
{
float inp = b / 255.0f;
float outR = rGain*(inp + rLift*(1-inp));
float outG = gGain*(inp + gLift*(1-inp));
float outB = bGain*(inp + bLift*(1-inp));
outR = safePow(outR, 1.0f / rGamma );
outG = safePow(outG, 1.0f / gGamma );
outB = safePow(outB, 1.0f / bGamma );
if (outR < 0)
outR = 0;
if (outG < 0)
outG = 0;
if (outB < 0)
outB = 0;
if (outR > 1)
outR = 1;
if (outG > 1)
outG = 1;
if (outB > 1)
outB = 1;
outR = lerp!float(outR, smoothStep!float(0, 1, outR), rContrast);
outG = lerp!float(outG, smoothStep!float(0, 1, outG), gContrast);
outB = lerp!float(outB, smoothStep!float(0, 1, outB), bContrast);
_redTransferTable[b] = cast(ubyte)(0.5f + outR * 255.0f);
_greenTransferTable[b] = cast(ubyte)(0.5f + outG * 255.0f);
_blueTransferTable[b] = cast(ubyte)(0.5f + outB * 255.0f);
}
setDirtyWhole();
}
/// ditto
void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast)
{
auto m = liftGammaGainContrast;
setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3],
m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3],
m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]);
}
override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects)
{
foreach(dirtyRect; dirtyRects)
{
rawMap.cropImageRef(dirtyRect).applyColorCorrection(_redTransferTable);
}
}
private:
ubyte[] _tableArea = null;
ubyte* _redTransferTable;
ubyte* _greenTransferTable;
ubyte* _blueTransferTable;
}
void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
{
int w = image.w;
int h = image.h;
for (int j = 0; j < h; ++j)
{
ubyte* scan = cast(ubyte*)image.scanline(j).ptr;
for (int i = 0; i < w; ++i)
{
ubyte r = scan[4*i];
ubyte g = scan[4*i+1];
ubyte b = scan[4*i+2];
scan[4*i] = rgbTable[r];
scan[4*i+1] = rgbTable[g+256];
scan[4*i+2] = rgbTable[b+512];
}
}
}
}
else
{
import inteli.xmmintrin;
import inteli.emmintrin;
class UIColorCorrection : UIElement
{
public:
nothrow:
@nogc:
this(UIContext context)
{
super(context, flagRaw);
_tempBuffer.reallocBuffer(256);
_coef.reallocBuffer(16, 16);
}
~this()
{
_tempBuffer.reallocBuffer(0);
_coef.reallocBuffer(0, 16);
}
/// Calling this setup color correction table, with the well
/// known lift-gamma-gain formula.
void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f)
{
setLiftGammaGainContrastRGB(lift, gamma, gain, contrast,
lift, gamma, gain, contrast,
lift, gamma, gain, contrast);
}
/// Calling this setup color correction table, with the well
/// known lift-gamma-gain formula, per channel.
void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f,
float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f,
float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f)
{
setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f,
gLift, gGamma, gGain, 0.0f,
bLift, bGamma, bGain, 0.0f);
}
/// Calling this setup color correction table, with the less
/// known lift-gamma-gain formula + contrast addition, per channel.
void setLiftGammaGainContrastRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f,
float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f,
float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f)
{
float[4] redPoly = approximateComponentCurve(rLift, rGamma, rGain, rContrast, _tempBuffer.ptr);
float[4] greenPoly = approximateComponentCurve(gLift, gGamma, gGain, gContrast, _tempBuffer.ptr);
float[4] bluePoly = approximateComponentCurve(bLift, bGamma, bGain, bContrast, _tempBuffer.ptr);
float[4] alphaPoly = [0.0f, 0.0f, 1.0f, 0.0f];
foreach(i; 0..4)
{
_coef[4*i+0] = redPoly[i];
_coef[4*i+1] = greenPoly[i];
_coef[4*i+2] = bluePoly[i];
_coef[4*i+3] = alphaPoly[i];
}
_coef[12] += 0.5f; // for rounding before truncation to int
_coef[13] += 0.5f;
_coef[14] += 0.5f;
_coef[15] += 0.5f;
setDirtyWhole();
}
/// ditto
void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast)
{
auto m = liftGammaGainContrast;
setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3],
m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3],
m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]);
}
override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects)
{
foreach(dirtyRect; dirtyRects)
{
rawMap.cropImageRef(dirtyRect).applyColorCorrectionApprox(_coef.ptr);
}
}
private:
// 4x 3rd order polynomials for remapping components
// A_red A_green A_blue A_alpha
// B_red B_green B_blue B_alpha
// C_red C_green C_blue C_alpha
// D_red D_green D_blue D_alpha
float[] _coef;
// A temporary table, for computing polynomials.
float[] _tempBuffer = null;
}
private:
void applyColorCorrectionApprox(ImageRef!RGBA image, const(float*) coeff16) pure nothrow @nogc
{
immutable int w = image.w;
immutable int h = image.h;
for (int j = 0; j < h; ++j)
{
ubyte* scan = cast(ubyte*)image.scanline(j).ptr;
// load all coefficients
const(__m128) A = _mm_load_ps(coeff16);
const(__m128) B = _mm_load_ps(coeff16+4);
const(__m128) C = _mm_load_ps(coeff16+8);
const(__m128) D = _mm_load_ps(coeff16+12);
const(__m128i) zero = _mm_setzero_si128();
// unroll by 4
int i = 0;
for(; i + 3 < w; i += 4)
{
__m128i rgba = _mm_loadu_si128( cast(__m128i*)(&scan[4*i]) );
// unpack to 32-bit int
__m128i rgba_lo8 = _mm_unpacklo_epi8(rgba, zero);
__m128i rgba_hi8 = _mm_unpackhi_epi8(rgba, zero);
__m128i rgba0 = _mm_unpacklo_epi16(rgba_lo8, zero);
__m128i rgba1 = _mm_unpackhi_epi16(rgba_lo8, zero);
__m128i rgba2 = _mm_unpacklo_epi16(rgba_hi8, zero);
__m128i rgba3 = _mm_unpackhi_epi16(rgba_hi8, zero);
// convert to float
__m128 x0 = _mm_cvtepi32_ps(rgba0);
__m128 x1 = _mm_cvtepi32_ps(rgba1);
__m128 x2 = _mm_cvtepi32_ps(rgba2);
__m128 x3 = _mm_cvtepi32_ps(rgba3);
// evaluate polynomials using horner's rule
__m128 y0 = ((A * x0 + B) * x0 + C) * x0 + D;
__m128 y1 = ((A * x1 + B) * x1 + C) * x1 + D;
__m128 y2 = ((A * x2 + B) * x2 + C) * x2 + D;
__m128 y3 = ((A * x3 + B) * x3 + C) * x3 + D;
rgba0 = _mm_cvttps_epi32(y0);
rgba1 = _mm_cvttps_epi32(y1);
rgba2 = _mm_cvttps_epi32(y2);
rgba3 = _mm_cvttps_epi32(y3);
rgba0 = _mm_packs_epi32(rgba0, rgba1);
rgba2 = _mm_packs_epi32(rgba2, rgba3);
rgba0 = _mm_packus_epi16(rgba0, rgba2);
_mm_storeu_si128(cast(__m128i*)(&scan[4*i]), rgba0);
}
for(; i < w; i++)
{
__m128i rgba = _mm_loadu_si32(&scan[4*i]);
// unpack to 32-bit int
rgba = _mm_unpacklo_epi8(rgba, zero);
rgba = _mm_unpacklo_epi16(rgba, zero);
// convert to float
__m128 x = _mm_cvtepi32_ps(rgba);
// evaluate polynomials using horner's rule
__m128 y = ((A * x + B) * x + C) * x + D;
rgba = _mm_cvttps_epi32(y);
rgba = _mm_packs_epi32(rgba, zero);
rgba = _mm_packus_epi16(rgba, zero);
*cast(int*)(&scan[4*i]) = rgba[0];
}
/+
for(int i = 0; i < w; i ++)
{
float r = scan[4*i+0];
float g = scan[4*i+1];
float b = scan[4*i+2];
float a = scan[4*i+3];
float outR = coeff16[12] + r * (coeff16[8] + r * (coeff16[4] + r * coeff16[0]));
float outG = coeff16[13] + g * (coeff16[9] + g * (coeff16[5] + g * coeff16[1]));
float outB = coeff16[14] + b * (coeff16[10] + b * (coeff16[6] + b * coeff16[2]));
float outA = coeff16[15] + a * (coeff16[11] + a * (coeff16[7] + a * coeff16[3]));
int approxR = cast(int)(0.5f + outR);
int approxG = cast(int)(0.5f + outG);
int approxB = cast(int)(0.5f + outB);
int approxA = cast(int)(0.5f + outA);
if (approxR < 0)
approxR = 0;
if (approxG < 0)
approxG = 0;
if (approxB < 0)
approxB = 0;
if (approxA < 0)
approxA = 0;
if (approxR > 255)
approxR = 255;
if (approxG > 255)
approxG = 255;
if (approxB > 255)
approxB = 255;
if (approxA > 255)
approxA = 255;
scan[4*i+0] = cast(ubyte)(approxR);
scan[4*i+1] = cast(ubyte)(approxG);
scan[4*i+2] = cast(ubyte)(approxB);
scan[4*i+3] = cast(ubyte)(approxA);
}+/
}
}
float[4] approximateComponentCurve(float lift, float gamma, float gain, float contrast, float* tempBuffer) nothrow @nogc
{
static float safePow(float a, float b) nothrow @nogc
{
if (a < 0)
a = 0;
if (a > 1)
a = 1;
return a ^^ b;
}
// create curve for this component
for (int b = 0; b < 256; ++b)
{
float inp = b / 255.0f;
float outR = gain*(inp + lift*(1-inp));
outR = safePow(outR, 1.0f / gamma );
outR = lerp!float(outR, smoothStep!float(0, 1, outR), contrast);
tempBuffer[b] = outR * 255.0f;
}
// First first point above 0
int first = 0;
int last = 255;
while (tempBuffer[first] == 0) first++;
while (tempBuffer[last] == 255) last--;
// quite significant for the quality of interp
first += 10;
last -= 10;
// Choose some points to interpolate
float x0 = first;
float y0 = tempBuffer[first];
int x1i = (first * 2 + last + 1) / 3;
int x2i = (first * 1 + last*2 + 2) / 3;
float x1 = x1i;
float y1 = tempBuffer[x1i];
float x2 = x2i;
float y2 = tempBuffer[x2i];
float x3 = last;
float y3 = tempBuffer[last];
// Lagrange to find 3rd degree polynomial, passing into (xn, yn)
// P(x) = Ax^3 + Bx^2 + Cx + D
float d0 = (x0-x1)*(x0-x2)*(x0-x3);
float d1 = (x1-x0)*(x1-x2)*(x1-x3);
float d2 = (x2-x0)*(x2-x1)*(x2-x3);
float d3 = (x3-x0)*(x3-x1)*(x3-x2);
y0 /= d0;
y1 /= d1;
y2 /= d2;
y3 /= d3;
// first term of Lagrange Polynomial => (x^3 - x²*(x1+x2+x3) + x * (x2*x1 + x3*x1 + x3*x2) - x3*x2*x1)*y0
float A = y0 + y1 + y2 + y3;
float B = -(x1+x2+x3)*(y0) - (x0+x2+x3)*(y1) - (x0+x1+x3)*(y2) - (x0+x1+x2)*(y3);
float C = (x2*x1 + x3*x1 + x3*x2)*(y0) + (x2*x0 + x3*x0 + x3*x2)*(y1) + (x1*x0 + x3*x0 + x3*x1)*(y2) + (x1*x0 + x2*x0 + x2*x1)*(y3);
float D = - x1*x2*x3*(y0) - x0*x2*x3*(y1) - x0*x1*x3*(y2) - x0*x1*x2*(y3);
return [A, B, C, D];
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment