p0nce · February 19, 2019 01:57
diff --git a/gistfile1.txt b/gistfile1.txt
 /**
 Color-correction post-effect.
 This is a widget intended to correct colors before display, at the Raw level.

 Copyright: Guillaume Piolat 2018.
 License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 */
 module dplug.pbrwidgets.colorcorrection;


 import gfm.math.matrix;

 import dplug.core.math;
 import dplug.gui.element;

 version = oldColorCorrection;

 version(oldColorCorrection)
 {

 /// FlatBackgroundGUI provides a background that is loaded from a PNG or JPEG
 /// image. The string for backgroundPath should be in "stringImportPaths"
 /// specified in dub.json
 class UIColorCorrection : UIElement
 {
 public:
 nothrow:
 @nogc:

    this(UIContext context)
    {
        super(context, flagRaw);

        _tableArea.reallocBuffer(256 * 3, 32);
        _redTransferTable = _tableArea.ptr;
        _greenTransferTable = _tableArea.ptr + 256;
        _blueTransferTable = _tableArea.ptr + 512;

        // Set identity tables
        foreach(i; 0..256)
        {
            _redTransferTable[i] = cast(ubyte)i;
            _greenTransferTable[i] = cast(ubyte)i;
            _blueTransferTable[i] = cast(ubyte)i;
        }
    }

    ~this()
    {
        _tableArea.reallocBuffer(0, 32);
    }

    /// Calling this setup color correction table, with the well
    /// known lift-gamma-gain formula.
    void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f)
    {
        setLiftGammaGainContrastRGB(lift, gamma, gain, contrast,
                                    lift, gamma, gain, contrast,
                                    lift, gamma, gain, contrast);
    }

    /// Calling this setup color correction table, with the well
    /// known lift-gamma-gain formula, per channel.
    void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f,
                             float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f,
                             float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f)
    {
        setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f,
                                    gLift, gGamma, gGain, 0.0f,
                                    bLift, bGamma, bGain, 0.0f);
    }

    /// Calling this setup color correction table, with the less
    /// known lift-gamma-gain formula + contrast addition, per channel.
    void setLiftGammaGainContrastRGB(
                                     float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f,
                                     float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f,
                                     float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f)
    {
        static float safePow(float a, float b) nothrow @nogc
        {
            if (a < 0)
                a = 0;
            if (a > 1)
                a = 1;
            return a ^^ b;
        }

        for (int b = 0; b < 256; ++b)
        {
            float inp = b / 255.0f;
            float outR = rGain*(inp + rLift*(1-inp));
            float outG = gGain*(inp + gLift*(1-inp));
            float outB = bGain*(inp + bLift*(1-inp));

            outR = safePow(outR, 1.0f / rGamma );
            outG = safePow(outG, 1.0f / gGamma );
            outB = safePow(outB, 1.0f / bGamma );

            if (outR < 0)
                outR = 0;
            if (outG < 0)
                outG = 0;
            if (outB < 0)
                outB = 0;
            if (outR > 1)
                outR = 1;
            if (outG > 1)
                outG = 1;
            if (outB > 1)
                outB = 1;

            outR = lerp!float(outR, smoothStep!float(0, 1, outR), rContrast);
            outG = lerp!float(outG, smoothStep!float(0, 1, outG), gContrast);
            outB = lerp!float(outB, smoothStep!float(0, 1, outB), bContrast);

            _redTransferTable[b] = cast(ubyte)(0.5f + outR * 255.0f);
            _greenTransferTable[b] = cast(ubyte)(0.5f + outG * 255.0f);
            _blueTransferTable[b] = cast(ubyte)(0.5f + outB * 255.0f);
        }
        setDirtyWhole();
    }

    /// ditto
    void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast)
    {
        auto m = liftGammaGainContrast;
        setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3],
                                    m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3],
                                    m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]);
    }
    
    override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects)
    {
        foreach(dirtyRect; dirtyRects)
        {
            rawMap.cropImageRef(dirtyRect).applyColorCorrection(_redTransferTable);
        }
    }

 private:
    ubyte[] _tableArea = null;
    ubyte* _redTransferTable;
    ubyte* _greenTransferTable;
    ubyte* _blueTransferTable;
 }

 void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
 {
    int w = image.w;
    int h = image.h;
    for (int j = 0; j < h; ++j)
    {
        ubyte* scan = cast(ubyte*)image.scanline(j).ptr;
        for (int i = 0; i < w; ++i)
        {
            ubyte r = scan[4*i];
            ubyte g = scan[4*i+1];
            ubyte b = scan[4*i+2];
            scan[4*i]   = rgbTable[r];
            scan[4*i+1] = rgbTable[g+256];
            scan[4*i+2] = rgbTable[b+512];
        }
    }
 }

 }
 else
 {

 import inteli.xmmintrin;
 import inteli.emmintrin;

 class UIColorCorrection : UIElement
 {
 public:
 nothrow:
 @nogc:

    this(UIContext context)
    {
        super(context, flagRaw);
        _tempBuffer.reallocBuffer(256);    
        _coef.reallocBuffer(16, 16);
    }

    ~this()
    {
        _tempBuffer.reallocBuffer(0);
        _coef.reallocBuffer(0, 16);
    }

    /// Calling this setup color correction table, with the well
    /// known lift-gamma-gain formula.
    void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f)
    {
        setLiftGammaGainContrastRGB(lift, gamma, gain, contrast,
                                    lift, gamma, gain, contrast,
                                    lift, gamma, gain, contrast);
    }

    /// Calling this setup color correction table, with the well
    /// known lift-gamma-gain formula, per channel.
    void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f,
                             float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f,
                             float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f)
    {
        setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f,
                                    gLift, gGamma, gGain, 0.0f,
                                    bLift, bGamma, bGain, 0.0f);
    }

    /// Calling this setup color correction table, with the less
    /// known lift-gamma-gain formula + contrast addition, per channel.
    void setLiftGammaGainContrastRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f,
                                     float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f,
                                     float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f)
    {

        float[4] redPoly   = approximateComponentCurve(rLift, rGamma, rGain, rContrast, _tempBuffer.ptr);
        float[4] greenPoly = approximateComponentCurve(gLift, gGamma, gGain, gContrast, _tempBuffer.ptr);
        float[4] bluePoly  = approximateComponentCurve(bLift, bGamma, bGain, bContrast, _tempBuffer.ptr);
        float[4] alphaPoly = [0.0f, 0.0f, 1.0f, 0.0f];
        foreach(i; 0..4)
        {
            _coef[4*i+0] = redPoly[i];
            _coef[4*i+1] = greenPoly[i];
            _coef[4*i+2] = bluePoly[i];
            _coef[4*i+3] = alphaPoly[i];
        }
        _coef[12] += 0.5f; // for rounding before truncation to int
        _coef[13] += 0.5f;
        _coef[14] += 0.5f;
        _coef[15] += 0.5f;
        setDirtyWhole();
    }

    /// ditto
    void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast)
    {
        auto m = liftGammaGainContrast;
        setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3],
                                    m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3],
                                    m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]);
    }  

    override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects)
    {
        foreach(dirtyRect; dirtyRects)
        {
            rawMap.cropImageRef(dirtyRect).applyColorCorrectionApprox(_coef.ptr);
        }
    }

 private:

    // 4x 3rd order polynomials for remapping components
    // A_red A_green A_blue A_alpha
    // B_red B_green B_blue B_alpha
    // C_red C_green C_blue C_alpha
    // D_red D_green D_blue D_alpha
    float[] _coef;

    // A temporary table, for computing polynomials.
    float[] _tempBuffer = null;
 }

 private:

 void applyColorCorrectionApprox(ImageRef!RGBA image, const(float*) coeff16) pure nothrow @nogc
 {
    immutable int w = image.w;
    immutable int h = image.h;
    for (int j = 0; j < h; ++j)
    {
        ubyte* scan = cast(ubyte*)image.scanline(j).ptr;

        // load all coefficients
        const(__m128) A = _mm_load_ps(coeff16);
        const(__m128) B = _mm_load_ps(coeff16+4);
        const(__m128) C = _mm_load_ps(coeff16+8);
        const(__m128) D = _mm_load_ps(coeff16+12);
        const(__m128i) zero = _mm_setzero_si128();

        // unroll by 4
        int i = 0;
        for(; i + 3 < w; i += 4)
        {
            __m128i rgba = _mm_loadu_si128( cast(__m128i*)(&scan[4*i]) );

            // unpack to 32-bit int
            __m128i rgba_lo8 = _mm_unpacklo_epi8(rgba, zero);
            __m128i rgba_hi8 = _mm_unpackhi_epi8(rgba, zero);
            __m128i rgba0 = _mm_unpacklo_epi16(rgba_lo8, zero);
            __m128i rgba1 = _mm_unpackhi_epi16(rgba_lo8, zero);
            __m128i rgba2 = _mm_unpacklo_epi16(rgba_hi8, zero);
            __m128i rgba3 = _mm_unpackhi_epi16(rgba_hi8, zero);

            // convert to float
            __m128 x0 = _mm_cvtepi32_ps(rgba0);
            __m128 x1 = _mm_cvtepi32_ps(rgba1);
            __m128 x2 = _mm_cvtepi32_ps(rgba2);
            __m128 x3 = _mm_cvtepi32_ps(rgba3);

            // evaluate polynomials using horner's rule
            __m128 y0 = ((A * x0 + B) * x0 + C) * x0 + D;
            __m128 y1 = ((A * x1 + B) * x1 + C) * x1 + D;
            __m128 y2 = ((A * x2 + B) * x2 + C) * x2 + D;
            __m128 y3 = ((A * x3 + B) * x3 + C) * x3 + D;

            rgba0 = _mm_cvttps_epi32(y0);
            rgba1 = _mm_cvttps_epi32(y1);
            rgba2 = _mm_cvttps_epi32(y2);
            rgba3 = _mm_cvttps_epi32(y3);
            rgba0 = _mm_packs_epi32(rgba0, rgba1);
            rgba2 = _mm_packs_epi32(rgba2, rgba3);
            rgba0 = _mm_packus_epi16(rgba0, rgba2);
            _mm_storeu_si128(cast(__m128i*)(&scan[4*i]), rgba0);
        }

        for(; i < w; i++)
        {
            __m128i rgba = _mm_loadu_si32(&scan[4*i]);

            // unpack to 32-bit int
            rgba = _mm_unpacklo_epi8(rgba, zero);
            rgba = _mm_unpacklo_epi16(rgba, zero);

            // convert to float
            __m128 x = _mm_cvtepi32_ps(rgba);

            // evaluate polynomials using horner's rule
            __m128 y = ((A * x + B) * x + C) * x + D;

            rgba = _mm_cvttps_epi32(y);
            rgba = _mm_packs_epi32(rgba, zero);
            rgba = _mm_packus_epi16(rgba, zero);
            *cast(int*)(&scan[4*i]) = rgba[0];
        }

 /+
        for(int i = 0; i < w; i ++)
        {

            float r = scan[4*i+0];
            float g = scan[4*i+1];
            float b = scan[4*i+2];
            float a = scan[4*i+3];
            float outR = coeff16[12] + r * (coeff16[8] + r * (coeff16[4] + r * coeff16[0]));
            float outG = coeff16[13] + g * (coeff16[9] + g * (coeff16[5] + g * coeff16[1]));
            float outB = coeff16[14] + b * (coeff16[10] + b * (coeff16[6] + b * coeff16[2]));
            float outA = coeff16[15] + a * (coeff16[11] + a * (coeff16[7] + a * coeff16[3]));
            int approxR = cast(int)(0.5f + outR);
            int approxG = cast(int)(0.5f + outG);
            int approxB = cast(int)(0.5f + outB);
            int approxA = cast(int)(0.5f + outA);
            if (approxR < 0)
                approxR = 0;
            if (approxG < 0)
                approxG = 0;
            if (approxB < 0)
                approxB = 0;
            if (approxA < 0)
                approxA = 0;
            if (approxR > 255)
                approxR = 255;
            if (approxG > 255)
                approxG = 255;
            if (approxB > 255)
                approxB = 255;
            if (approxA > 255)
                approxA = 255;
            scan[4*i+0] = cast(ubyte)(approxR);
            scan[4*i+1] = cast(ubyte)(approxG);
            scan[4*i+2] = cast(ubyte)(approxB);
            scan[4*i+3] = cast(ubyte)(approxA);
        }+/
    }
 }

 float[4] approximateComponentCurve(float lift, float gamma, float gain, float contrast, float* tempBuffer) nothrow @nogc
 {
    static float safePow(float a, float b) nothrow @nogc
    {
        if (a < 0)
            a = 0;
        if (a > 1)
            a = 1;
        return a ^^ b;
    }

    // create curve for this component
    for (int b = 0; b < 256; ++b)
    {
        float inp = b / 255.0f;
        float outR = gain*(inp + lift*(1-inp));
        outR = safePow(outR, 1.0f / gamma );

        outR = lerp!float(outR, smoothStep!float(0, 1, outR), contrast);
        tempBuffer[b] = outR * 255.0f;
    }

    // First first point above 0
    int first = 0;
    int last = 255;
    while (tempBuffer[first] == 0) first++;
    while (tempBuffer[last] == 255) last--;

    // quite significant for the quality of interp
    first += 10;
    last -= 10;

    // Choose some points to interpolate
    float x0 = first;
    float y0 = tempBuffer[first];
    int x1i = (first * 2 + last + 1) / 3;
    int x2i = (first * 1 + last*2 + 2) / 3;
    float x1 = x1i;
    float y1 = tempBuffer[x1i];
    float x2 = x2i;
    float y2 = tempBuffer[x2i];
    float x3 = last;
    float y3 = tempBuffer[last];

    // Lagrange to find 3rd degree polynomial, passing into (xn, yn)
    // P(x) = Ax^3 + Bx^2 + Cx + D

    float d0 = (x0-x1)*(x0-x2)*(x0-x3);
    float d1 = (x1-x0)*(x1-x2)*(x1-x3);
    float d2 = (x2-x0)*(x2-x1)*(x2-x3);
    float d3 = (x3-x0)*(x3-x1)*(x3-x2);
    y0 /= d0;
    y1 /= d1;
    y2 /= d2;
    y3 /= d3;

    // first term of Lagrange Polynomial => (x^3 - x²*(x1+x2+x3) + x * (x2*x1 + x3*x1 + x3*x2) - x3*x2*x1)*y0
    float A = y0 + y1 + y2 + y3;
    float B = -(x1+x2+x3)*(y0) - (x0+x2+x3)*(y1) - (x0+x1+x3)*(y2) - (x0+x1+x2)*(y3);
    float C = (x2*x1 + x3*x1 + x3*x2)*(y0) + (x2*x0 + x3*x0 + x3*x2)*(y1) + (x1*x0 + x3*x0 + x3*x1)*(y2) + (x1*x0 + x2*x0 + x2*x1)*(y3);
    float D = - x1*x2*x3*(y0) - x0*x2*x3*(y1) - x0*x1*x3*(y2) - x0*x1*x2*(y3);
    return [A, B, C, D];
 }

 }
	/**
	Color-correction post-effect.
	This is a widget intended to correct colors before display, at the Raw level.

	Copyright: Guillaume Piolat 2018.
	License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
	*/
	module dplug.pbrwidgets.colorcorrection;


	import gfm.math.matrix;

	import dplug.core.math;
	import dplug.gui.element;

	version = oldColorCorrection;

	version(oldColorCorrection)
	{

	/// FlatBackgroundGUI provides a background that is loaded from a PNG or JPEG
	/// image. The string for backgroundPath should be in "stringImportPaths"
	/// specified in dub.json
	class UIColorCorrection : UIElement
	{
	public:
	nothrow:
	@nogc:

	this(UIContext context)
	{
	super(context, flagRaw);

	_tableArea.reallocBuffer(256 * 3, 32);
	_redTransferTable = _tableArea.ptr;
	_greenTransferTable = _tableArea.ptr + 256;
	_blueTransferTable = _tableArea.ptr + 512;

	// Set identity tables
	foreach(i; 0..256)
	{
	_redTransferTable[i] = cast(ubyte)i;
	_greenTransferTable[i] = cast(ubyte)i;
	_blueTransferTable[i] = cast(ubyte)i;
	}
	}

	~this()
	{
	_tableArea.reallocBuffer(0, 32);
	}

	/// Calling this setup color correction table, with the well
	/// known lift-gamma-gain formula.
	void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f)
	{
	setLiftGammaGainContrastRGB(lift, gamma, gain, contrast,
	lift, gamma, gain, contrast,
	lift, gamma, gain, contrast);
	}

	/// Calling this setup color correction table, with the well
	/// known lift-gamma-gain formula, per channel.
	void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f,
	float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f,
	float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f)
	{
	setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f,
	gLift, gGamma, gGain, 0.0f,
	bLift, bGamma, bGain, 0.0f);
	}

	/// Calling this setup color correction table, with the less
	/// known lift-gamma-gain formula + contrast addition, per channel.
	void setLiftGammaGainContrastRGB(
	float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f,
	float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f,
	float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f)
	{
	static float safePow(float a, float b) nothrow @nogc
	{
	if (a < 0)
	a = 0;
	if (a > 1)
	a = 1;
	return a ^^ b;
	}

	for (int b = 0; b < 256; ++b)
	{
	float inp = b / 255.0f;
	float outR = rGain(inp + rLift(1-inp));
	float outG = gGain(inp + gLift(1-inp));
	float outB = bGain(inp + bLift(1-inp));

	outR = safePow(outR, 1.0f / rGamma );
	outG = safePow(outG, 1.0f / gGamma );
	outB = safePow(outB, 1.0f / bGamma );

	if (outR < 0)
	outR = 0;
	if (outG < 0)
	outG = 0;
	if (outB < 0)
	outB = 0;
	if (outR > 1)
	outR = 1;
	if (outG > 1)
	outG = 1;
	if (outB > 1)
	outB = 1;

	outR = lerp!float(outR, smoothStep!float(0, 1, outR), rContrast);
	outG = lerp!float(outG, smoothStep!float(0, 1, outG), gContrast);
	outB = lerp!float(outB, smoothStep!float(0, 1, outB), bContrast);

	_redTransferTable[b] = cast(ubyte)(0.5f + outR * 255.0f);
	_greenTransferTable[b] = cast(ubyte)(0.5f + outG * 255.0f);
	_blueTransferTable[b] = cast(ubyte)(0.5f + outB * 255.0f);
	}
	setDirtyWhole();
	}

	/// ditto
	void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast)
	{
	auto m = liftGammaGainContrast;
	setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3],
	m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3],
	m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]);
	}

	override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects)
	{
	foreach(dirtyRect; dirtyRects)
	{
	rawMap.cropImageRef(dirtyRect).applyColorCorrection(_redTransferTable);
	}
	}

	private:
	ubyte[] _tableArea = null;
	ubyte* _redTransferTable;
	ubyte* _greenTransferTable;
	ubyte* _blueTransferTable;
	}

	void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc
	{
	int w = image.w;
	int h = image.h;
	for (int j = 0; j < h; ++j)
	{
	ubyte* scan = cast(ubyte*)image.scanline(j).ptr;
	for (int i = 0; i < w; ++i)
	{
	ubyte r = scan[4*i];
	ubyte g = scan[4*i+1];
	ubyte b = scan[4*i+2];
	scan[4*i] = rgbTable[r];
	scan[4*i+1] = rgbTable[g+256];
	scan[4*i+2] = rgbTable[b+512];
	}
	}
	}

	}
	else
	{

	import inteli.xmmintrin;
	import inteli.emmintrin;

	class UIColorCorrection : UIElement
	{
	public:
	nothrow:
	@nogc:

	this(UIContext context)
	{
	super(context, flagRaw);
	_tempBuffer.reallocBuffer(256);
	_coef.reallocBuffer(16, 16);
	}

	~this()
	{
	_tempBuffer.reallocBuffer(0);
	_coef.reallocBuffer(0, 16);
	}

	/// Calling this setup color correction table, with the well
	/// known lift-gamma-gain formula.
	void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f)
	{
	setLiftGammaGainContrastRGB(lift, gamma, gain, contrast,
	lift, gamma, gain, contrast,
	lift, gamma, gain, contrast);
	}

	/// Calling this setup color correction table, with the well
	/// known lift-gamma-gain formula, per channel.
	void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f,
	float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f,
	float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f)
	{
	setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f,
	gLift, gGamma, gGain, 0.0f,
	bLift, bGamma, bGain, 0.0f);
	}

	/// Calling this setup color correction table, with the less
	/// known lift-gamma-gain formula + contrast addition, per channel.
	void setLiftGammaGainContrastRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f,
	float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f,
	float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f)
	{

	float[4] redPoly = approximateComponentCurve(rLift, rGamma, rGain, rContrast, _tempBuffer.ptr);
	float[4] greenPoly = approximateComponentCurve(gLift, gGamma, gGain, gContrast, _tempBuffer.ptr);
	float[4] bluePoly = approximateComponentCurve(bLift, bGamma, bGain, bContrast, _tempBuffer.ptr);
	float[4] alphaPoly = [0.0f, 0.0f, 1.0f, 0.0f];
	foreach(i; 0..4)
	{
	_coef[4*i+0] = redPoly[i];
	_coef[4*i+1] = greenPoly[i];
	_coef[4*i+2] = bluePoly[i];
	_coef[4*i+3] = alphaPoly[i];
	}
	_coef[12] += 0.5f; // for rounding before truncation to int
	_coef[13] += 0.5f;
	_coef[14] += 0.5f;
	_coef[15] += 0.5f;
	setDirtyWhole();
	}

	/// ditto
	void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast)
	{
	auto m = liftGammaGainContrast;
	setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3],
	m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3],
	m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]);
	}

	override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects)
	{
	foreach(dirtyRect; dirtyRects)
	{
	rawMap.cropImageRef(dirtyRect).applyColorCorrectionApprox(_coef.ptr);
	}
	}

	private:

	// 4x 3rd order polynomials for remapping components
	// A_red A_green A_blue A_alpha
	// B_red B_green B_blue B_alpha
	// C_red C_green C_blue C_alpha
	// D_red D_green D_blue D_alpha
	float[] _coef;

	// A temporary table, for computing polynomials.
	float[] _tempBuffer = null;
	}

	private:

	void applyColorCorrectionApprox(ImageRef!RGBA image, const(float*) coeff16) pure nothrow @nogc
	{
	immutable int w = image.w;
	immutable int h = image.h;
	for (int j = 0; j < h; ++j)
	{
	ubyte* scan = cast(ubyte*)image.scanline(j).ptr;

	// load all coefficients
	const(__m128) A = _mm_load_ps(coeff16);
	const(__m128) B = _mm_load_ps(coeff16+4);
	const(__m128) C = _mm_load_ps(coeff16+8);
	const(__m128) D = _mm_load_ps(coeff16+12);
	const(__m128i) zero = _mm_setzero_si128();

	// unroll by 4
	int i = 0;
	for(; i + 3 < w; i += 4)
	{
	__m128i rgba = _mm_loadu_si128( cast(__m128i)(&scan[4i]) );

	// unpack to 32-bit int
	__m128i rgba_lo8 = _mm_unpacklo_epi8(rgba, zero);
	__m128i rgba_hi8 = _mm_unpackhi_epi8(rgba, zero);
	__m128i rgba0 = _mm_unpacklo_epi16(rgba_lo8, zero);
	__m128i rgba1 = _mm_unpackhi_epi16(rgba_lo8, zero);
	__m128i rgba2 = _mm_unpacklo_epi16(rgba_hi8, zero);
	__m128i rgba3 = _mm_unpackhi_epi16(rgba_hi8, zero);

	// convert to float
	__m128 x0 = _mm_cvtepi32_ps(rgba0);
	__m128 x1 = _mm_cvtepi32_ps(rgba1);
	__m128 x2 = _mm_cvtepi32_ps(rgba2);
	__m128 x3 = _mm_cvtepi32_ps(rgba3);

	// evaluate polynomials using horner's rule
	__m128 y0 = ((A * x0 + B) * x0 + C) * x0 + D;
	__m128 y1 = ((A * x1 + B) * x1 + C) * x1 + D;
	__m128 y2 = ((A * x2 + B) * x2 + C) * x2 + D;
	__m128 y3 = ((A * x3 + B) * x3 + C) * x3 + D;

	rgba0 = _mm_cvttps_epi32(y0);
	rgba1 = _mm_cvttps_epi32(y1);
	rgba2 = _mm_cvttps_epi32(y2);
	rgba3 = _mm_cvttps_epi32(y3);
	rgba0 = _mm_packs_epi32(rgba0, rgba1);
	rgba2 = _mm_packs_epi32(rgba2, rgba3);
	rgba0 = _mm_packus_epi16(rgba0, rgba2);
	_mm_storeu_si128(cast(__m128i)(&scan[4i]), rgba0);
	}

	for(; i < w; i++)
	{
	__m128i rgba = _mm_loadu_si32(&scan[4*i]);

	// unpack to 32-bit int
	rgba = _mm_unpacklo_epi8(rgba, zero);
	rgba = _mm_unpacklo_epi16(rgba, zero);

	// convert to float
	__m128 x = _mm_cvtepi32_ps(rgba);

	// evaluate polynomials using horner's rule
	__m128 y = ((A * x + B) * x + C) * x + D;

	rgba = _mm_cvttps_epi32(y);
	rgba = _mm_packs_epi32(rgba, zero);
	rgba = _mm_packus_epi16(rgba, zero);
	cast(int)(&scan[4*i]) = rgba[0];
	}

	/+
	for(int i = 0; i < w; i ++)
	{

	float r = scan[4*i+0];
	float g = scan[4*i+1];
	float b = scan[4*i+2];
	float a = scan[4*i+3];
	float outR = coeff16[12] + r * (coeff16[8] + r * (coeff16[4] + r * coeff16[0]));
	float outG = coeff16[13] + g * (coeff16[9] + g * (coeff16[5] + g * coeff16[1]));
	float outB = coeff16[14] + b * (coeff16[10] + b * (coeff16[6] + b * coeff16[2]));
	float outA = coeff16[15] + a * (coeff16[11] + a * (coeff16[7] + a * coeff16[3]));
	int approxR = cast(int)(0.5f + outR);
	int approxG = cast(int)(0.5f + outG);
	int approxB = cast(int)(0.5f + outB);
	int approxA = cast(int)(0.5f + outA);
	if (approxR < 0)
	approxR = 0;
	if (approxG < 0)
	approxG = 0;
	if (approxB < 0)
	approxB = 0;
	if (approxA < 0)
	approxA = 0;
	if (approxR > 255)
	approxR = 255;
	if (approxG > 255)
	approxG = 255;
	if (approxB > 255)
	approxB = 255;
	if (approxA > 255)
	approxA = 255;
	scan[4*i+0] = cast(ubyte)(approxR);
	scan[4*i+1] = cast(ubyte)(approxG);
	scan[4*i+2] = cast(ubyte)(approxB);
	scan[4*i+3] = cast(ubyte)(approxA);
	}+/
	}
	}

	float[4] approximateComponentCurve(float lift, float gamma, float gain, float contrast, float* tempBuffer) nothrow @nogc
	{
	static float safePow(float a, float b) nothrow @nogc
	{
	if (a < 0)
	a = 0;
	if (a > 1)
	a = 1;
	return a ^^ b;
	}

	// create curve for this component
	for (int b = 0; b < 256; ++b)
	{
	float inp = b / 255.0f;
	float outR = gain(inp + lift(1-inp));
	outR = safePow(outR, 1.0f / gamma );

	outR = lerp!float(outR, smoothStep!float(0, 1, outR), contrast);
	tempBuffer[b] = outR * 255.0f;
	}

	// First first point above 0
	int first = 0;
	int last = 255;
	while (tempBuffer[first] == 0) first++;
	while (tempBuffer[last] == 255) last--;

	// quite significant for the quality of interp
	first += 10;
	last -= 10;

	// Choose some points to interpolate
	float x0 = first;
	float y0 = tempBuffer[first];
	int x1i = (first * 2 + last + 1) / 3;
	int x2i = (first * 1 + last*2 + 2) / 3;
	float x1 = x1i;
	float y1 = tempBuffer[x1i];
	float x2 = x2i;
	float y2 = tempBuffer[x2i];
	float x3 = last;
	float y3 = tempBuffer[last];

	// Lagrange to find 3rd degree polynomial, passing into (xn, yn)
	// P(x) = Ax^3 + Bx^2 + Cx + D

	float d0 = (x0-x1)(x0-x2)(x0-x3);
	float d1 = (x1-x0)(x1-x2)(x1-x3);
	float d2 = (x2-x0)(x2-x1)(x2-x3);
	float d3 = (x3-x0)(x3-x1)(x3-x2);
	y0 /= d0;
	y1 /= d1;
	y2 /= d2;
	y3 /= d3;

	// first term of Lagrange Polynomial => (x^3 - x²(x1+x2+x3) + x (x2x1 + x3x1 + x3x2) - x3x2x1)y0
	float A = y0 + y1 + y2 + y3;
	float B = -(x1+x2+x3)(y0) - (x0+x2+x3)(y1) - (x0+x1+x3)(y2) - (x0+x1+x2)(y3);
	float C = (x2x1 + x3x1 + x3x2)(y0) + (x2x0 + x3x0 + x3x2)(y1) + (x1x0 + x3x0 + x3x1)(y2) + (x1x0 + x2x0 + x2x1)(y3);
	float D = - x1x2x3(y0) - x0x2x3(y1) - x0x1x3(y2) - x0x1x2(y3);
	return [A, B, C, D];
	}

	}
No results found