Created
February 19, 2019 01:57
-
-
Save p0nce/027324584b79d26acc5304defe235059 to your computer and use it in GitHub Desktop.
Another failure to optimize table lookup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| Color-correction post-effect. | |
| This is a widget intended to correct colors before display, at the Raw level. | |
| Copyright: Guillaume Piolat 2018. | |
| License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) | |
| */ | |
| module dplug.pbrwidgets.colorcorrection; | |
| import gfm.math.matrix; | |
| import dplug.core.math; | |
| import dplug.gui.element; | |
| version = oldColorCorrection; | |
| version(oldColorCorrection) | |
| { | |
| /// FlatBackgroundGUI provides a background that is loaded from a PNG or JPEG | |
| /// image. The string for backgroundPath should be in "stringImportPaths" | |
| /// specified in dub.json | |
| class UIColorCorrection : UIElement | |
| { | |
| public: | |
| nothrow: | |
| @nogc: | |
| this(UIContext context) | |
| { | |
| super(context, flagRaw); | |
| _tableArea.reallocBuffer(256 * 3, 32); | |
| _redTransferTable = _tableArea.ptr; | |
| _greenTransferTable = _tableArea.ptr + 256; | |
| _blueTransferTable = _tableArea.ptr + 512; | |
| // Set identity tables | |
| foreach(i; 0..256) | |
| { | |
| _redTransferTable[i] = cast(ubyte)i; | |
| _greenTransferTable[i] = cast(ubyte)i; | |
| _blueTransferTable[i] = cast(ubyte)i; | |
| } | |
| } | |
| ~this() | |
| { | |
| _tableArea.reallocBuffer(0, 32); | |
| } | |
| /// Calling this setup color correction table, with the well | |
| /// known lift-gamma-gain formula. | |
| void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f) | |
| { | |
| setLiftGammaGainContrastRGB(lift, gamma, gain, contrast, | |
| lift, gamma, gain, contrast, | |
| lift, gamma, gain, contrast); | |
| } | |
| /// Calling this setup color correction table, with the well | |
| /// known lift-gamma-gain formula, per channel. | |
| void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, | |
| float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, | |
| float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f) | |
| { | |
| setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f, | |
| gLift, gGamma, gGain, 0.0f, | |
| bLift, bGamma, bGain, 0.0f); | |
| } | |
| /// Calling this setup color correction table, with the less | |
| /// known lift-gamma-gain formula + contrast addition, per channel. | |
| void setLiftGammaGainContrastRGB( | |
| float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f, | |
| float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f, | |
| float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f) | |
| { | |
| static float safePow(float a, float b) nothrow @nogc | |
| { | |
| if (a < 0) | |
| a = 0; | |
| if (a > 1) | |
| a = 1; | |
| return a ^^ b; | |
| } | |
| for (int b = 0; b < 256; ++b) | |
| { | |
| float inp = b / 255.0f; | |
| float outR = rGain*(inp + rLift*(1-inp)); | |
| float outG = gGain*(inp + gLift*(1-inp)); | |
| float outB = bGain*(inp + bLift*(1-inp)); | |
| outR = safePow(outR, 1.0f / rGamma ); | |
| outG = safePow(outG, 1.0f / gGamma ); | |
| outB = safePow(outB, 1.0f / bGamma ); | |
| if (outR < 0) | |
| outR = 0; | |
| if (outG < 0) | |
| outG = 0; | |
| if (outB < 0) | |
| outB = 0; | |
| if (outR > 1) | |
| outR = 1; | |
| if (outG > 1) | |
| outG = 1; | |
| if (outB > 1) | |
| outB = 1; | |
| outR = lerp!float(outR, smoothStep!float(0, 1, outR), rContrast); | |
| outG = lerp!float(outG, smoothStep!float(0, 1, outG), gContrast); | |
| outB = lerp!float(outB, smoothStep!float(0, 1, outB), bContrast); | |
| _redTransferTable[b] = cast(ubyte)(0.5f + outR * 255.0f); | |
| _greenTransferTable[b] = cast(ubyte)(0.5f + outG * 255.0f); | |
| _blueTransferTable[b] = cast(ubyte)(0.5f + outB * 255.0f); | |
| } | |
| setDirtyWhole(); | |
| } | |
| /// ditto | |
| void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast) | |
| { | |
| auto m = liftGammaGainContrast; | |
| setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3], | |
| m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3], | |
| m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]); | |
| } | |
| override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects) | |
| { | |
| foreach(dirtyRect; dirtyRects) | |
| { | |
| rawMap.cropImageRef(dirtyRect).applyColorCorrection(_redTransferTable); | |
| } | |
| } | |
| private: | |
| ubyte[] _tableArea = null; | |
| ubyte* _redTransferTable; | |
| ubyte* _greenTransferTable; | |
| ubyte* _blueTransferTable; | |
| } | |
| void applyColorCorrection(ImageRef!RGBA image, const(ubyte*) rgbTable) pure nothrow @nogc | |
| { | |
| int w = image.w; | |
| int h = image.h; | |
| for (int j = 0; j < h; ++j) | |
| { | |
| ubyte* scan = cast(ubyte*)image.scanline(j).ptr; | |
| for (int i = 0; i < w; ++i) | |
| { | |
| ubyte r = scan[4*i]; | |
| ubyte g = scan[4*i+1]; | |
| ubyte b = scan[4*i+2]; | |
| scan[4*i] = rgbTable[r]; | |
| scan[4*i+1] = rgbTable[g+256]; | |
| scan[4*i+2] = rgbTable[b+512]; | |
| } | |
| } | |
| } | |
| } | |
| else | |
| { | |
| import inteli.xmmintrin; | |
| import inteli.emmintrin; | |
| class UIColorCorrection : UIElement | |
| { | |
| public: | |
| nothrow: | |
| @nogc: | |
| this(UIContext context) | |
| { | |
| super(context, flagRaw); | |
| _tempBuffer.reallocBuffer(256); | |
| _coef.reallocBuffer(16, 16); | |
| } | |
| ~this() | |
| { | |
| _tempBuffer.reallocBuffer(0); | |
| _coef.reallocBuffer(0, 16); | |
| } | |
| /// Calling this setup color correction table, with the well | |
| /// known lift-gamma-gain formula. | |
| void setLiftGammaGainContrast(float lift = 0.0f, float gamma = 1.0f, float gain = 1.0f, float contrast = 0.0f) | |
| { | |
| setLiftGammaGainContrastRGB(lift, gamma, gain, contrast, | |
| lift, gamma, gain, contrast, | |
| lift, gamma, gain, contrast); | |
| } | |
| /// Calling this setup color correction table, with the well | |
| /// known lift-gamma-gain formula, per channel. | |
| void setLiftGammaGainRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, | |
| float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, | |
| float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f) | |
| { | |
| setLiftGammaGainContrastRGB(rLift, rGamma, rGain, 0.0f, | |
| gLift, gGamma, gGain, 0.0f, | |
| bLift, bGamma, bGain, 0.0f); | |
| } | |
| /// Calling this setup color correction table, with the less | |
| /// known lift-gamma-gain formula + contrast addition, per channel. | |
| void setLiftGammaGainContrastRGB(float rLift = 0.0f, float rGamma = 1.0f, float rGain = 1.0f, float rContrast = 0.0f, | |
| float gLift = 0.0f, float gGamma = 1.0f, float gGain = 1.0f, float gContrast = 0.0f, | |
| float bLift = 0.0f, float bGamma = 1.0f, float bGain = 1.0f, float bContrast = 0.0f) | |
| { | |
| float[4] redPoly = approximateComponentCurve(rLift, rGamma, rGain, rContrast, _tempBuffer.ptr); | |
| float[4] greenPoly = approximateComponentCurve(gLift, gGamma, gGain, gContrast, _tempBuffer.ptr); | |
| float[4] bluePoly = approximateComponentCurve(bLift, bGamma, bGain, bContrast, _tempBuffer.ptr); | |
| float[4] alphaPoly = [0.0f, 0.0f, 1.0f, 0.0f]; | |
| foreach(i; 0..4) | |
| { | |
| _coef[4*i+0] = redPoly[i]; | |
| _coef[4*i+1] = greenPoly[i]; | |
| _coef[4*i+2] = bluePoly[i]; | |
| _coef[4*i+3] = alphaPoly[i]; | |
| } | |
| _coef[12] += 0.5f; // for rounding before truncation to int | |
| _coef[13] += 0.5f; | |
| _coef[14] += 0.5f; | |
| _coef[15] += 0.5f; | |
| setDirtyWhole(); | |
| } | |
| /// ditto | |
| void setLiftGammaGainContrastRGB(mat3x4f liftGammaGainContrast) | |
| { | |
| auto m = liftGammaGainContrast; | |
| setLiftGammaGainContrastRGB(m.c[0][0], m.c[0][1], m.c[0][2], m.c[0][3], | |
| m.c[1][0], m.c[1][1], m.c[1][2], m.c[1][3], | |
| m.c[2][0], m.c[2][1], m.c[2][2], m.c[2][3]); | |
| } | |
| override void onDrawRaw(ImageRef!RGBA rawMap, box2i[] dirtyRects) | |
| { | |
| foreach(dirtyRect; dirtyRects) | |
| { | |
| rawMap.cropImageRef(dirtyRect).applyColorCorrectionApprox(_coef.ptr); | |
| } | |
| } | |
| private: | |
| // 4x 3rd order polynomials for remapping components | |
| // A_red A_green A_blue A_alpha | |
| // B_red B_green B_blue B_alpha | |
| // C_red C_green C_blue C_alpha | |
| // D_red D_green D_blue D_alpha | |
| float[] _coef; | |
| // A temporary table, for computing polynomials. | |
| float[] _tempBuffer = null; | |
| } | |
| private: | |
| void applyColorCorrectionApprox(ImageRef!RGBA image, const(float*) coeff16) pure nothrow @nogc | |
| { | |
| immutable int w = image.w; | |
| immutable int h = image.h; | |
| for (int j = 0; j < h; ++j) | |
| { | |
| ubyte* scan = cast(ubyte*)image.scanline(j).ptr; | |
| // load all coefficients | |
| const(__m128) A = _mm_load_ps(coeff16); | |
| const(__m128) B = _mm_load_ps(coeff16+4); | |
| const(__m128) C = _mm_load_ps(coeff16+8); | |
| const(__m128) D = _mm_load_ps(coeff16+12); | |
| const(__m128i) zero = _mm_setzero_si128(); | |
| // unroll by 4 | |
| int i = 0; | |
| for(; i + 3 < w; i += 4) | |
| { | |
| __m128i rgba = _mm_loadu_si128( cast(__m128i*)(&scan[4*i]) ); | |
| // unpack to 32-bit int | |
| __m128i rgba_lo8 = _mm_unpacklo_epi8(rgba, zero); | |
| __m128i rgba_hi8 = _mm_unpackhi_epi8(rgba, zero); | |
| __m128i rgba0 = _mm_unpacklo_epi16(rgba_lo8, zero); | |
| __m128i rgba1 = _mm_unpackhi_epi16(rgba_lo8, zero); | |
| __m128i rgba2 = _mm_unpacklo_epi16(rgba_hi8, zero); | |
| __m128i rgba3 = _mm_unpackhi_epi16(rgba_hi8, zero); | |
| // convert to float | |
| __m128 x0 = _mm_cvtepi32_ps(rgba0); | |
| __m128 x1 = _mm_cvtepi32_ps(rgba1); | |
| __m128 x2 = _mm_cvtepi32_ps(rgba2); | |
| __m128 x3 = _mm_cvtepi32_ps(rgba3); | |
| // evaluate polynomials using horner's rule | |
| __m128 y0 = ((A * x0 + B) * x0 + C) * x0 + D; | |
| __m128 y1 = ((A * x1 + B) * x1 + C) * x1 + D; | |
| __m128 y2 = ((A * x2 + B) * x2 + C) * x2 + D; | |
| __m128 y3 = ((A * x3 + B) * x3 + C) * x3 + D; | |
| rgba0 = _mm_cvttps_epi32(y0); | |
| rgba1 = _mm_cvttps_epi32(y1); | |
| rgba2 = _mm_cvttps_epi32(y2); | |
| rgba3 = _mm_cvttps_epi32(y3); | |
| rgba0 = _mm_packs_epi32(rgba0, rgba1); | |
| rgba2 = _mm_packs_epi32(rgba2, rgba3); | |
| rgba0 = _mm_packus_epi16(rgba0, rgba2); | |
| _mm_storeu_si128(cast(__m128i*)(&scan[4*i]), rgba0); | |
| } | |
| for(; i < w; i++) | |
| { | |
| __m128i rgba = _mm_loadu_si32(&scan[4*i]); | |
| // unpack to 32-bit int | |
| rgba = _mm_unpacklo_epi8(rgba, zero); | |
| rgba = _mm_unpacklo_epi16(rgba, zero); | |
| // convert to float | |
| __m128 x = _mm_cvtepi32_ps(rgba); | |
| // evaluate polynomials using horner's rule | |
| __m128 y = ((A * x + B) * x + C) * x + D; | |
| rgba = _mm_cvttps_epi32(y); | |
| rgba = _mm_packs_epi32(rgba, zero); | |
| rgba = _mm_packus_epi16(rgba, zero); | |
| *cast(int*)(&scan[4*i]) = rgba[0]; | |
| } | |
| /+ | |
| for(int i = 0; i < w; i ++) | |
| { | |
| float r = scan[4*i+0]; | |
| float g = scan[4*i+1]; | |
| float b = scan[4*i+2]; | |
| float a = scan[4*i+3]; | |
| float outR = coeff16[12] + r * (coeff16[8] + r * (coeff16[4] + r * coeff16[0])); | |
| float outG = coeff16[13] + g * (coeff16[9] + g * (coeff16[5] + g * coeff16[1])); | |
| float outB = coeff16[14] + b * (coeff16[10] + b * (coeff16[6] + b * coeff16[2])); | |
| float outA = coeff16[15] + a * (coeff16[11] + a * (coeff16[7] + a * coeff16[3])); | |
| int approxR = cast(int)(0.5f + outR); | |
| int approxG = cast(int)(0.5f + outG); | |
| int approxB = cast(int)(0.5f + outB); | |
| int approxA = cast(int)(0.5f + outA); | |
| if (approxR < 0) | |
| approxR = 0; | |
| if (approxG < 0) | |
| approxG = 0; | |
| if (approxB < 0) | |
| approxB = 0; | |
| if (approxA < 0) | |
| approxA = 0; | |
| if (approxR > 255) | |
| approxR = 255; | |
| if (approxG > 255) | |
| approxG = 255; | |
| if (approxB > 255) | |
| approxB = 255; | |
| if (approxA > 255) | |
| approxA = 255; | |
| scan[4*i+0] = cast(ubyte)(approxR); | |
| scan[4*i+1] = cast(ubyte)(approxG); | |
| scan[4*i+2] = cast(ubyte)(approxB); | |
| scan[4*i+3] = cast(ubyte)(approxA); | |
| }+/ | |
| } | |
| } | |
| float[4] approximateComponentCurve(float lift, float gamma, float gain, float contrast, float* tempBuffer) nothrow @nogc | |
| { | |
| static float safePow(float a, float b) nothrow @nogc | |
| { | |
| if (a < 0) | |
| a = 0; | |
| if (a > 1) | |
| a = 1; | |
| return a ^^ b; | |
| } | |
| // create curve for this component | |
| for (int b = 0; b < 256; ++b) | |
| { | |
| float inp = b / 255.0f; | |
| float outR = gain*(inp + lift*(1-inp)); | |
| outR = safePow(outR, 1.0f / gamma ); | |
| outR = lerp!float(outR, smoothStep!float(0, 1, outR), contrast); | |
| tempBuffer[b] = outR * 255.0f; | |
| } | |
| // First first point above 0 | |
| int first = 0; | |
| int last = 255; | |
| while (tempBuffer[first] == 0) first++; | |
| while (tempBuffer[last] == 255) last--; | |
| // quite significant for the quality of interp | |
| first += 10; | |
| last -= 10; | |
| // Choose some points to interpolate | |
| float x0 = first; | |
| float y0 = tempBuffer[first]; | |
| int x1i = (first * 2 + last + 1) / 3; | |
| int x2i = (first * 1 + last*2 + 2) / 3; | |
| float x1 = x1i; | |
| float y1 = tempBuffer[x1i]; | |
| float x2 = x2i; | |
| float y2 = tempBuffer[x2i]; | |
| float x3 = last; | |
| float y3 = tempBuffer[last]; | |
| // Lagrange to find 3rd degree polynomial, passing into (xn, yn) | |
| // P(x) = Ax^3 + Bx^2 + Cx + D | |
| float d0 = (x0-x1)*(x0-x2)*(x0-x3); | |
| float d1 = (x1-x0)*(x1-x2)*(x1-x3); | |
| float d2 = (x2-x0)*(x2-x1)*(x2-x3); | |
| float d3 = (x3-x0)*(x3-x1)*(x3-x2); | |
| y0 /= d0; | |
| y1 /= d1; | |
| y2 /= d2; | |
| y3 /= d3; | |
| // first term of Lagrange Polynomial => (x^3 - x²*(x1+x2+x3) + x * (x2*x1 + x3*x1 + x3*x2) - x3*x2*x1)*y0 | |
| float A = y0 + y1 + y2 + y3; | |
| float B = -(x1+x2+x3)*(y0) - (x0+x2+x3)*(y1) - (x0+x1+x3)*(y2) - (x0+x1+x2)*(y3); | |
| float C = (x2*x1 + x3*x1 + x3*x2)*(y0) + (x2*x0 + x3*x0 + x3*x2)*(y1) + (x1*x0 + x3*x0 + x3*x1)*(y2) + (x1*x0 + x2*x0 + x2*x1)*(y3); | |
| float D = - x1*x2*x3*(y0) - x0*x2*x3*(y1) - x0*x1*x3*(y2) - x0*x1*x2*(y3); | |
| return [A, B, C, D]; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment