jakeoung · June 3, 2017 14:23
diff --git a/jkimg.c b/jkimg.c
 /*
 JKImage library for communicating with python

 Rules
 - The input image usually follows the order of [Channel x ny x nx]
 - 
 */

 #include <math.h>
 #include <stdio.h>
 #include <stdbool.h>

 #define MAX(a,b) (((a) >= (b)) ? (a) : (b))
 #define MIN(a,b) (((a) <= (b)) ? (a) : (b))

 // helper defininiton of indexing for numpy-given images of shape [C x H x W] column-wise
 #define T( I, x, y, c, ny, nx ) I[ x + y*nx + c*ny*nx ]

 // helper defininiton of indexing for Matlab-given images of shape [C x W x H] row-wise
 // #define TMATLAB( I, x, y, c, ny, nx ) I[ y + x*ny + c*ny*nx ]

 int neumann_bc(int x, int nx, bool *isOut)
 {
 	if (x < 0) {
 		x = 0;
 		*isOut = true;
 	}
 	else if (x >= nx) {
 		x = nx - 1;
 		*isOut = true;
 	}
 	return x;
 }

 //////// interpolation for optical flow
 float interp_cubic_cell(float v[4], float x)
 {
    return v[1] + 0.5 * x * (v[2] - v[0] +
        x * (2.0 * v[0] - 5.0 * v[1] + 4.0 * v[2] - v[3] +
        x * (3.0 * (v[1] - v[2]) + v[3] - v[0])));
 }

 float interp2_bicubic_at(const float*I, float xx, float yy, int ny, int nx)
 {
 	int sx = (xx < 0) ? -1 : 1;
 	int sy = (yy < 0) ? -1 : 1;

 	bool isOut[1] = { false };
 	
 	// neumann boundary
 	int mx	= neumann_bc((int)xx - sx    , nx, isOut);
 	int x   = neumann_bc((int)xx         , nx, isOut);
 	int px  = neumann_bc((int)xx + sx    , nx, isOut);
 	int ppx = neumann_bc((int)xx + 2 * sx, nx, isOut);

    int my  = neumann_bc((int)yy - sy    , ny, isOut);
    int y   = neumann_bc((int)yy         , ny, isOut);
    int py  = neumann_bc((int)yy + sy    , ny, isOut);
    int ppy = neumann_bc((int)yy + 2 * sy, ny, isOut);

    if (*isOut) {
        return -1.0;
    }
    
    float p11 = I[mx + my * nx];
    float p12 = I[x + my * nx];
    float p13 = I[px + my * nx];
    float p14 = I[ppx + my * nx];

    float p21 = I[mx + y * nx];
    float p22 = I[x + y * nx];
    float p23 = I[px + y * nx];
    float p24 = I[ppx + y * nx];

    float p31 = I[mx + py * nx];
    float p32 = I[x + py * nx];
    float p33 = I[px + py * nx];
    float p34 = I[ppx + py * nx];

    float p41 = I[mx + ppy * nx];
    float p42 = I[x + ppy * nx];
    float p43 = I[px + ppy * nx];
    float p44 = I[ppx + ppy * nx];

    float p[4][4] = {
        { p11, p21, p31, p41 },
        { p12, p22, p32, p42 },
        { p13, p23, p33, p43 },
        { p14, p24, p34, p44 }
    };

    float v[4];
    v[0] = interp_cubic_cell(p[0], yy - y);
    v[1] = interp_cubic_cell(p[1], yy - y);
    v[2] = interp_cubic_cell(p[2], yy - y);
    v[3] = interp_cubic_cell(p[3], yy - y);

    return interp_cubic_cell(v, xx - x);
 }

 // Original author: Javier Sánchez Pérez <[email protected]>
 void interp2_bicubic(
 	const	float*	I,
 	const	float*	u,
 	const	float*	v,
 			float*	out,
 	const	int		ny,
 	const	int		nx,
 	const	float*	extra
 )
 {
    #pragma omp parallel for collapse(2)
 	for (int y = 0; y < ny; y++) {
 		for (int x = 0; x < nx; x++) {
 			int idx = x + y * nx;
 			float xx = x + u[idx];
 			float yy = y + v[idx];

            out[idx] = interp2_bicubic_at(I, xx, yy, ny, nx);
            if (out[idx] < 0) {
                out[idx] = extra[idx];
            }
 		}
 	}
 }




 void computeGradient(
 	const	float*  I,
 	const	int     method,
            float*  I_x,
            float*  I_y,
    const   int     ny,
    const   int     nx
 	)
 {
 	// forward difference
 	if (method == 1) {
        #pragma omp parallel for collapse(2)
        for (int y=0; y < ny; y++) {
            for (int x=0; x < nx; x++) {
                // w.r.t. x
                float valNow, valNext;
                valNow = I[x + y * nx];
                
                if (x+1 == nx)
                    valNext = I[x+y*nx];
                else
                    valNext = I[x+1+y*nx];
                
                I_x[x+y*nx] = valNext - valNow;

                // w.r.t. y
                if (y+1 == ny)
                    valNext = I[x+y*nx];
                else
                    valNext = I[x + (y+1) * nx];

                I_y[x+y*nx] = valNext - valNow;
            }
        }
 	}
 	// centered difference
 	else if (method == 0) {
 		#pragma omp parallel for collapse(2)
        for (int y=0; y < ny; y++) {
            for (int x=0; x < nx; x++) {
                // w.r.t. x
                float valxp, valxm, valyp, valym;
                int xp, xm, yp, ym;

                // xp: xplus, xm: xminus
                xp = MIN(x+1, nx-1);
                xm = MAX(x-1, 0);
                yp = MIN(y+1, ny-1);
                ym = MAX(y-1, 0);

                valxp = I[xp + y * nx];
                valxm = I[xm + y * nx];
                valyp = I[x + yp * nx];
                valym = I[x + ym * nx];

                I_x[x+y*nx] = 0.5 * valxp - 0.5 * valxm;
                I_y[x+y*nx] = 0.5 * valyp - 0.5 * valym;
            }
        }
 	}
 }

 // compute divergence with backward difference
 void computeDivergence(
    const   float*  I1,  
    const   float*  I2,
    	    float*  out,
    const   int     ny,
    const   int     nx
 	)
 {
 	// backward difference
 	#pragma omp parallel for collapse(2)
    for (int y=0; y < ny; y++) {
        for (int x=0; x < nx; x++) {
            float val1, val1xm, val2, val2ym, dx, dy;
            int xm, ym, idx;

            idx = x + y * nx;

            // xm: xminus
            xm = MAX(x-1, 0);
            ym = MAX(y-1, 0);

            val1 = I1[x+y*nx];
            val2 = I2[x+y*nx];

            val1xm = I1[xm+y*nx];
            val2ym = I2[x+ym*nx];

            if (x == 0) {
                val1xm = 0;
            } else if (x+1 == nx) {
                val1 = 0;
            }

            if (y == 0) {
                val2ym = 0;
            } else if (y+1 == ny) {
                val2 = 0;
            }

            dx = val1 - val1xm;
            dy = val2 - val2ym;

            out[x+y*nx] = dx + dy;
        }
    }
 }

 // solve: A x = b by Gauss-Seidel: A = (I - pLHS * Laplacian), b = pRHS
 void computeGaussSeidel(float* pData, const float* pLHS, const float* pRHS, const int nIterGS, const int ny, const int nx)
 {
    float  hh = 0.25;
    
    for(int i = 0; i < nIterGS; i++)
    {
        #pragma omp parallel for collapse(2)
        for(int y = 0; y < ny; y++)
        {
            for(int x = 0; x < nx; x++)
            {
                float dX2, dA, dB, dAxp, dAxm, dAyp, dAym, dXxp, dXxm, dXyp, dXym;
                int xm, xp, ym, yp, idx;
                
                idx = x + y * nx;
                
                dA  = pLHS[idx];
                dB  = pRHS[idx];

                // getNeighbour4
                xm = MAX(x-1, 0);
                xp = MIN(x+1, nx-1);
                
                ym = MAX(y-1, 0);
                yp = MIN(y+1, ny-1);

                dAxp = pLHS[xp + y * nx];
                dAxm = pLHS[xm + y * nx];
                dAyp = pLHS[x + yp * nx];
                dAym = pLHS[x + ym * nx];
                
                dXxp = pData[xp + y * nx];
                dXxm = pData[xm + y * nx];
                dXyp = pData[x + yp * nx];
                dXym = pData[x + ym * nx];

                dX2 = hh * dB + (dAxp * dXxp + dAxm * dXxm + dAyp * dXyp + dAym * dXym);
                dX2 = dX2 / (hh + 4.0 * dA);
                
                pData[idx] = dX2;
            }
        }
    }
 }

 float maxFromArray(float* arr, int size)
 {
    float dMax = arr[0];
    for (int i=1; i < size; i++) {
        if (arr[i] > dMax) {
            dMax = arr[i];
        }
    }
    return dMax;
 }

 float minFromArray(float* arr, int size)
 {
    float dMin = arr[0];
    for (int i=1; i < size; i++) {
        if (arr[i] < dMin) {
            dMin = arr[i];
        }
    }
    return dMin;
 }

 int neumann_bc_x(int c, int r, int ny, int nx, bool *isOut, const int* R)
 {
    if (c < 0 ) {
        c = 0;
        *isOut = true;
    }
    else if (c > 0 && r > 0 && r < ny && R[ c-1 + r*nx] == 0 ) {
        //c = 0;
        *isOut = true;
    }
    else if (c >= nx) {
        c = nx - 1;
        *isOut = true;
    }
    else if (c + 1 < nx && r > 0 && r < ny && R[ c+1 + r*nx] == 0) {
        //c = c;
        *isOut = true;
    }
    return c;
 }

 // assume that c is within region
 int neumann_bc_y(int c, int r, int ny, int nx, bool *isOut, const int* R)
 {
    if (r < 0) {
        r = 0;
        *isOut = true;
    }
    else if (r > 0 && R[ c + (r-1)*nx] == 0) {
        //r = 0;
        *isOut = true;
    }
    else if (r >= ny) {
        r = ny - 1;
        *isOut = true;
    }
    else if (r+1 < ny &&  R[ c + (r+1)*nx] == 0) {
        //c = c;
        *isOut = true;
    }
    return r;
 }
diff --git a/jkimg.cuh b/jkimg.cuh
 /*********
   JKImage CUDA device functions
   Pixel-wise process should be considered.

   Rules
   - The input image usually follows the order of [Channel x ny x nx]
   - The output arguments are followed by input arguments
   - The return value is encouraged, instead of void.
 *********/

 #ifndef JK_IMG__CUH__
 #define JK_IMG__CUH__

 #define MAX(a,b) (((a) >= (b)) ? (a) : (b))
 #define MIN(a,b) (((a) <= (b)) ? (a) : (b))

 #include <math.h>
 #include <stdio.h>
 #include <stdbool.h>

 __device__ float shrinkageScalar(float dValue, float dThreshold)
 {
    float dValueAbs, dValueMax;
    dValueAbs = fabs(dValue);
    dValueMax = MAX(1.0, dValueAbs / dThreshold);
    return (1.0 - 1.0 / dValueMax) * dValue;
 }

 __device__ float computeGradientY(float* I, int method, int x, int y, int nx, int ny)
 {
    // forward difference
    if (method == 1)
    {
        float valNow, valNext;
        valNow = I[x + y*nx];
        // w.r.t. y
        if (y+1 == ny)
            valNext = valNow;
        else
            valNext = I[x + (y+1) * nx];

        return (valNext - valNow);
    }
    // centered difference
    else if (method == 0) {
        int yp, ym;
        float valyp, valym;

        yp = MIN(y+1, ny-1);
        ym = MAX(y-1, 0);

        valyp = I[x + yp * nx];
        valym = I[x + ym * nx];

        return 0.5 * (valyp - valym);
    }
 }

 __device__ float computeGradientX(float* I, int method, int x, int y, int nx, int ny)
 {
    // forward difference
    if (method == 1)
    {
        float valNow, valNext;
        valNow = I[x + y * nx];

        if (x+1 == nx)
            valNext = valNow;
        else
            valNext = I[x+1 + y * nx];

        return (valNext - valNow);
    }
    // centered difference
    else if (method == 0)
    {
        float valxp, valxm;
        int xp, xm;

        xp = MIN(x+1, nx-1);
        xm = MAX(x-1, 0);

        valxp = I[xp + y * nx];
        valxm = I[xm + y * nx];

        return 0.5 * (valxp - valxm);
    }
 }

 __device__ float computeDivergence(float* I1, float* I2, int x, int y, int nx, int ny)
 {
    float val1, val1xm, val2, val2ym, dx, dy;

    int idx = x+y*nx;
    int xm, ym;

    xm = MAX(x-1, 0);
    ym = MAX(y-1, 0);

    val1 = I1[idx];
    val2 = I2[idx];

    val1xm = I1[xm+y*nx];
    val2ym = I2[x+ym*nx];

    if (x == 0) val1xm = 0;
    else if (x+1 == nx) val1 = 0;

    if (y == 0) val2ym = 0;
    else if (y+1 == ny) val2 = 0;

    dx = val1 - val1xm;
    dy = val2 - val2ym;

    return dx+dy;
 }

 __device__ int neumann_bc(int x, int nx, bool *isOut)
 {
 	if (x < 0) {
 		x = 0;
 		*isOut = true;
 	}
 	else if (x >= nx) {
 		x = nx - 1;
 		*isOut = true;
 	}
 	return x;
 }

 //////// interpolation for optical flow
 __device__ float interp_cubic_cell(float v[4], float x)
 {
    return v[1] + 0.5 * x * (v[2] - v[0] +
        x * (2.0 * v[0] - 5.0 * v[1] + 4.0 * v[2] - v[3] +
        x * (3.0 * (v[1] - v[2]) + v[3] - v[0])));
 }


 __device__ float interp2_bicubic_at(float*I, float xx, float yy, int nx, int ny, float extra)
 {
 	int sx = (xx < 0) ? -1 : 1;
 	int sy = (yy < 0) ? -1 : 1;

 	bool isOut[1] = { false };

 	// neumann boundary
 	int mx	= neumann_bc((int)xx - sx    , nx, isOut);
 	int x   = neumann_bc((int)xx         , nx, isOut);
 	int px  = neumann_bc((int)xx + sx    , nx, isOut);
 	int ppx = neumann_bc((int)xx + 2 * sx, nx, isOut);

    int my  = neumann_bc((int)yy - sy    , ny, isOut);
    int y   = neumann_bc((int)yy         , ny, isOut);
    int py  = neumann_bc((int)yy + sy    , ny, isOut);
    int ppy = neumann_bc((int)yy + 2 * sy, ny, isOut);

    if (*isOut)
    {
        return extra;
    }

    float p11 = I[mx + my * nx];
    float p12 = I[x + my * nx];
    float p13 = I[px + my * nx];
    float p14 = I[ppx + my * nx];

    float p21 = I[mx + y * nx];
    float p22 = I[x + y * nx];
    float p23 = I[px + y * nx];
    float p24 = I[ppx + y * nx];

    float p31 = I[mx + py * nx];
    float p32 = I[x + py * nx];
    float p33 = I[px + py * nx];
    float p34 = I[ppx + py * nx];

    float p41 = I[mx + ppy * nx];
    float p42 = I[x + ppy * nx];
    float p43 = I[px + ppy * nx];
    float p44 = I[ppx + ppy * nx];

    float p[4][4] = {
        { p11, p21, p31, p41 },
        { p12, p22, p32, p42 },
        { p13, p23, p33, p43 },
        { p14, p24, p34, p44 }
    };

    float v[4];
    v[0] = interp_cubic_cell(p[0], yy - y);
    v[1] = interp_cubic_cell(p[1], yy - y);
    v[2] = interp_cubic_cell(p[2], yy - y);
    v[3] = interp_cubic_cell(p[3], yy - y);

    return interp_cubic_cell(v, xx - x);
 }

 /*******************************
 // solve: A x = b by Gauss-Seidel: A = (I - pLHS * Laplacian), b = pRHS
 // One sweep of Gauss-Seidel iterations
 *******************************/
 __device__ void computeGaussSeidel(float* pData, const float* pLHS, const float* pRHS, \
    const int x, const int y, const int nx, const int ny)
 {
    float  hh = 0.25;

    float dX2, dA, dB, dAxp, dAxm, dAyp, dAym, dXxp, dXxm, dXyp, dXym;
    int xm, xp, ym, yp, idx;

    idx = x + y * nx;

    dA  = pLHS[idx];
    dB  = pRHS[idx];

    // getNeighbour4
    xm = MAX(x-1, 0);
    xp = MIN(x+1, nx-1);

    ym = MAX(y-1, 0);
    yp = MIN(y+1, ny-1);

    dAxp = pLHS[xp + y * nx];
    dAxm = pLHS[xm + y * nx];
    dAyp = pLHS[x + yp * nx];
    dAym = pLHS[x + ym * nx];

    dXxp = pData[xp + y * nx];
    dXxm = pData[xm + y * nx];
    dXyp = pData[x + yp * nx];
    dXym = pData[x + ym * nx];

    dX2 = hh * dB + (dAxp * dXxp + dAxm * dXxm + dAyp * dXyp + dAym * dXym);
    dX2 = dX2 / (hh + 4.0 * dA);

    pData[idx] = dX2;
 }


 #endif
diff --git a/jkimg.h b/jkimg.h
 /*
 JKImage library for communicating with python

 Rules
 - The input image usually follows the order of [Channel x nRow x nCol]
 - 
 */

 // void interp2_bilinear(
 //           float *out,
 //     const float *pData,
 //     const int nx,
 //     const int ny,
 //     const int nc,
 //     )

 void interp2_bicubic(
 	const	float*	I,
 	const	float*	u,
 	const	float*	v,
 			float*	out,
 	const	int		nRow,
 	const	int		nCol,
 	const	float*	extra
 	);

 void computeGradient(
 	const	float*  I,
 	const	int     method,
            float*  I_x,
            float*  I_y,
    const   int     nRow,
    const   int     nCol
 	);

 // compute divergence with backward difference
 void computeDivergence(
    const   float*  I1,  
    const   float*  I2,
    	    float*  out,
    const   int     nRow,
    const   int     nCol
 	);

 // solve: A x = b by Gauss-Seidel: A = (I - pLHS * Laplacian), b = pRHS
 void computeGaussSeidel(
 			float*  pData,
 	const 	float*  pLHS,
 	const 	float*  pRHS, 
 	const	int 	nIterG,
 	const	int 	ny,
 	const	int 	nx
 	);

 float minFromArray(float* arr, int size);
 float maxFromArray(float* arr, int size);


 // #endif
	/*
	JKImage library for communicating with python

	Rules
	- The input image usually follows the order of [Channel x ny x nx]
	-
	*/

	#include <math.h>
	#include <stdio.h>
	#include <stdbool.h>

	#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
	#define MIN(a,b) (((a) <= (b)) ? (a) : (b))

	// helper defininiton of indexing for numpy-given images of shape [C x H x W] column-wise
	#define T( I, x, y, c, ny, nx ) I[ x + ynx + cny*nx ]

	// helper defininiton of indexing for Matlab-given images of shape [C x W x H] row-wise
	// #define TMATLAB( I, x, y, c, ny, nx ) I[ y + xny + cny*nx ]

	int neumann_bc(int x, int nx, bool *isOut)
	{
	if (x < 0) {
	x = 0;
	*isOut = true;
	}
	else if (x >= nx) {
	x = nx - 1;
	*isOut = true;
	}
	return x;
	}

	//////// interpolation for optical flow
	float interp_cubic_cell(float v[4], float x)
	{
	return v[1] + 0.5 * x * (v[2] - v[0] +
	x * (2.0 * v[0] - 5.0 * v[1] + 4.0 * v[2] - v[3] +
	x * (3.0 * (v[1] - v[2]) + v[3] - v[0])));
	}

	float interp2_bicubic_at(const float*I, float xx, float yy, int ny, int nx)
	{
	int sx = (xx < 0) ? -1 : 1;
	int sy = (yy < 0) ? -1 : 1;

	bool isOut[1] = { false };

	// neumann boundary
	int mx = neumann_bc((int)xx - sx , nx, isOut);
	int x = neumann_bc((int)xx , nx, isOut);
	int px = neumann_bc((int)xx + sx , nx, isOut);
	int ppx = neumann_bc((int)xx + 2 * sx, nx, isOut);

	int my = neumann_bc((int)yy - sy , ny, isOut);
	int y = neumann_bc((int)yy , ny, isOut);
	int py = neumann_bc((int)yy + sy , ny, isOut);
	int ppy = neumann_bc((int)yy + 2 * sy, ny, isOut);

	if (*isOut) {
	return -1.0;
	}

	float p11 = I[mx + my * nx];
	float p12 = I[x + my * nx];
	float p13 = I[px + my * nx];
	float p14 = I[ppx + my * nx];

	float p21 = I[mx + y * nx];
	float p22 = I[x + y * nx];
	float p23 = I[px + y * nx];
	float p24 = I[ppx + y * nx];

	float p31 = I[mx + py * nx];
	float p32 = I[x + py * nx];
	float p33 = I[px + py * nx];
	float p34 = I[ppx + py * nx];

	float p41 = I[mx + ppy * nx];
	float p42 = I[x + ppy * nx];
	float p43 = I[px + ppy * nx];
	float p44 = I[ppx + ppy * nx];

	float p[4][4] = {
	{ p11, p21, p31, p41 },
	{ p12, p22, p32, p42 },
	{ p13, p23, p33, p43 },
	{ p14, p24, p34, p44 }
	};

	float v[4];
	v[0] = interp_cubic_cell(p[0], yy - y);
	v[1] = interp_cubic_cell(p[1], yy - y);
	v[2] = interp_cubic_cell(p[2], yy - y);
	v[3] = interp_cubic_cell(p[3], yy - y);

	return interp_cubic_cell(v, xx - x);
	}

	// Original author: Javier Sánchez Pérez <[email protected]>
	void interp2_bicubic(
	const float* I,
	const float* u,
	const float* v,
	float* out,
	const int ny,
	const int nx,
	const float* extra
	)
	{
	#pragma omp parallel for collapse(2)
	for (int y = 0; y < ny; y++) {
	for (int x = 0; x < nx; x++) {
	int idx = x + y * nx;
	float xx = x + u[idx];
	float yy = y + v[idx];

	out[idx] = interp2_bicubic_at(I, xx, yy, ny, nx);
	if (out[idx] < 0) {
	out[idx] = extra[idx];
	}
	}
	}
	}




	void computeGradient(
	const float* I,
	const int method,
	float* I_x,
	float* I_y,
	const int ny,
	const int nx
	)
	{
	// forward difference
	if (method == 1) {
	#pragma omp parallel for collapse(2)
	for (int y=0; y < ny; y++) {
	for (int x=0; x < nx; x++) {
	// w.r.t. x
	float valNow, valNext;
	valNow = I[x + y * nx];

	if (x+1 == nx)
	valNext = I[x+y*nx];
	else
	valNext = I[x+1+y*nx];

	I_x[x+y*nx] = valNext - valNow;

	// w.r.t. y
	if (y+1 == ny)
	valNext = I[x+y*nx];
	else
	valNext = I[x + (y+1) * nx];

	I_y[x+y*nx] = valNext - valNow;
	}
	}
	}
	// centered difference
	else if (method == 0) {
	#pragma omp parallel for collapse(2)
	for (int y=0; y < ny; y++) {
	for (int x=0; x < nx; x++) {
	// w.r.t. x
	float valxp, valxm, valyp, valym;
	int xp, xm, yp, ym;

	// xp: xplus, xm: xminus
	xp = MIN(x+1, nx-1);
	xm = MAX(x-1, 0);
	yp = MIN(y+1, ny-1);
	ym = MAX(y-1, 0);

	valxp = I[xp + y * nx];
	valxm = I[xm + y * nx];
	valyp = I[x + yp * nx];
	valym = I[x + ym * nx];

	I_x[x+ynx] = 0.5 valxp - 0.5 * valxm;
	I_y[x+ynx] = 0.5 valyp - 0.5 * valym;
	}
	}
	}
	}

	// compute divergence with backward difference
	void computeDivergence(
	const float* I1,
	const float* I2,
	float* out,
	const int ny,
	const int nx
	)
	{
	// backward difference
	#pragma omp parallel for collapse(2)
	for (int y=0; y < ny; y++) {
	for (int x=0; x < nx; x++) {
	float val1, val1xm, val2, val2ym, dx, dy;
	int xm, ym, idx;

	idx = x + y * nx;

	// xm: xminus
	xm = MAX(x-1, 0);
	ym = MAX(y-1, 0);

	val1 = I1[x+y*nx];
	val2 = I2[x+y*nx];

	val1xm = I1[xm+y*nx];
	val2ym = I2[x+ym*nx];

	if (x == 0) {
	val1xm = 0;
	} else if (x+1 == nx) {
	val1 = 0;
	}

	if (y == 0) {
	val2ym = 0;
	} else if (y+1 == ny) {
	val2 = 0;
	}

	dx = val1 - val1xm;
	dy = val2 - val2ym;

	out[x+y*nx] = dx + dy;
	}
	}
	}

	// solve: A x = b by Gauss-Seidel: A = (I - pLHS * Laplacian), b = pRHS
	void computeGaussSeidel(float* pData, const float* pLHS, const float* pRHS, const int nIterGS, const int ny, const int nx)
	{
	float hh = 0.25;

	for(int i = 0; i < nIterGS; i++)
	{
	#pragma omp parallel for collapse(2)
	for(int y = 0; y < ny; y++)
	{
	for(int x = 0; x < nx; x++)
	{
	float dX2, dA, dB, dAxp, dAxm, dAyp, dAym, dXxp, dXxm, dXyp, dXym;
	int xm, xp, ym, yp, idx;

	idx = x + y * nx;

	dA = pLHS[idx];
	dB = pRHS[idx];

	// getNeighbour4
	xm = MAX(x-1, 0);
	xp = MIN(x+1, nx-1);

	ym = MAX(y-1, 0);
	yp = MIN(y+1, ny-1);

	dAxp = pLHS[xp + y * nx];
	dAxm = pLHS[xm + y * nx];
	dAyp = pLHS[x + yp * nx];
	dAym = pLHS[x + ym * nx];

	dXxp = pData[xp + y * nx];
	dXxm = pData[xm + y * nx];
	dXyp = pData[x + yp * nx];
	dXym = pData[x + ym * nx];

	dX2 = hh * dB + (dAxp * dXxp + dAxm * dXxm + dAyp * dXyp + dAym * dXym);
	dX2 = dX2 / (hh + 4.0 * dA);

	pData[idx] = dX2;
	}
	}
	}
	}

	float maxFromArray(float* arr, int size)
	{
	float dMax = arr[0];
	for (int i=1; i < size; i++) {
	if (arr[i] > dMax) {
	dMax = arr[i];
	}
	}
	return dMax;
	}

	float minFromArray(float* arr, int size)
	{
	float dMin = arr[0];
	for (int i=1; i < size; i++) {
	if (arr[i] < dMin) {
	dMin = arr[i];
	}
	}
	return dMin;
	}

	int neumann_bc_x(int c, int r, int ny, int nx, bool isOut, const int R)
	{
	if (c < 0 ) {
	c = 0;
	*isOut = true;
	}
	else if (c > 0 && r > 0 && r < ny && R[ c-1 + r*nx] == 0 ) {
	//c = 0;
	*isOut = true;
	}
	else if (c >= nx) {
	c = nx - 1;
	*isOut = true;
	}
	else if (c + 1 < nx && r > 0 && r < ny && R[ c+1 + r*nx] == 0) {
	//c = c;
	*isOut = true;
	}
	return c;
	}

	// assume that c is within region
	int neumann_bc_y(int c, int r, int ny, int nx, bool isOut, const int R)
	{
	if (r < 0) {
	r = 0;
	*isOut = true;
	}
	else if (r > 0 && R[ c + (r-1)*nx] == 0) {
	//r = 0;
	*isOut = true;
	}
	else if (r >= ny) {
	r = ny - 1;
	*isOut = true;
	}
	else if (r+1 < ny && R[ c + (r+1)*nx] == 0) {
	//c = c;
	*isOut = true;
	}
	return r;
	}
	/*********
	JKImage CUDA device functions
	Pixel-wise process should be considered.

	Rules
	- The input image usually follows the order of [Channel x ny x nx]
	- The output arguments are followed by input arguments
	- The return value is encouraged, instead of void.
	*********/

	#ifndef JK_IMG__CUH__
	#define JK_IMG__CUH__

	#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
	#define MIN(a,b) (((a) <= (b)) ? (a) : (b))

	#include <math.h>
	#include <stdio.h>
	#include <stdbool.h>

	__device__ float shrinkageScalar(float dValue, float dThreshold)
	{
	float dValueAbs, dValueMax;
	dValueAbs = fabs(dValue);
	dValueMax = MAX(1.0, dValueAbs / dThreshold);
	return (1.0 - 1.0 / dValueMax) * dValue;
	}

	__device__ float computeGradientY(float* I, int method, int x, int y, int nx, int ny)
	{
	// forward difference
	if (method == 1)
	{
	float valNow, valNext;
	valNow = I[x + y*nx];
	// w.r.t. y
	if (y+1 == ny)
	valNext = valNow;
	else
	valNext = I[x + (y+1) * nx];

	return (valNext - valNow);
	}
	// centered difference
	else if (method == 0) {
	int yp, ym;
	float valyp, valym;

	yp = MIN(y+1, ny-1);
	ym = MAX(y-1, 0);

	valyp = I[x + yp * nx];
	valym = I[x + ym * nx];

	return 0.5 * (valyp - valym);
	}
	}

	__device__ float computeGradientX(float* I, int method, int x, int y, int nx, int ny)
	{
	// forward difference
	if (method == 1)
	{
	float valNow, valNext;
	valNow = I[x + y * nx];

	if (x+1 == nx)
	valNext = valNow;
	else
	valNext = I[x+1 + y * nx];

	return (valNext - valNow);
	}
	// centered difference
	else if (method == 0)
	{
	float valxp, valxm;
	int xp, xm;

	xp = MIN(x+1, nx-1);
	xm = MAX(x-1, 0);

	valxp = I[xp + y * nx];
	valxm = I[xm + y * nx];

	return 0.5 * (valxp - valxm);
	}
	}

	__device__ float computeDivergence(float* I1, float* I2, int x, int y, int nx, int ny)
	{
	float val1, val1xm, val2, val2ym, dx, dy;

	int idx = x+y*nx;
	int xm, ym;

	xm = MAX(x-1, 0);
	ym = MAX(y-1, 0);

	val1 = I1[idx];
	val2 = I2[idx];

	val1xm = I1[xm+y*nx];
	val2ym = I2[x+ym*nx];

	if (x == 0) val1xm = 0;
	else if (x+1 == nx) val1 = 0;

	if (y == 0) val2ym = 0;
	else if (y+1 == ny) val2 = 0;

	dx = val1 - val1xm;
	dy = val2 - val2ym;

	return dx+dy;
	}

	__device__ int neumann_bc(int x, int nx, bool *isOut)
	{
	if (x < 0) {
	x = 0;
	*isOut = true;
	}
	else if (x >= nx) {
	x = nx - 1;
	*isOut = true;
	}
	return x;
	}

	//////// interpolation for optical flow
	__device__ float interp_cubic_cell(float v[4], float x)
	{
	return v[1] + 0.5 * x * (v[2] - v[0] +
	x * (2.0 * v[0] - 5.0 * v[1] + 4.0 * v[2] - v[3] +
	x * (3.0 * (v[1] - v[2]) + v[3] - v[0])));
	}


	__device__ float interp2_bicubic_at(float*I, float xx, float yy, int nx, int ny, float extra)
	{
	int sx = (xx < 0) ? -1 : 1;
	int sy = (yy < 0) ? -1 : 1;

	bool isOut[1] = { false };

	// neumann boundary
	int mx = neumann_bc((int)xx - sx , nx, isOut);
	int x = neumann_bc((int)xx , nx, isOut);
	int px = neumann_bc((int)xx + sx , nx, isOut);
	int ppx = neumann_bc((int)xx + 2 * sx, nx, isOut);

	int my = neumann_bc((int)yy - sy , ny, isOut);
	int y = neumann_bc((int)yy , ny, isOut);
	int py = neumann_bc((int)yy + sy , ny, isOut);
	int ppy = neumann_bc((int)yy + 2 * sy, ny, isOut);

	if (*isOut)
	{
	return extra;
	}

	float p11 = I[mx + my * nx];
	float p12 = I[x + my * nx];
	float p13 = I[px + my * nx];
	float p14 = I[ppx + my * nx];

	float p21 = I[mx + y * nx];
	float p22 = I[x + y * nx];
	float p23 = I[px + y * nx];
	float p24 = I[ppx + y * nx];

	float p31 = I[mx + py * nx];
	float p32 = I[x + py * nx];
	float p33 = I[px + py * nx];
	float p34 = I[ppx + py * nx];

	float p41 = I[mx + ppy * nx];
	float p42 = I[x + ppy * nx];
	float p43 = I[px + ppy * nx];
	float p44 = I[ppx + ppy * nx];

	float p[4][4] = {
	{ p11, p21, p31, p41 },
	{ p12, p22, p32, p42 },
	{ p13, p23, p33, p43 },
	{ p14, p24, p34, p44 }
	};

	float v[4];
	v[0] = interp_cubic_cell(p[0], yy - y);
	v[1] = interp_cubic_cell(p[1], yy - y);
	v[2] = interp_cubic_cell(p[2], yy - y);
	v[3] = interp_cubic_cell(p[3], yy - y);

	return interp_cubic_cell(v, xx - x);
	}

	/*******************************
	// solve: A x = b by Gauss-Seidel: A = (I - pLHS * Laplacian), b = pRHS
	// One sweep of Gauss-Seidel iterations
	*******************************/
	__device__ void computeGaussSeidel(float* pData, const float* pLHS, const float* pRHS, \
	const int x, const int y, const int nx, const int ny)
	{
	float hh = 0.25;

	float dX2, dA, dB, dAxp, dAxm, dAyp, dAym, dXxp, dXxm, dXyp, dXym;
	int xm, xp, ym, yp, idx;

	idx = x + y * nx;

	dA = pLHS[idx];
	dB = pRHS[idx];

	// getNeighbour4
	xm = MAX(x-1, 0);
	xp = MIN(x+1, nx-1);

	ym = MAX(y-1, 0);
	yp = MIN(y+1, ny-1);

	dAxp = pLHS[xp + y * nx];
	dAxm = pLHS[xm + y * nx];
	dAyp = pLHS[x + yp * nx];
	dAym = pLHS[x + ym * nx];

	dXxp = pData[xp + y * nx];
	dXxm = pData[xm + y * nx];
	dXyp = pData[x + yp * nx];
	dXym = pData[x + ym * nx];

	dX2 = hh * dB + (dAxp * dXxp + dAxm * dXxm + dAyp * dXyp + dAym * dXym);
	dX2 = dX2 / (hh + 4.0 * dA);

	pData[idx] = dX2;
	}


	#endif