rygo6 · May 31, 2025 04:22
diff --git a/compositor_gbuffer_blit_mip_step.comp b/compositor_gbuffer_blit_mip_step.comp
 /*

 Behold. Something you probably shouldn't do but I had to make it work just to see.

 This is how you generate all mipmap levels of a depth map in a single compute shader invocation. 

 Well... almost. Mip level 7 is a min of the entire depth map.

 Unfortunately this does not -quite- work, as even with the maximal reconvergence extension there is no 100% guarantee of the spinlock working as expected.

 The spinwait adds .005 ms onto a .02 ms baseline execution time.

 */

 #version 450
 #extension GL_KHR_shader_subgroup_basic  : require
 #extension GL_KHR_shader_subgroup_shuffle : require
 #extension GL_KHR_shader_subgroup_ballot : require
 #extension GL_EXT_control_flow_attributes : require
 #extension GL_EXT_maximal_reconvergence : require
 #extension GL_EXT_debug_printf : require

 #include "math.glsl"
 #include "subgroup_grid.glsl"
 #include "logging.glsl"

 // Workgroup shape: SUBGROUP_CAPACITY invocations along x and WORKGROUP_SUBGROUP_COUNT along y.
 // Both constants are defined outside this file (presumably in subgroup_grid.glsl); the shape
 // looks intended to give each y row to one subgroup — TODO confirm.
 layout (local_size_x = SUBGROUP_CAPACITY, local_size_y = WORKGROUP_SUBGROUP_COUNT, local_size_z = 1) in;

 // Depth range parameters embedded in the ProcessState uniform block.
 struct DepthState {
    float minDepth; // not read in main()
    float maxDepth; // not read in main()
    float nearZ;    // first argument to LinearizeDepth() in main()
    float farZ;     // second argument to LinearizeDepth() in main()
 };

 // Read-only parameters for the depth conversion done at the top of main().
 layout (set = 0, binding = 0) uniform ProcessState {
    DepthState depth;  // depth.nearZ / depth.farZ linearize the gathered source depth
    float cameraNearZ; // main() passes cameraFarZ first and cameraNearZ second to ProjectDepth()
    float cameraFarZ;  // (its comment there: "reverse near/far because we use reverseZ")
 } processState;
 // Scratch state shared by every workgroup of the dispatch; main() updates it with atomic operations.
 layout(set = 0, binding = 1, std430) restrict buffer AtomicProcessState {
    uint workgroupCounter; // incremented once per workgroup by main() and polled as a cross-workgroup rendezvous; main() returns it to 0
    uint workgroupResult;  // atomicMin target for each workgroup's packed depth6; main() returns it to PackDepth32(1)
 } atomicProcessState;
 // Source depth texture; main() reads it with textureGather() on component 0.
 layout (set = 0, binding = 2) uniform sampler2D srcDepth;
 // Destination image; main() stores the selected depth replicated into all four channels.
 layout (set = 0, binding = 3, rgba16f) uniform image2D dstGbuffer;

 // Workgroup-shared scratch for the mip levels that are reduced across subgroups.
 // The sizes in the trailing comments are the original author's figures; they hold
 // when WORKGROUP_SUBGROUP_COUNT is 64 — TODO confirm against subgroup_grid.glsl.
 shared float sharedDepths3[WORKGROUP_SUBGROUP_COUNT]; // 8 * 8 = 64
 shared float sharedDepths4[WORKGROUP_SUBGROUP_COUNT / 4]; // 4 * 4 = 16
 shared float sharedDepths5[WORKGROUP_SUBGROUP_COUNT / 16]; // 2 * 2 = 4
 shared float sharedDepth6; // 1
 shared float sharedDepth7; // dispatch-wide minimum, published by the workgroup's first invocation

 /*
  Builds a chain of progressively coarser minimum depths (depth1 .. depth7) for each
  invocation's 2x2 quad and writes, per texel, the first level of that chain that is > 0.

    depth1 .. depth2 : reduced inside the subgroup with subgroupShuffle
    depth3 .. depth6 : reduced inside the workgroup through shared memory
    depth7           : reduced across all workgroups through atomicProcessState

  The depth7 step spin-waits until every workgroup of the dispatch has arrived. Forward
  progress between workgroups is not guaranteed, so the wait only terminates when all
  workgroups of the dispatch are resident at the same time.

  The grid_* variables and the helpers (MinQuad, SubgroupIndexFromOffset, IndexFromID,
  PackDepth32, ...) come from the included files; comments about them below are inferred
  from how they are used here.

  NOTE(review): GL_EXT_maximal_reconvergence describes [[maximally_reconverges]] as an
  attribute of the entry point; confirm the compiler honours the statement-level uses here.
  */
 void main()
 {
    ivec2 outputSize = imageSize(dstGbuffer);
    InitializeSubgroupGridQuadInfo(outputSize);

    // Mip 0: fetch the 2x2 source texels of this invocation's quad and convert them.
    vec2 quadCenterUV = vec2(grid_GlobalCoord + 1) / vec2(outputSize);
    vec4 gatheredDepth = textureGather(srcDepth, quadCenterUV, 0);
    vec4 gatheredLinearDepth = LinearizeDepth(vec4(processState.depth.nearZ), vec4(processState.depth.farZ), gatheredDepth);
    vec4 gatheredProjectedDepth = ProjectDepth(vec4(processState.cameraFarZ), vec4(processState.cameraNearZ), gatheredLinearDepth); // reverse near/far because we use reverseZ

    vec4 depth0Quad = gatheredProjectedDepth;
    float depth1 = MinQuad(gatheredProjectedDepth); // mip 1: reduction of the quad
    float depth2 = 0;
    float depth3 = 0;
    float depth4 = 0;
    float depth5 = 0;
    float depth6 = 0;
    float depth7 = 0;

    // Mip 2: the root invocation of every 2x2 group of invocations reduces the group's
    // depth1 values, then the whole group fetches the result from that root.
    {
        const int coordDivisor = 2;
        const int offset = coordDivisor / 2;
        vec4 shuffle = vec4(
            subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(0,      offset))),
            subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(offset, offset))),
            subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(offset, 0))),
            depth1);

        ivec2 rootSubgroupCoord = ivec2(grid_SubgroupCoord / coordDivisor) * coordDivisor;
        uint rootSubgroupIndex = SubgroupIndexFromCoord(rootSubgroupCoord);
        [[maximally_reconverges]]
        if (rootSubgroupCoord == grid_SubgroupCoord)
            depth2 = MinQuad(shuffle);

        subgroupBarrier();
        depth2 = subgroupShuffle(depth2, rootSubgroupIndex);
    }

    // Mip 3: the invocation at subgroup coordinate (0, 0) reduces four depth2 values
    // and publishes one value per subgroup to shared memory.
    {
        const int coordDivisor = 4;
        const int offset = coordDivisor / 2;
        vec4 shuffle = vec4(
            subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(0,      offset))),
            subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(offset, offset))),
            subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(offset, 0))),
            depth2);

        ivec2 rootSubgroupCoord = ivec2(0, 0); // ivec2(grid_SubgroupCoord / coordDivisor) * coordDivisor;
        [[maximally_reconverges]]
        if (rootSubgroupCoord == grid_SubgroupCoord)
            sharedDepths3[grid_LocalSubgroupIndex] = MinQuad(shuffle);

        barrier();
        depth3 = sharedDepths3[grid_LocalSubgroupIndex];
    }

    // Each shared-memory level halves the side of the grid of subgroups.
    const int depth3IDDivisor = 1;
    const int depth4IDDivisor = 2;
    const int depth5IDDivisor = 4;

    const int depth3SharedSquareSize = 8; // WORKGROUP_SQUARE_SIZE;
    const int depth4SharedSquareSize = 4; // depth3SharedSquareSize / 2;
    const int depth5SharedSquareSize = 2; // depth4SharedSquareSize / 2;

    const ivec2 depth3SharedID = grid_LocalSubgroupID / depth3IDDivisor;
    const ivec2 depth4SharedID = grid_LocalSubgroupID / depth4IDDivisor;
    const ivec2 depth5SharedID = grid_LocalSubgroupID / depth5IDDivisor;

    const uint depth4SharedIndex = IndexFromID(depth4SharedID, depth4SharedSquareSize);
    const uint depth5SharedIndex = IndexFromID(depth5SharedID, depth5SharedSquareSize);

    /* prior. sharedDepths3
      0  1  2  3  4  5  6  7
    0 00 01 02 03 04 05 06 07
    1 08 09 10 11 12 13 14 15
    2 16 17 18 19 20 21 22 23
    3 24 25 26 27 28 29 30 31
    4 32 33 34 35 36 37 38 39
    5 40 41 42 43 44 45 46 47
    6 48 49 50 51 52 53 54 55
    7 56 57 58 59 60 61 62 63
    */
    // Mip 4: invocation 0 of every subgroup whose ID is even in both axes reduces its
    // 2x2 neighbourhood of sharedDepths3.
    {
        ivec2 depth3RootLocalID = depth4SharedID * depth4IDDivisor;
        [[maximally_reconverges]]
        if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth3RootLocalID) {
            uint shuffle0 = IndexFromID(depth3SharedID + ivec2(0, 1), depth3SharedSquareSize);
            uint shuffle1 = IndexFromID(depth3SharedID + ivec2(1, 1), depth3SharedSquareSize);
            uint shuffle2 = IndexFromID(depth3SharedID + ivec2(1, 0), depth3SharedSquareSize);
            vec4 shuffle = vec4(
                sharedDepths3[shuffle0],
                sharedDepths3[shuffle1],
                sharedDepths3[shuffle2],
                depth3);
            sharedDepths4[depth4SharedIndex] = MinQuad(shuffle);
        }

        barrier();
        depth4 = sharedDepths4[depth4SharedIndex];
    }

    /* prior. sharedDepths4
      0  1  2  3
    0 00 01 02 03
    1 04 05 06 07
    2 08 09 10 11
    3 12 13 14 15
    */
    // Mip 5: same scheme, one level up (roots are subgroups whose ID is a multiple of 4).
    {
        ivec2 depth4RootLocalID = depth5SharedID * depth5IDDivisor;
        [[maximally_reconverges]]
        if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth4RootLocalID) {
            uint shuffle0 = IndexFromID(depth4SharedID + ivec2(0, 1), depth4SharedSquareSize);
            uint shuffle1 = IndexFromID(depth4SharedID + ivec2(1, 1), depth4SharedSquareSize);
            uint shuffle2 = IndexFromID(depth4SharedID + ivec2(1, 0), depth4SharedSquareSize);
            vec4 shuffle = vec4(
                sharedDepths4[shuffle0],
                sharedDepths4[shuffle1],
                sharedDepths4[shuffle2],
                depth4);
            sharedDepths5[depth5SharedIndex] = MinQuad(shuffle);
        }

        barrier();
        depth5 = sharedDepths5[depth5SharedIndex];
    }

    // Mip 6: invocation 0 of subgroup (0, 0) reduces the four sharedDepths5 entries.
    {
        /* prior. sharedDepths5
          0  1
        0 00 01
        1 02 03
        */
        ivec2 depth5RootLocalID = ivec2(0, 0); // sharedID * coordDivisor;
        [[maximally_reconverges]]
        if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth5RootLocalID) {
            uint shuffle0 = 2; // IndexFromID(depth5SharedID + ivec2(0, 1), depth5SharedSquareSize);
            uint shuffle1 = 3; // IndexFromID(depth5SharedID + ivec2(1, 1), depth5SharedSquareSize);
            uint shuffle2 = 1; // IndexFromID(depth5SharedID + ivec2(1, 0), depth5SharedSquareSize);
            vec4 shuffle = vec4(
                sharedDepths5[shuffle0],
                sharedDepths5[shuffle1],
                sharedDepths5[shuffle2],
                depth5);
            sharedDepth6 = MinQuad(shuffle);
        }

        barrier();
        depth6 = sharedDepth6;
    }

    // Mip 7: minimum over every workgroup of the dispatch.
    {
        // Every workgroup takes part in the rendezvous, so the expected arrival count
        // is the dispatch size rather than a hard-coded number.
        const uint workgroupTotal = gl_NumWorkGroups.x * gl_NumWorkGroups.y * gl_NumWorkGroups.z;

        [[maximally_reconverges]]
        if (grid_LocalFirstInvocation) {
            // Contribute first, announce arrival second: a waiter released by the
            // counter must be able to rely on every minimum having landed already.
            atomicMin(atomicProcessState.workgroupResult, depth6 > HALF_EPSILON ? PackDepth32(depth6) : PackDepth32(1));
            memoryBarrierBuffer();
            atomicAdd(atomicProcessState.workgroupCounter, 1u);

            // Poll with an atomic read-modify-write. The buffer is not declared coherent,
            // so a plain load could be hoisted out of the loop or served from a stale cache.
            [[maximally_reconverges]]
            while (atomicOr(atomicProcessState.workgroupCounter, 0u) < workgroupTotal) {}

            // One atomic read per workgroup, handed to the other invocations through shared memory.
            memoryBarrierBuffer();
            sharedDepth7 = UnpackDepth32(atomicOr(atomicProcessState.workgroupResult, 0u));
            memoryBarrierBuffer();

            // Departure count: the counter keeps rising past workgroupTotal (which keeps
            // releasing late waiters) and only the last workgroup to leave restores the
            // initial state, so no workgroup can still be waiting or reading at that point.
            if (atomicAdd(atomicProcessState.workgroupCounter, 1u) == 2u * workgroupTotal - 1u) {
                atomicExchange(atomicProcessState.workgroupResult, PackDepth32(1));
                atomicExchange(atomicProcessState.workgroupCounter, 0u);
            }
        }

        barrier();
        depth7 = sharedDepth7;
    }

    // Write each texel of the quad, falling back to the next coarser level whenever
    // the finer one is not > 0.
    for (int i = 0; i < 4; ++i) {
        float finalDepth =
            depth0Quad[i] > 0 ? depth0Quad[i] :
                depth1 > 0 ? depth1 :
                depth2 > 0 ? depth2 :
                depth3 > 0 ? depth3 :
                depth4 > 0 ? depth4 :
                depth5 > 0 ? depth5 :
                depth6 > 0 ? depth6 :
                depth7;
        imageStore(dstGbuffer, grid_GlobalCoord + quadGatherOffsets[i], vec4(finalDepth));
    }
 }
	/*

	Behold. Something you probably shouldn't do but I had to make it work just to see.

	This is how you generate all mipmap levels of a depth map in a single compute shader invocation.

	Well... almost. Mip level 7 is a min of the entire depth map.

	Unfortunately this does not -quite- work, as even with the maximal reconvergence extension there is no 100% guarantee of the spinlock working as expected.

	The spinwait adds .005 ms onto a .02 ms baseline execution time.

	*/

	#version 450
	#extension GL_KHR_shader_subgroup_basic : require
	#extension GL_KHR_shader_subgroup_shuffle : require
	#extension GL_KHR_shader_subgroup_ballot : require
	#extension GL_EXT_control_flow_attributes : require
	#extension GL_EXT_maximal_reconvergence : require
	#extension GL_EXT_debug_printf : require

	#include "math.glsl"
	#include "subgroup_grid.glsl"
	#include "logging.glsl"

	// Workgroup shape: SUBGROUP_CAPACITY invocations along x and WORKGROUP_SUBGROUP_COUNT along y.
	// Both constants are defined outside this file (presumably in subgroup_grid.glsl); the shape
	// looks intended to give each y row to one subgroup — TODO confirm.
	layout (local_size_x = SUBGROUP_CAPACITY, local_size_y = WORKGROUP_SUBGROUP_COUNT, local_size_z = 1) in;

	// Depth range parameters embedded in the ProcessState uniform block.
	struct DepthState {
	float minDepth; // not read in main()
	float maxDepth; // not read in main()
	float nearZ; // first argument to LinearizeDepth() in main()
	float farZ; // second argument to LinearizeDepth() in main()
	};

	// Read-only parameters for the depth conversion done at the top of main().
	layout (set = 0, binding = 0) uniform ProcessState {
	DepthState depth; // depth.nearZ / depth.farZ linearize the gathered source depth
	float cameraNearZ; // main() passes cameraFarZ first and cameraNearZ second to ProjectDepth()
	float cameraFarZ; // (its comment there: "reverse near/far because we use reverseZ")
	} processState;
	// Scratch state shared by every workgroup of the dispatch; main() updates it with atomic operations.
	layout(set = 0, binding = 1, std430) restrict buffer AtomicProcessState {
	uint workgroupCounter; // incremented once per workgroup by main() and polled as a cross-workgroup rendezvous; main() returns it to 0
	uint workgroupResult; // atomicMin target for each workgroup's packed depth6; main() returns it to PackDepth32(1)
	} atomicProcessState;
	// Source depth texture; main() reads it with textureGather() on component 0.
	layout (set = 0, binding = 2) uniform sampler2D srcDepth;
	// Destination image; main() stores the selected depth replicated into all four channels.
	layout (set = 0, binding = 3, rgba16f) uniform image2D dstGbuffer;

	// Workgroup-shared scratch for the mip levels that are reduced across subgroups.
	// The sizes in the trailing comments are the original author's figures; they hold
	// when WORKGROUP_SUBGROUP_COUNT is 64 — TODO confirm against subgroup_grid.glsl.
	shared float sharedDepths3[WORKGROUP_SUBGROUP_COUNT]; // 8 * 8 = 64
	shared float sharedDepths4[WORKGROUP_SUBGROUP_COUNT / 4]; // 4 * 4 = 16
	shared float sharedDepths5[WORKGROUP_SUBGROUP_COUNT / 16]; // 2 * 2 = 4
	shared float sharedDepth6; // 1
	shared float sharedDepth7; // dispatch-wide minimum, published by the workgroup's first invocation

	/*
	 Builds a chain of progressively coarser minimum depths (depth1 .. depth7) for each
	 invocation's 2x2 quad and writes, per texel, the first level of that chain that is > 0.

	   depth1 .. depth2 : reduced inside the subgroup with subgroupShuffle
	   depth3 .. depth6 : reduced inside the workgroup through shared memory
	   depth7           : reduced across all workgroups through atomicProcessState

	 The depth7 step spin-waits until every workgroup of the dispatch has arrived. Forward
	 progress between workgroups is not guaranteed, so the wait only terminates when all
	 workgroups of the dispatch are resident at the same time.

	 The grid_* variables and the helpers (MinQuad, SubgroupIndexFromOffset, IndexFromID,
	 PackDepth32, ...) come from the included files; comments about them below are inferred
	 from how they are used here.

	 NOTE(review): GL_EXT_maximal_reconvergence describes [[maximally_reconverges]] as an
	 attribute of the entry point; confirm the compiler honours the statement-level uses here.
	 */
	void main()
	{
		ivec2 outputSize = imageSize(dstGbuffer);
		InitializeSubgroupGridQuadInfo(outputSize);

		// Mip 0: fetch the 2x2 source texels of this invocation's quad and convert them.
		vec2 quadCenterUV = vec2(grid_GlobalCoord + 1) / vec2(outputSize);
		vec4 gatheredDepth = textureGather(srcDepth, quadCenterUV, 0);
		vec4 gatheredLinearDepth = LinearizeDepth(vec4(processState.depth.nearZ), vec4(processState.depth.farZ), gatheredDepth);
		vec4 gatheredProjectedDepth = ProjectDepth(vec4(processState.cameraFarZ), vec4(processState.cameraNearZ), gatheredLinearDepth); // reverse near/far because we use reverseZ

		vec4 depth0Quad = gatheredProjectedDepth;
		float depth1 = MinQuad(gatheredProjectedDepth); // mip 1: reduction of the quad
		float depth2 = 0;
		float depth3 = 0;
		float depth4 = 0;
		float depth5 = 0;
		float depth6 = 0;
		float depth7 = 0;

		// Mip 2: the root invocation of every 2x2 group of invocations reduces the group's
		// depth1 values, then the whole group fetches the result from that root.
		{
			const int coordDivisor = 2;
			const int offset = coordDivisor / 2;
			vec4 shuffle = vec4(
				subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(0, offset))),
				subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(offset, offset))),
				subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(offset, 0))),
				depth1);

			ivec2 rootSubgroupCoord = ivec2(grid_SubgroupCoord / coordDivisor) * coordDivisor;
			uint rootSubgroupIndex = SubgroupIndexFromCoord(rootSubgroupCoord);
			[[maximally_reconverges]]
			if (rootSubgroupCoord == grid_SubgroupCoord)
				depth2 = MinQuad(shuffle);

			subgroupBarrier();
			depth2 = subgroupShuffle(depth2, rootSubgroupIndex);
		}

		// Mip 3: the invocation at subgroup coordinate (0, 0) reduces four depth2 values
		// and publishes one value per subgroup to shared memory.
		{
			const int coordDivisor = 4;
			const int offset = coordDivisor / 2;
			vec4 shuffle = vec4(
				subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(0, offset))),
				subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(offset, offset))),
				subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(offset, 0))),
				depth2);

			ivec2 rootSubgroupCoord = ivec2(0, 0); // ivec2(grid_SubgroupCoord / coordDivisor) * coordDivisor;
			[[maximally_reconverges]]
			if (rootSubgroupCoord == grid_SubgroupCoord)
				sharedDepths3[grid_LocalSubgroupIndex] = MinQuad(shuffle);

			barrier();
			depth3 = sharedDepths3[grid_LocalSubgroupIndex];
		}

		// Each shared-memory level halves the side of the grid of subgroups.
		const int depth3IDDivisor = 1;
		const int depth4IDDivisor = 2;
		const int depth5IDDivisor = 4;

		const int depth3SharedSquareSize = 8; // WORKGROUP_SQUARE_SIZE;
		const int depth4SharedSquareSize = 4; // depth3SharedSquareSize / 2;
		const int depth5SharedSquareSize = 2; // depth4SharedSquareSize / 2;

		const ivec2 depth3SharedID = grid_LocalSubgroupID / depth3IDDivisor;
		const ivec2 depth4SharedID = grid_LocalSubgroupID / depth4IDDivisor;
		const ivec2 depth5SharedID = grid_LocalSubgroupID / depth5IDDivisor;

		const uint depth4SharedIndex = IndexFromID(depth4SharedID, depth4SharedSquareSize);
		const uint depth5SharedIndex = IndexFromID(depth5SharedID, depth5SharedSquareSize);

		/* prior. sharedDepths3
		  0  1  2  3  4  5  6  7
		0 00 01 02 03 04 05 06 07
		1 08 09 10 11 12 13 14 15
		2 16 17 18 19 20 21 22 23
		3 24 25 26 27 28 29 30 31
		4 32 33 34 35 36 37 38 39
		5 40 41 42 43 44 45 46 47
		6 48 49 50 51 52 53 54 55
		7 56 57 58 59 60 61 62 63
		*/
		// Mip 4: invocation 0 of every subgroup whose ID is even in both axes reduces its
		// 2x2 neighbourhood of sharedDepths3.
		{
			ivec2 depth3RootLocalID = depth4SharedID * depth4IDDivisor;
			[[maximally_reconverges]]
			if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth3RootLocalID) {
				uint shuffle0 = IndexFromID(depth3SharedID + ivec2(0, 1), depth3SharedSquareSize);
				uint shuffle1 = IndexFromID(depth3SharedID + ivec2(1, 1), depth3SharedSquareSize);
				uint shuffle2 = IndexFromID(depth3SharedID + ivec2(1, 0), depth3SharedSquareSize);
				vec4 shuffle = vec4(
					sharedDepths3[shuffle0],
					sharedDepths3[shuffle1],
					sharedDepths3[shuffle2],
					depth3);
				sharedDepths4[depth4SharedIndex] = MinQuad(shuffle);
			}

			barrier();
			depth4 = sharedDepths4[depth4SharedIndex];
		}

		/* prior. sharedDepths4
		  0  1  2  3
		0 00 01 02 03
		1 04 05 06 07
		2 08 09 10 11
		3 12 13 14 15
		*/
		// Mip 5: same scheme, one level up (roots are subgroups whose ID is a multiple of 4).
		{
			ivec2 depth4RootLocalID = depth5SharedID * depth5IDDivisor;
			[[maximally_reconverges]]
			if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth4RootLocalID) {
				uint shuffle0 = IndexFromID(depth4SharedID + ivec2(0, 1), depth4SharedSquareSize);
				uint shuffle1 = IndexFromID(depth4SharedID + ivec2(1, 1), depth4SharedSquareSize);
				uint shuffle2 = IndexFromID(depth4SharedID + ivec2(1, 0), depth4SharedSquareSize);
				vec4 shuffle = vec4(
					sharedDepths4[shuffle0],
					sharedDepths4[shuffle1],
					sharedDepths4[shuffle2],
					depth4);
				sharedDepths5[depth5SharedIndex] = MinQuad(shuffle);
			}

			barrier();
			depth5 = sharedDepths5[depth5SharedIndex];
		}

		// Mip 6: invocation 0 of subgroup (0, 0) reduces the four sharedDepths5 entries.
		{
			/* prior. sharedDepths5
			  0  1
			0 00 01
			1 02 03
			*/
			ivec2 depth5RootLocalID = ivec2(0, 0); // sharedID * coordDivisor;
			[[maximally_reconverges]]
			if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth5RootLocalID) {
				uint shuffle0 = 2; // IndexFromID(depth5SharedID + ivec2(0, 1), depth5SharedSquareSize);
				uint shuffle1 = 3; // IndexFromID(depth5SharedID + ivec2(1, 1), depth5SharedSquareSize);
				uint shuffle2 = 1; // IndexFromID(depth5SharedID + ivec2(1, 0), depth5SharedSquareSize);
				vec4 shuffle = vec4(
					sharedDepths5[shuffle0],
					sharedDepths5[shuffle1],
					sharedDepths5[shuffle2],
					depth5);
				sharedDepth6 = MinQuad(shuffle);
			}

			barrier();
			depth6 = sharedDepth6;
		}

		// Mip 7: minimum over every workgroup of the dispatch.
		{
			// Every workgroup takes part in the rendezvous, so the expected arrival count
			// is the dispatch size rather than a hard-coded number.
			const uint workgroupTotal = gl_NumWorkGroups.x * gl_NumWorkGroups.y * gl_NumWorkGroups.z;

			[[maximally_reconverges]]
			if (grid_LocalFirstInvocation) {
				// Contribute first, announce arrival second: a waiter released by the
				// counter must be able to rely on every minimum having landed already.
				atomicMin(atomicProcessState.workgroupResult, depth6 > HALF_EPSILON ? PackDepth32(depth6) : PackDepth32(1));
				memoryBarrierBuffer();
				atomicAdd(atomicProcessState.workgroupCounter, 1u);

				// Poll with an atomic read-modify-write. The buffer is not declared coherent,
				// so a plain load could be hoisted out of the loop or served from a stale cache.
				[[maximally_reconverges]]
				while (atomicOr(atomicProcessState.workgroupCounter, 0u) < workgroupTotal) {}

				// One atomic read per workgroup, handed to the other invocations through shared memory.
				memoryBarrierBuffer();
				sharedDepth7 = UnpackDepth32(atomicOr(atomicProcessState.workgroupResult, 0u));
				memoryBarrierBuffer();

				// Departure count: the counter keeps rising past workgroupTotal (which keeps
				// releasing late waiters) and only the last workgroup to leave restores the
				// initial state, so no workgroup can still be waiting or reading at that point.
				if (atomicAdd(atomicProcessState.workgroupCounter, 1u) == 2u * workgroupTotal - 1u) {
					atomicExchange(atomicProcessState.workgroupResult, PackDepth32(1));
					atomicExchange(atomicProcessState.workgroupCounter, 0u);
				}
			}

			barrier();
			depth7 = sharedDepth7;
		}

		// Write each texel of the quad, falling back to the next coarser level whenever
		// the finer one is not > 0.
		for (int i = 0; i < 4; ++i) {
			float finalDepth =
				depth0Quad[i] > 0 ? depth0Quad[i] :
					depth1 > 0 ? depth1 :
					depth2 > 0 ? depth2 :
					depth3 > 0 ? depth3 :
					depth4 > 0 ? depth4 :
					depth5 > 0 ? depth5 :
					depth6 > 0 ? depth6 :
					depth7;
			imageStore(dstGbuffer, grid_GlobalCoord + quadGatherOffsets[i], vec4(finalDepth));
		}
	}
No results found