Last active
May 31, 2025 04:22
-
-
Save rygo6/554642b3d084b1abdb08b50d22d2c477 to your computer and use it in GitHub Desktop.
Blit all mips of a depth in a single compute shader invocation... almost.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| Behold. Something you probably shouldn't do but I had to make it work just to see. | |
| This is how you generate all mipmap levels of a depth map in a single compute shader invocation. | |
| Well... almost. Mip level 7 is a min of the entire depth map. | |
| Unfortunately this does not -quite- work: even with the maximal reconvergence extension there is no 100% guarantee of the spinlock working as expected. | |
| The spinwait adds .005 ms onto a .02 ms baseline execution time. | |
| */ | |
| #version 450 | |
| #extension GL_KHR_shader_subgroup_basic : require | |
| #extension GL_KHR_shader_subgroup_shuffle : require | |
| #extension GL_KHR_shader_subgroup_ballot : require | |
| #extension GL_EXT_control_flow_attributes : require | |
| #extension GL_EXT_maximal_reconvergence : require | |
| #extension GL_EXT_debug_printf : require | |
| #include "math.glsl" | |
| #include "subgroup_grid.glsl" | |
| #include "logging.glsl" | |
| // Workgroup shape: x is sized by SUBGROUP_CAPACITY and y by WORKGROUP_SUBGROUP_COUNT. | |
| // NOTE(review): both macros are presumably defined in subgroup_grid.glsl so that x spans one subgroup's lanes and y counts the subgroups per workgroup - confirm. | |
| layout (local_size_x = SUBGROUP_CAPACITY, local_size_y = WORKGROUP_SUBGROUP_COUNT, local_size_z = 1) in; | |
| // Depth parameters used to interpret the values sampled from srcDepth. | |
| // In this file only nearZ and farZ are read (they are passed to LinearizeDepth in main); minDepth and maxDepth are not referenced here. | |
| // NOTE(review): field order defines the uniform block layout shared with the host - do not reorder. | |
| struct DepthState { | |
| float minDepth; | |
| float maxDepth; | |
| float nearZ; | |
| float farZ; | |
| }; | |
| // Read-only per-dispatch parameters. | |
| // depth.nearZ / depth.farZ feed LinearizeDepth for the gathered source depth; | |
| // cameraNearZ / cameraFarZ feed ProjectDepth, which main calls with far and near swapped because reverse-Z is used. | |
| layout (set = 0, binding = 0) uniform ProcessState { | |
| DepthState depth; | |
| float cameraNearZ; | |
| float cameraFarZ; | |
| } processState; | |
| // Cross-workgroup scratch state used by main to reduce the per-workgroup minimum into one global minimum. | |
| // Declared coherent because workgroups communicate through it inside a single dispatch: besides the atomic | |
| // updates, main also reads these members with plain loads, and without coherent those loads are not guaranteed | |
| // to observe writes made by invocations in other workgroups. | |
| layout(set = 0, binding = 1, std430) coherent restrict buffer AtomicProcessState { | |
| // Check-in counter: bumped atomically by one invocation of each workgroup and polled by main's spin-wait; main restores it to 0 before finishing. | |
| uint workgroupCounter; | |
| // Running atomicMin over the packed (PackDepth32) per-workgroup minimum depth; main restores it to PackDepth32(1) before finishing. | |
| uint workgroupResult; | |
| } atomicProcessState; | |
| // Source depth texture; main fetches one 2x2 footprint per invocation with textureGather on component 0. | |
| layout (set = 0, binding = 2) uniform sampler2D srcDepth; | |
| // Destination image; main queries its size for the grid setup and writes four texels per invocation with imageStore. | |
| layout (set = 0, binding = 3, rgba16f) uniform image2D dstGbuffer; | |
| // Workgroup-shared staging for the reduction levels that span several subgroups. | |
| // Each array holds the result of one level and is a quarter the size of the previous one; | |
| // main writes a level, issues barrier(), then reads it back before building the next level. | |
| shared float sharedDepths3[WORKGROUP_SUBGROUP_COUNT]; // 8 * 8 = 64 | |
| shared float sharedDepths4[WORKGROUP_SUBGROUP_COUNT / 4]; // 4 * 4 = 16 | |
| shared float sharedDepths5[WORKGROUP_SUBGROUP_COUNT / 16]; // 2 * 2 = 4 | |
| shared float sharedDepth6; // 1 | |
| // Entry point. Each invocation gathers a 2x2 quad of source depth, converts it with LinearizeDepth / ProjectDepth, | |
| // and then reduces it with MinQuad in stages: | |
| //   depth1..depth2 : inside the subgroup, exchanged with subgroupShuffle | |
| //   depth3..depth6 : inside the workgroup, staged through the sharedDepths* arrays with barrier() | |
| //   depth7         : across all workgroups, through atomics on atomicProcessState | |
| // Every texel finally stores the first strictly positive value among depth0, depth1, ..., depth6, falling back to depth7. | |
| // | |
| // Cross-workgroup protocol (one invocation per workgroup, selected by grid_LocalFirstInvocation): | |
| //   1. atomicMin the workgroup minimum into workgroupResult, THEN check in on workgroupCounter. | |
| //   2. Spin until the counter shows that every workgroup of the dispatch has checked in. | |
| //   3. After the workgroup has consumed the result, check in a second time; the last workgroup to do so | |
| //      restores workgroupCounter / workgroupResult to their initial values. | |
| // | |
| // NOTE(review): the grid_* globals and the helpers (MinQuad, SubgroupIndexFromOffset, SubgroupIndexFromCoord, IndexFromID, | |
| // PackDepth32, UnpackDepth32, quadGatherOffsets, HALF_EPSILON) come from the included headers and are not visible here; | |
| // statements about what they mean are assumptions to confirm. | |
| // NOTE(review): the spin-wait can only finish if all workgroups of the dispatch make progress concurrently - confirm the dispatch size fits the device. | |
| // NOTE(review): the maximal reconvergence feature is an entry-point execution mode in SPIR-V; confirm that the per-statement [[maximally_reconverges]] attributes below actually take effect. | |
| void main() | |
| { | |
| ivec2 outputSize = imageSize(dstGbuffer); | |
| InitializeSubgroupGridQuadInfo(outputSize); | |
| // The +1 presumably puts the UV on the corner shared by the four texels of this invocation's quad, so textureGather returns exactly that quad - confirm. | |
| vec2 quadCenterUV = vec2(grid_GlobalCoord + 1) / vec2(outputSize); | |
| vec4 gatheredDepth = textureGather(srcDepth, quadCenterUV, 0); | |
| vec4 gatheredLinearDepth = LinearizeDepth(vec4(processState.depth.nearZ), vec4(processState.depth.farZ), gatheredDepth); | |
| vec4 gatheredProjectedDepth = ProjectDepth(vec4(processState.cameraFarZ), vec4(processState.cameraNearZ), gatheredLinearDepth); // reverse near/far because we use reverseZ | |
| vec4 depth0Quad = gatheredProjectedDepth; | |
| float depth1 = MinQuad(gatheredProjectedDepth); | |
| float depth2 = 0; | |
| float depth3 = 0; | |
| float depth4 = 0; | |
| float depth5 = 0; | |
| float depth6 = 0; | |
| float depth7 = 0; | |
| // depth2: combine 2x2 lanes of the subgroup. Every lane fetches its three neighbours' depth1, only the lane | |
| // sitting on the 2x2 root computes the min, and all lanes then read the value back from their root lane. | |
| { | |
| const int coordDivisor = 2; | |
| const int offset = coordDivisor / 2; | |
| vec4 shuffle = vec4( | |
| subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(0, offset))), | |
| subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(offset, offset))), | |
| subgroupShuffle(depth1, SubgroupIndexFromOffset(ivec2(offset, 0))), | |
| depth1); | |
| ivec2 rootSubgroupCoord = ivec2(grid_SubgroupCoord / coordDivisor) * coordDivisor; | |
| uint rootSubgroupIndex = SubgroupIndexFromCoord(rootSubgroupCoord); | |
| [[maximally_reconverges]] | |
| if (rootSubgroupCoord == grid_SubgroupCoord) | |
| depth2 = MinQuad(shuffle); | |
| subgroupBarrier(); | |
| depth2 = subgroupShuffle(depth2, rootSubgroupIndex); | |
| } | |
| // depth3: one value per subgroup. Only the lane at subgroup coordinate (0, 0) publishes the min to shared memory; | |
| // after barrier() every invocation reads its own subgroup's slot. | |
| { | |
| const int coordDivisor = 4; | |
| const int offset = coordDivisor / 2; | |
| vec4 shuffle = vec4( | |
| subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(0, offset))), | |
| subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(offset, offset))), | |
| subgroupShuffle(depth2, SubgroupIndexFromOffset(ivec2(offset, 0))), | |
| depth2); | |
| ivec2 rootSubgroupCoord = ivec2(0, 0); // ivec2(grid_SubgroupCoord / coordDivisor) * coordDivisor; | |
| [[maximally_reconverges]] | |
| if (rootSubgroupCoord == grid_SubgroupCoord) | |
| sharedDepths3[grid_LocalSubgroupIndex] = MinQuad(shuffle); | |
| barrier(); | |
| depth3 = sharedDepths3[grid_LocalSubgroupIndex]; | |
| } | |
| // Addressing for the shared levels: each level halves the subgroup ID, so 2x2 entries of one level map to one entry of the next. | |
| const int depth3IDDivisor = 1; | |
| const int depth4IDDivisor = 2; | |
| const int depth5IDDivisor = 4; | |
| const int depth3SharedSquareSize = 8; // WORKGROUP_SQUARE_SIZE; | |
| const int depth4SharedSquareSize = 4; // depth3SharedSquareSize / 2; | |
| const int depth5SharedSquareSize = 2; // depth4SharedSquareSize / 2; | |
| const ivec2 depth3SharedID = grid_LocalSubgroupID / depth3IDDivisor; | |
| const ivec2 depth4SharedID = grid_LocalSubgroupID / depth4IDDivisor; | |
| const ivec2 depth5SharedID = grid_LocalSubgroupID / depth5IDDivisor; | |
| const uint depth4SharedIndex = IndexFromID(depth4SharedID, depth4SharedSquareSize); | |
| const uint depth5SharedIndex = IndexFromID(depth5SharedID, depth5SharedSquareSize); | |
| /* prior. sharedDepths3 | |
| 0 1 2 3 4 5 6 7 | |
| 0 00 01 02 03 04 05 06 07 | |
| 1 08 09 10 11 12 13 14 15 | |
| 2 16 17 18 19 20 21 22 23 | |
| 3 24 25 26 27 28 29 30 31 | |
| 4 32 33 34 35 36 37 38 39 | |
| 5 40 41 42 43 44 45 46 47 | |
| 6 48 49 50 51 52 53 54 55 | |
| 7 56 57 58 59 60 61 62 63 | |
| */ | |
| // depth4: lane 0 of every subgroup that sits on a 2x2 root of the 8x8 grid folds its three neighbours' depth3 into sharedDepths4. | |
| { | |
| ivec2 depth3RootLocalID = depth4SharedID * depth4IDDivisor; | |
| [[maximally_reconverges]] | |
| if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth3RootLocalID) { | |
| uint shuffle0 = IndexFromID(depth3SharedID + ivec2(0, 1), depth3SharedSquareSize); | |
| uint shuffle1 = IndexFromID(depth3SharedID + ivec2(1, 1), depth3SharedSquareSize); | |
| uint shuffle2 = IndexFromID(depth3SharedID + ivec2(1, 0), depth3SharedSquareSize); | |
| vec4 shuffle = vec4( | |
| sharedDepths3[shuffle0], | |
| sharedDepths3[shuffle1], | |
| sharedDepths3[shuffle2], | |
| depth3); | |
| sharedDepths4[depth4SharedIndex] = MinQuad(shuffle); | |
| } | |
| barrier(); | |
| depth4 = sharedDepths4[depth4SharedIndex]; | |
| } | |
| /* prior. sharedDepths4 | |
| 0 1 2 3 | |
| 0 00 01 02 03 | |
| 1 04 05 06 07 | |
| 2 08 09 10 11 | |
| 3 12 13 14 15 | |
| */ | |
| // depth5: same pattern one level up, from the 4x4 grid in sharedDepths4 to the 2x2 grid in sharedDepths5. | |
| { | |
| ivec2 depth4RootLocalID = depth5SharedID * depth5IDDivisor; | |
| [[maximally_reconverges]] | |
| if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth4RootLocalID) { | |
| uint shuffle0 = IndexFromID(depth4SharedID + ivec2(0, 1), depth4SharedSquareSize); | |
| uint shuffle1 = IndexFromID(depth4SharedID + ivec2(1, 1), depth4SharedSquareSize); | |
| uint shuffle2 = IndexFromID(depth4SharedID + ivec2(1, 0), depth4SharedSquareSize); | |
| vec4 shuffle = vec4( | |
| sharedDepths4[shuffle0], | |
| sharedDepths4[shuffle1], | |
| sharedDepths4[shuffle2], | |
| depth4); | |
| sharedDepths5[depth5SharedIndex] = MinQuad(shuffle); | |
| } | |
| barrier(); | |
| depth5 = sharedDepths5[depth5SharedIndex]; | |
| } | |
| // depth6: a single value for the whole workgroup, computed by lane 0 of the subgroup with local ID (0, 0). | |
| { | |
| /* prior. sharedDepths5 | |
| 0 1 | |
| 0 00 01 | |
| 1 02 03 | |
| */ | |
| ivec2 depth5RootLocalID = ivec2(0, 0); // sharedID * coordDivisor; | |
| [[maximally_reconverges]] | |
| if (grid_SubgroupIndex == 0 && grid_LocalSubgroupID == depth5RootLocalID) { | |
| uint shuffle0 = 2; // IndexFromID(depth5SharedID + ivec2(0, 1), depth5SharedSquareSize); | |
| uint shuffle1 = 3; // IndexFromID(depth5SharedID + ivec2(1, 1), depth5SharedSquareSize); | |
| uint shuffle2 = 1; // IndexFromID(depth5SharedID + ivec2(1, 0), depth5SharedSquareSize); | |
| vec4 shuffle = vec4( | |
| sharedDepths5[shuffle0], | |
| sharedDepths5[shuffle1], | |
| sharedDepths5[shuffle2], | |
| depth5); | |
| sharedDepth6 = MinQuad(shuffle); | |
| } | |
| barrier(); | |
| depth6 = sharedDepth6; | |
| } | |
| // Number of workgroups that take part in the cross-workgroup reduction, taken from the dispatch itself | |
| // (equals the previously hard-coded 256 for a dispatch of 256 workgroups). | |
| const uint workgroupTotal = gl_NumWorkGroups.x * gl_NumWorkGroups.y * gl_NumWorkGroups.z; | |
| // depth7: minimum over all workgroups. | |
| { | |
| [[maximally_reconverges]] | |
| if (grid_LocalFirstInvocation) { | |
| // Publish this workgroup's minimum BEFORE checking in. With the opposite order the counter can reach | |
| // workgroupTotal while the last minimum is still in flight, and waiters would read an incomplete result. | |
| // Values at or below HALF_EPSILON are replaced by PackDepth32(1) so they do not win the min. | |
| atomicMin(atomicProcessState.workgroupResult, depth6 > HALF_EPSILON ? PackDepth32(depth6) : PackDepth32(1)); | |
| memoryBarrierBuffer(); | |
| uint arrivedCount = atomicAdd(atomicProcessState.workgroupCounter, 1u) + 1u; | |
| // Poll through an atomic (adding 0 leaves the counter unchanged) so that every iteration re-reads memory; | |
| // a plain load of a restrict, non-volatile member may be hoisted out of the loop by the compiler. | |
| [[maximally_reconverges]] | |
| while (arrivedCount < workgroupTotal) { | |
| arrivedCount = atomicAdd(atomicProcessState.workgroupCounter, 0u); | |
| } | |
| } | |
| memoryBarrierBuffer(); | |
| barrier(); | |
| // All minima were published before the counter filled up, so the value read here is final. | |
| depth7 = UnpackDepth32(atomicProcessState.workgroupResult); | |
| } | |
| // Per texel: take the finest level that holds a strictly positive depth, falling back to coarser levels. | |
| for (int i = 0; i < 4; ++i) { | |
| float finalDepth = | |
| depth0Quad[i] > 0 ? depth0Quad[i] : | |
| depth1 > 0 ? depth1 : | |
| depth2 > 0 ? depth2 : | |
| depth3 > 0 ? depth3 : | |
| depth4 > 0 ? depth4 : | |
| depth5 > 0 ? depth5 : | |
| depth6 > 0 ? depth6 : | |
| depth7; | |
| imageStore(dstGbuffer, grid_GlobalCoord + quadGatherOffsets[i], vec4(finalDepth)); | |
| } | |
| memoryBarrierImage(); | |
| barrier(); | |
| // Past this barrier every invocation of this workgroup has read workgroupResult (the read precedes the stores above). | |
| [[maximally_reconverges]] | |
| if (grid_LocalFirstInvocation) { | |
| // Second check-in on the same counter. While workgroups leave, the counter climbs from workgroupTotal | |
| // towards 2 * workgroupTotal, so it never drops below the spin threshold and nobody still waiting is affected. | |
| // The workgroup whose increment observes 2 * workgroupTotal - 1 is the last one out; only then is it safe to | |
| // restore the initial state. Resetting from a fixed workgroup instead can zero the counter under workgroups | |
| // that are still spinning, or replace the result before they have read it. | |
| if (atomicAdd(atomicProcessState.workgroupCounter, 1u) == 2u * workgroupTotal - 1u) { | |
| atomicExchange(atomicProcessState.workgroupResult, PackDepth32(1)); | |
| atomicExchange(atomicProcessState.workgroupCounter, 0u); | |
| } | |
| } | |
| barrier(); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment