diff --git a/src/renderer/wg_engine/tvgWgCompositor.cpp b/src/renderer/wg_engine/tvgWgCompositor.cpp index 20f2b88b..71a5f751 100755 --- a/src/renderer/wg_engine/tvgWgCompositor.cpp +++ b/src/renderer/wg_engine/tvgWgCompositor.cpp @@ -747,41 +747,45 @@ void WgCompositor::gaussianBlur(WgContext& context, WgRenderStorage* dst, const assert(compose->rdViewport); assert(!renderPassEncoder); auto renderDataGaussian = (WgRenderDataGaussian*)params->rd; + auto aabb = compose->aabb; auto viewport = compose->rdViewport; + WgRenderStorage* sbuff = dst; + WgRenderStorage* dbuff = &storageDstCopy; + + // begin compute pass + WGPUComputePassDescriptor computePassDesc{ .label = "Compute pass gaussian blur" }; + WGPUComputePassEncoder computePassEncoder = wgpuCommandEncoderBeginComputePass(commandEncoder, &computePassDesc); for (uint32_t level = 0; level < renderDataGaussian->level; level++) { // horizontal blur if (params->direction != 2) { - const WGPUImageCopyTexture texSrc { .texture = dst->texture }; - const WGPUImageCopyTexture texDst { .texture = storageDstCopy.texture }; - const WGPUExtent3D copySize { .width = width, .height = height, .depthOrArrayLayers = 1 }; - wgpuCommandEncoderCopyTextureToTexture(commandEncoder, &texSrc, &texDst, ©Size); - WGPUComputePassDescriptor computePassDesc{ .label = "Compute pass gaussian blur horizontal" }; - WGPUComputePassEncoder computePassEncoder = wgpuCommandEncoderBeginComputePass(commandEncoder, &computePassDesc); - wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, storageDstCopy.bindGroupRead, 0, nullptr); - wgpuComputePassEncoderSetBindGroup(computePassEncoder, 1, dst->bindGroupWrite, 0, nullptr); + wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, sbuff->bindGroupRead, 0, nullptr); + wgpuComputePassEncoderSetBindGroup(computePassEncoder, 1, dbuff->bindGroupWrite, 0, nullptr); wgpuComputePassEncoderSetBindGroup(computePassEncoder, 2, renderDataGaussian->bindGroupGaussian, 0, nullptr); wgpuComputePassEncoderSetBindGroup(computePassEncoder, 3, viewport->bindGroupViewport, 0, nullptr); wgpuComputePassEncoderSetPipeline(computePassEncoder, pipelines.gaussian_horz); - wgpuComputePassEncoderDispatchWorkgroups(computePassEncoder, width / 16, height / 16, 1); - wgpuComputePassEncoderEnd(computePassEncoder); - wgpuComputePassEncoderRelease(computePassEncoder); + wgpuComputePassEncoderDispatchWorkgroups(computePassEncoder, (aabb.w - 1) / 128 + 1, aabb.h, 1); + std::swap(sbuff, dbuff); } // vertical blur if (params->direction != 1) { - const WGPUImageCopyTexture texSrc { .texture = dst->texture }; - const WGPUImageCopyTexture texDst { .texture = storageDstCopy.texture }; - const WGPUExtent3D copySize { .width = width, .height = height, .depthOrArrayLayers = 1 }; - wgpuCommandEncoderCopyTextureToTexture(commandEncoder, &texSrc, &texDst, ©Size); - WGPUComputePassDescriptor computePassDesc{ .label = "Compute pass gaussian blur vertical" }; - WGPUComputePassEncoder computePassEncoder = wgpuCommandEncoderBeginComputePass(commandEncoder, &computePassDesc); - wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, storageDstCopy.bindGroupRead, 0, nullptr); - wgpuComputePassEncoderSetBindGroup(computePassEncoder, 1, dst->bindGroupWrite, 0, nullptr); + wgpuComputePassEncoderSetBindGroup(computePassEncoder, 0, sbuff->bindGroupRead, 0, nullptr); + wgpuComputePassEncoderSetBindGroup(computePassEncoder, 1, dbuff->bindGroupWrite, 0, nullptr); wgpuComputePassEncoderSetBindGroup(computePassEncoder, 2, renderDataGaussian->bindGroupGaussian, 0, nullptr); wgpuComputePassEncoderSetBindGroup(computePassEncoder, 3, viewport->bindGroupViewport, 0, nullptr); wgpuComputePassEncoderSetPipeline(computePassEncoder, pipelines.gaussian_vert); - wgpuComputePassEncoderDispatchWorkgroups(computePassEncoder, width / 16, height / 16, 1); - wgpuComputePassEncoderEnd(computePassEncoder); - wgpuComputePassEncoderRelease(computePassEncoder); + wgpuComputePassEncoderDispatchWorkgroups(computePassEncoder, aabb.w, (aabb.h - 1) / 128 + 1, 1); + std::swap(sbuff, dbuff); } } + // end compute pass + wgpuComputePassEncoderEnd(computePassEncoder); + wgpuComputePassEncoderRelease(computePassEncoder); + + // if final result stored in intermidiate buffer we must copy result to destination buffer + if (sbuff == &storageDstCopy) { + const WGPUImageCopyTexture texSrc { .texture = sbuff->texture, .origin = { .x = (uint32_t)aabb.x, .y = (uint32_t)aabb.y } }; + const WGPUImageCopyTexture texDst { .texture = dbuff->texture, .origin = { .x = (uint32_t)aabb.x, .y = (uint32_t)aabb.y } }; + const WGPUExtent3D copySize { .width = (uint32_t)aabb.w, .height = (uint32_t)aabb.h, .depthOrArrayLayers = 1 }; + wgpuCommandEncoderCopyTextureToTexture(commandEncoder, &texSrc, &texDst, ©Size); + } } \ No newline at end of file diff --git a/src/renderer/wg_engine/tvgWgPipelines.cpp b/src/renderer/wg_engine/tvgWgPipelines.cpp index 9807b020..367a3734 100755 --- a/src/renderer/wg_engine/tvgWgPipelines.cpp +++ b/src/renderer/wg_engine/tvgWgPipelines.cpp @@ -221,8 +221,7 @@ void WgPipelines::initialize(WgContext& context) // shader blit shader_blit = createShaderModule(context.device, "The shader blit", cShaderSrc_Blit); // shader effects - shader_gaussian_horz = createShaderModule(context.device, "The shader gaussian horizontal", cShaderSrc_GaussianBlur_Horz); - shader_gaussian_vert = createShaderModule(context.device, "The shader gaussian vertical", cShaderSrc_GaussianBlur_Vert); + shader_gaussian = createShaderModule(context.device, "The shader gaussian", cShaderSrc_GaussianBlur); // layouts layout_stencil = createPipelineLayout(context.device, bindGroupLayoutsStencil, 2); @@ -450,10 +449,10 @@ void WgPipelines::initialize(WgContext& context) // compute pipeline gaussian blur gaussian_horz = createComputePipeline( context.device, "The compute pipeline gaussian blur horizontal", - shader_gaussian_horz, "cs_main", layout_gaussian); + shader_gaussian, "cs_main_horz", layout_gaussian); gaussian_vert = createComputePipeline( context.device, "The compute pipeline gaussian blur vertical", - shader_gaussian_vert, "cs_main", layout_gaussian); + shader_gaussian, "cs_main_vert", layout_gaussian); } void WgPipelines::releaseGraphicHandles(WgContext& context) @@ -505,8 +504,7 @@ void WgPipelines::releaseGraphicHandles(WgContext& context) releasePipelineLayout(layout_depth); releasePipelineLayout(layout_stencil); // shaders - releaseShaderModule(shader_gaussian_horz); - releaseShaderModule(shader_gaussian_vert); + releaseShaderModule(shader_gaussian); releaseShaderModule(shader_blit); releaseShaderModule(shader_scene_compose); releaseShaderModule(shader_scene_blend); diff --git a/src/renderer/wg_engine/tvgWgPipelines.h b/src/renderer/wg_engine/tvgWgPipelines.h index 987aa787..86c7a9e4 100755 --- a/src/renderer/wg_engine/tvgWgPipelines.h +++ b/src/renderer/wg_engine/tvgWgPipelines.h @@ -47,8 +47,7 @@ private: // shader blit WGPUShaderModule shader_blit{}; // shader effects - WGPUShaderModule shader_gaussian_horz{}; - WGPUShaderModule shader_gaussian_vert{}; + WGPUShaderModule shader_gaussian; // layouts helpers WGPUPipelineLayout layout_stencil{}; diff --git a/src/renderer/wg_engine/tvgWgShaderSrc.cpp b/src/renderer/wg_engine/tvgWgShaderSrc.cpp index 501e7854..73b5c0d7 100755 --- a/src/renderer/wg_engine/tvgWgShaderSrc.cpp +++ b/src/renderer/wg_engine/tvgWgShaderSrc.cpp @@ -721,94 +721,106 @@ fn cs_main(@builtin(global_invocation_id) id: vec3u) { } )"; -const char* cShaderSrc_GaussianBlur_Horz = R"( +const char* cShaderSrc_GaussianBlur = R"( @group(0) @binding(0) var imageSrc : texture_storage_2d; @group(1) @binding(0) var imageDst : texture_storage_2d; @group(2) @binding(0) var settings: vec4f; @group(3) @binding(0) var viewport: vec4f; +const N: u32 = 128; +const M: u32 = N * 3; +var buff: array; + fn gaussian(x: f32, sigma: f32) -> f32 { let a = 0.39894f / sigma; let b = -(x * x) / (2.0 * sigma * sigma); return a * exp(b); } -@compute @workgroup_size(16, 16) -fn cs_main(@builtin(global_invocation_id) id: vec3u) { - // id conversion - let iid = vec2i(id.xy); - // viewport decode - let xmin = i32(viewport.x); - let ymin = i32(viewport.y); - let xmax = i32(viewport.z); - let ymax = i32(viewport.w); +@compute @workgroup_size(N, 1) +fn cs_main_horz(@builtin(global_invocation_id) gid: vec3u, + @builtin(local_invocation_id) lid: vec3u) { // settings decode let sigma = settings.x; let scale = settings.y; let size = i32(settings.z); - // draw borders points outside of viewport - if ((iid.x < xmin) || (iid.x > xmax) || (iid.y < ymin) || (iid.y > ymax)) { return; } - - // apply filter - var weight = gaussian(0.0, sigma); - var color = weight * textureLoad(imageSrc, id.xy); - var sum = weight; - for (var i: i32 = 1; i < size; i++) { - let ii = i32(f32(i) * scale); - let idneg = vec2i(clamp(iid.x - ii, xmin, xmax), iid.y); - let idpos = vec2i(clamp(iid.x + ii, xmin, xmax), iid.y); - weight = gaussian(f32(i) * scale, sigma); - color += (weight * textureLoad(imageSrc, vec2u(idneg))); - color += (weight * textureLoad(imageSrc, vec2u(idpos))); - sum += (2.0 * weight); - } - textureStore(imageDst, id.xy, color / sum); -} -)"; - -const char* cShaderSrc_GaussianBlur_Vert = R"( -@group(0) @binding(0) var imageSrc : texture_storage_2d; -@group(1) @binding(0) var imageDst : texture_storage_2d; -@group(2) @binding(0) var settings: vec4f; -@group(3) @binding(0) var viewport: vec4f; - -fn gaussian(x: f32, sigma: f32) -> f32 { - let a = 0.39894f / sigma; - let b = -(x * x) / (2.0 * sigma * sigma); - return a * exp(b); -} - -@compute @workgroup_size(16, 16) -fn cs_main(@builtin(global_invocation_id) id: vec3u) { - // id conversion - let iid = vec2i(id.xy); // viewport decode let xmin = i32(viewport.x); let ymin = i32(viewport.y); let xmax = i32(viewport.z); let ymax = i32(viewport.w); + + // tex coord + let uid = vec2u(gid.x + u32(xmin), gid.y + u32(ymin)); + let iid = vec2i(uid); + + // load source to local workgroup memory + buff[lid.x + N*0] = textureLoad(imageSrc, uid - vec2u(N, 0)); + buff[lid.x + N*1] = textureLoad(imageSrc, uid + vec2u(0, 0)); + buff[lid.x + N*2] = textureLoad(imageSrc, uid + vec2u(N, 0)); + workgroupBarrier(); + + // apply filter + var weight = gaussian(0.0, sigma); + var color = weight * buff[lid.x + N]; + var sum = weight; + + for (var i: i32 = 1; i < size; i++) { + let ii = i32(f32(i) * scale); + weight = gaussian(f32(i) * scale, sigma); + let poffset = min(iid.x + ii, xmax) - iid.x; + let noffset = max(iid.x - ii, xmin) - iid.x; + color += (weight * buff[i32(lid.x + N) + poffset]); + color += (weight * buff[i32(lid.x + N) + noffset]); + sum += (2.0 * weight); + } + + // store result + textureStore(imageDst, uid, color / sum); +} + +@compute @workgroup_size(1, N) +fn cs_main_vert(@builtin(global_invocation_id) gid: vec3u, + @builtin(local_invocation_id) lid: vec3u) { // settings decode let sigma = settings.x; let scale = settings.y; let size = i32(settings.z); - // draw borders points outside of viewport - if ((iid.x < xmin) || (iid.x > xmax) || (iid.y < ymin) || (iid.y > ymax)) { return; } + // viewport decode + let xmin = i32(viewport.x); + let ymin = i32(viewport.y); + let xmax = i32(viewport.z); + let ymax = i32(viewport.w); + + // tex coord + let uid = vec2u(gid.x + u32(xmin), gid.y + u32(ymin)); + let iid = vec2i(uid); + + // load source to local workgroup memory + buff[lid.y + N*0] = textureLoad(imageSrc, uid - vec2u(0, N)); + buff[lid.y + N*1] = textureLoad(imageSrc, uid + vec2u(0, 0)); + buff[lid.y + N*2] = textureLoad(imageSrc, uid + vec2u(0, N)); + workgroupBarrier(); // apply filter var weight = gaussian(0.0, sigma); - var color = weight * textureLoad(imageSrc, id.xy); + var color = weight * buff[lid.y + N]; var sum = weight; + for (var i: i32 = 1; i < size; i++) { let ii = i32(f32(i) * scale); - let idneg = vec2i(iid.x, clamp(iid.y - ii, ymin, ymax)); - let idpos = vec2i(iid.x, clamp(iid.y + ii, ymin, ymax)); weight = gaussian(f32(i) * scale, sigma); - color += (weight * textureLoad(imageSrc, vec2u(idneg))); - color += (weight * textureLoad(imageSrc, vec2u(idpos))); + let poffset = min(iid.y + ii, ymax) - iid.y; + let noffset = max(iid.y - ii, ymin) - iid.y; + color += (weight * buff[i32(lid.y + N) + poffset]); + color += (weight * buff[i32(lid.y + N) + noffset]); sum += (2.0 * weight); } - textureStore(imageDst, id.xy, color / sum); + + // store result + textureStore(imageDst, uid, color / sum); + //textureStore(imageDst, uid, vec4f(1.0, 0.0, 0.0, 1.0)); } -)"; +)"; \ No newline at end of file diff --git a/src/renderer/wg_engine/tvgWgShaderSrc.h b/src/renderer/wg_engine/tvgWgShaderSrc.h index 68b7d6f8..f445f5ed 100755 --- a/src/renderer/wg_engine/tvgWgShaderSrc.h +++ b/src/renderer/wg_engine/tvgWgShaderSrc.h @@ -46,7 +46,6 @@ extern const char* cShaderSrc_Blit; // compute shader sources: effects extern const char* cShaderSrc_MergeMasks; -extern const char* cShaderSrc_GaussianBlur_Vert; -extern const char* cShaderSrc_GaussianBlur_Horz; +extern const char* cShaderSrc_GaussianBlur; #endif // _TVG_WG_SHEDER_SRC_H_ diff --git a/src/renderer/wg_engine/tvgWgShaderTypes.cpp b/src/renderer/wg_engine/tvgWgShaderTypes.cpp index 6c8b5604..9f214014 100755 --- a/src/renderer/wg_engine/tvgWgShaderTypes.cpp +++ b/src/renderer/wg_engine/tvgWgShaderTypes.cpp @@ -205,8 +205,9 @@ void WgShaderTypeGaussianBlur::update(const RenderEffectGaussianBlur* gaussian, assert(gaussian); const float sigma = gaussian->sigma; const float scale = std::sqrt(transform.e11 * transform.e11 + transform.e12 * transform.e12); + const float kernel = std::min(WG_GAUSSIAN_KERNEL_SIZE_MAX, 2 * sigma * scale); // kernel size settings[0] = sigma; - settings[1] = scale; - settings[2] = 2 * sigma * scale; // kernel size + settings[1] = std::min(WG_GAUSSIAN_KERNEL_SIZE_MAX / kernel, scale); + settings[2] = kernel; extend = settings[2] * 2; } \ No newline at end of file diff --git a/src/renderer/wg_engine/tvgWgShaderTypes.h b/src/renderer/wg_engine/tvgWgShaderTypes.h index 8e27effb..85a64455 100755 --- a/src/renderer/wg_engine/tvgWgShaderTypes.h +++ b/src/renderer/wg_engine/tvgWgShaderTypes.h @@ -68,6 +68,7 @@ struct WgShaderTypeGradient }; // gaussian settings: sigma, scale, extend +#define WG_GAUSSIAN_KERNEL_SIZE_MAX (128.0f) struct WgShaderTypeGaussianBlur { float settings[4]{}; // [0]: sigma, [1]: scale, [2]: kernel size, [3]: unused