sw_engine: Improved the blur performance by 12%

Applied a compromise approach to the Gaussian blur,
since the effect is rather burdensome for CPU processing
when used as an animatable effect.

- Optimized performance and quality with negligible observable differences.
- Disabled the border option until specific use cases are identified.
This commit is contained in:
Hermet Park 2025-01-20 17:00:54 +09:00 committed by Hermet Park
parent ef41f3f79c
commit 44075aa0f0

View file

@ -36,22 +36,23 @@ struct SwGaussianBlur
}; };
static int _gaussianEdgeWrap(int end, int idx) static inline int _gaussianEdgeWrap(int end, int idx)
{ {
auto r = idx % end; auto r = idx % (end + 1);
return (r < 0) ? end + r : r; return (r < 0) ? (end + 1) + r : r;
} }
static int _gaussianEdgeExtend(int end, int idx) static inline int _gaussianEdgeExtend(int end, int idx)
{ {
if (idx < 0) return 0; if (idx < 0) return 0;
else if (idx >= end) return end - 1; else if (idx > end) return end;
return idx; return idx;
} }
static int _gaussianRemap(int end, int idx, int border) template<int border>
static inline int _gaussianRemap(int end, int idx)
{ {
if (border == 1) return _gaussianEdgeWrap(end, idx); if (border == 1) return _gaussianEdgeWrap(end, idx);
return _gaussianEdgeExtend(end, idx); return _gaussianEdgeExtend(end, idx);
@ -59,7 +60,8 @@ static int _gaussianRemap(int end, int idx, int border)
//TODO: SIMD OPTIMIZATION? //TODO: SIMD OPTIMIZATION?
static void _gaussianFilter(uint8_t* dst, uint8_t* src, int32_t stride, int32_t w, int32_t h, const SwBBox& bbox, int32_t dimension, int border, bool flipped) template<int border = 0>
static void _gaussianFilter(uint8_t* dst, uint8_t* src, int32_t stride, int32_t w, int32_t h, const SwBBox& bbox, int32_t dimension, bool flipped)
{ {
if (flipped) { if (flipped) {
src += (bbox.min.x * stride + bbox.min.y) << 2; src += (bbox.min.x * stride + bbox.min.y) << 2;
@ -70,6 +72,7 @@ static void _gaussianFilter(uint8_t* dst, uint8_t* src, int32_t stride, int32_t
} }
auto iarr = 1.0f / (dimension + dimension + 1); auto iarr = 1.0f / (dimension + dimension + 1);
auto end = w - 1;
#pragma omp parallel for #pragma omp parallel for
for (int y = 0; y < h; ++y) { for (int y = 0; y < h; ++y) {
@ -81,7 +84,7 @@ static void _gaussianFilter(uint8_t* dst, uint8_t* src, int32_t stride, int32_t
//initial accumulation //initial accumulation
for (int x = l; x < r; ++x) { for (int x = l; x < r; ++x) {
auto id = (_gaussianRemap(w, x, border) + p) * 4; auto id = (_gaussianRemap<border>(end, x) + p) * 4;
acc[0] += src[id++]; acc[0] += src[id++];
acc[1] += src[id++]; acc[1] += src[id++];
acc[2] += src[id++]; acc[2] += src[id++];
@ -89,16 +92,17 @@ static void _gaussianFilter(uint8_t* dst, uint8_t* src, int32_t stride, int32_t
} }
//perform filtering //perform filtering
for (int x = 0; x < w; ++x, ++r, ++l) { for (int x = 0; x < w; ++x, ++r, ++l) {
auto rid = (_gaussianRemap(w, r, border) + p) * 4; auto rid = (_gaussianRemap<border>(end, r) + p) * 4;
auto lid = (_gaussianRemap(w, l, border) + p) * 4; auto lid = (_gaussianRemap<border>(end, l) + p) * 4;
acc[0] += src[rid++] - src[lid++]; acc[0] += src[rid++] - src[lid++];
acc[1] += src[rid++] - src[lid++]; acc[1] += src[rid++] - src[lid++];
acc[2] += src[rid++] - src[lid++]; acc[2] += src[rid++] - src[lid++];
acc[3] += src[rid] - src[lid]; acc[3] += src[rid] - src[lid];
dst[i++] = static_cast<uint8_t>(acc[0] * iarr + 0.5f); //ignored rounding for the performance. It should be originally: acc[idx] * iarr + 0.5f
dst[i++] = static_cast<uint8_t>(acc[1] * iarr + 0.5f); dst[i++] = static_cast<uint8_t>(acc[0] * iarr);
dst[i++] = static_cast<uint8_t>(acc[2] * iarr + 0.5f); dst[i++] = static_cast<uint8_t>(acc[1] * iarr);
dst[i++] = static_cast<uint8_t>(acc[3] * iarr + 0.5f); dst[i++] = static_cast<uint8_t>(acc[2] * iarr);
dst[i++] = static_cast<uint8_t>(acc[3] * iarr);
} }
} }
} }
@ -188,7 +192,7 @@ bool effectGaussianBlur(SwCompositor* cmp, SwSurface* surface, const RenderEffec
//horizontal //horizontal
if (params->direction != 2) { if (params->direction != 2) {
for (int i = 0; i < data->level; ++i) { for (int i = 0; i < data->level; ++i) {
_gaussianFilter(reinterpret_cast<uint8_t*>(back), reinterpret_cast<uint8_t*>(front), stride, w, h, bbox, data->kernel[i], params->border, false); _gaussianFilter(reinterpret_cast<uint8_t*>(back), reinterpret_cast<uint8_t*>(front), stride, w, h, bbox, data->kernel[i], false);
std::swap(front, back); std::swap(front, back);
swapped = !swapped; swapped = !swapped;
} }
@ -200,7 +204,7 @@ bool effectGaussianBlur(SwCompositor* cmp, SwSurface* surface, const RenderEffec
std::swap(front, back); std::swap(front, back);
for (int i = 0; i < data->level; ++i) { for (int i = 0; i < data->level; ++i) {
_gaussianFilter(reinterpret_cast<uint8_t*>(back), reinterpret_cast<uint8_t*>(front), stride, h, w, bbox, data->kernel[i], params->border, true); _gaussianFilter(reinterpret_cast<uint8_t*>(back), reinterpret_cast<uint8_t*>(front), stride, h, w, bbox, data->kernel[i], true);
std::swap(front, back); std::swap(front, back);
swapped = !swapped; swapped = !swapped;
} }
@ -235,6 +239,7 @@ static void _dropShadowFilter(uint32_t* dst, uint32_t* src, int stride, int w, i
dst += (bbox.min.y * stride + bbox.min.x); dst += (bbox.min.y * stride + bbox.min.x);
} }
auto iarr = 1.0f / (dimension + dimension + 1); auto iarr = 1.0f / (dimension + dimension + 1);
auto end = w - 1;
#pragma omp parallel for #pragma omp parallel for
for (int y = 0; y < h; ++y) { for (int y = 0; y < h; ++y) {
@ -246,15 +251,16 @@ static void _dropShadowFilter(uint32_t* dst, uint32_t* src, int stride, int w, i
//initial accumulation //initial accumulation
for (int x = l; x < r; ++x) { for (int x = l; x < r; ++x) {
auto id = _gaussianEdgeExtend(w, x) + p; auto id = _gaussianEdgeExtend(end, x) + p;
acc += A(src[id]); acc += A(src[id]);
} }
//perform filtering //perform filtering
for (int x = 0; x < w; ++x, ++r, ++l) { for (int x = 0; x < w; ++x, ++r, ++l) {
auto rid = _gaussianEdgeExtend(w, r) + p; auto rid = _gaussianEdgeExtend(end, r) + p;
auto lid = _gaussianEdgeExtend(w, l) + p; auto lid = _gaussianEdgeExtend(end, l) + p;
acc += A(src[rid]) - A(src[lid]); acc += A(src[rid]) - A(src[lid]);
dst[i++] = ALPHA_BLEND(color, static_cast<uint8_t>(acc * iarr + 0.5f)); //ignored rounding for the performance. It should be originally: acc * iarr
dst[i++] = ALPHA_BLEND(color, static_cast<uint8_t>(acc * iarr));
} }
} }
} }