From eb8539e0b41114f6a5897dc988de653a2f517961 Mon Sep 17 00:00:00 2001
From: Hermet Park <hermetpark@gmail.com>
Date: Sun, 28 May 2023 18:49:44 +0900
Subject: [PATCH] sw_engine fill: optimize linear/radial fill fetching.

Eliminate the separate per-pixel blending loop by
unifying blending/composition with the fill fetching stage.
---
 src/lib/sw_engine/tvgSwCommon.h   |  39 ++++++-
 src/lib/sw_engine/tvgSwFill.cpp   | 178 ++++++++++++++++++++++++++----
 src/lib/sw_engine/tvgSwRaster.cpp | 152 ++++---------------------
 3 files changed, 214 insertions(+), 155 deletions(-)

diff --git a/src/lib/sw_engine/tvgSwCommon.h b/src/lib/sw_engine/tvgSwCommon.h
index a01bcc4f..be72e4e6 100644
--- a/src/lib/sw_engine/tvgSwCommon.h
+++ b/src/lib/sw_engine/tvgSwCommon.h
@@ -240,6 +240,7 @@ struct SwImage
 
 typedef uint32_t(*SwJoin)(uint8_t r, uint8_t g, uint8_t b, uint8_t a);      //color channel join
 typedef uint8_t(*SwAlpha)(uint8_t*);                                        //blending alpha
+typedef uint32_t(*SwBlendOp)(uint32_t s, uint32_t d, uint8_t a);            //src, dst, alpha
 
 struct SwBlender
 {
@@ -302,6 +303,38 @@ static inline SwCoord HALF_STROKE(float width)
     return TO_SWCOORD(width * 0.5f);
 }
 
+static inline uint8_t _multiply(uint8_t c, uint8_t a)    //scales c by a as (c * a + 255) / 256; a == 255 returns c unchanged
+{
+    return ((c * a + 0xff) >> 8);
+}
+
+static inline uint8_t _alpha(uint32_t c)    //returns the top byte (bits 24-31) of c
+{
+    return (c >> 24);
+}
+
+static inline uint8_t _ialpha(uint32_t c)    //returns the top byte of ~c, i.e. 255 - (c >> 24)
+{
+    return (~c >> 24);
+}
+
+static inline uint32_t opAlphaBlend(uint32_t s, uint32_t d, uint8_t a)    //SwBlendOp: applies a to s first, then composites the result over d
+{
+    auto t = ALPHA_BLEND(s, a);    //ALPHA_BLEND is defined outside this patch; presumably scales every channel of s by a -- TODO confirm
+    return t + ALPHA_BLEND(d, _ialpha(t));    //d is passed through ALPHA_BLEND with the inverted top byte of t
+}
+
+static inline uint32_t opBlend(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a)    //SwBlendOp: composites s over d as-is; a is accepted only to match the SwBlendOp signature and is ignored
+{
+    return s + ALPHA_BLEND(d, _ialpha(s));    //d is passed through ALPHA_BLEND with the inverted top byte of s
+}
+
+static inline uint32_t opInterpolate(uint32_t s, uint32_t d, uint8_t a)    //SwBlendOp: thin wrapper that forwards (s, d, a) to INTERPOLATE, which is defined outside this patch
+{
+    return INTERPOLATE(s, d, a);
+}
+
+
 int64_t mathMultiply(int64_t a, int64_t b);
 int64_t mathDivide(int64_t a, int64_t b);
 int64_t mathMulDiv(int64_t a, int64_t b, int64_t c);
@@ -349,8 +382,10 @@ void imageFree(SwImage* image);
 bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix* transform, SwSurface* surface, uint32_t opacity, bool ctable);
 void fillReset(SwFill* fill);
 void fillFree(SwFill* fill);
-void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len);
-void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len);
+void fillRasterLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op = nullptr, uint8_t a = 255);                         //blending ver.
+void fillRasterLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity);     //masking ver.
+void fillRasterRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op = nullptr, uint8_t a = 255);                         //blending ver.
+void fillRasterRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity);     //masking ver.
 
 SwRleData* rleRender(SwRleData* rle, const SwOutline* outline, const SwBBox& renderRegion, bool antiAlias);
 SwRleData* rleRender(const SwBBox* bbox);
diff --git a/src/lib/sw_engine/tvgSwFill.cpp b/src/lib/sw_engine/tvgSwFill.cpp
index e14d2bdd..df7a1b57 100644
--- a/src/lib/sw_engine/tvgSwFill.cpp
+++ b/src/lib/sw_engine/tvgSwFill.cpp
@@ -233,7 +233,7 @@ static inline uint32_t _pixel(const SwFill* fill, float pos)
 /* External Class Implementation                                        */
 /************************************************************************/
 
-void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len)
+void fillRasterRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)
 {
     auto rx = (x + 0.5f) * fill->radial.a11 + (y + 0.5f) * fill->radial.a12 + fill->radial.shiftX;
     auto ry = (x + 0.5f) * fill->radial.a21 + (y + 0.5f) * fill->radial.a22 + fill->radial.shiftY;
@@ -244,16 +244,125 @@ void fillFetchRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x,
     auto detFirstDerivative = 2.0f * (fill->radial.a11 * rx + fill->radial.a21 * ry) + 0.5f * detSecondDerivative;
     auto det = rx * rx + ry * ry;
 
-    for (uint32_t i = 0 ; i < len ; ++i) {
-        *dst = _pixel(fill, sqrtf(det));
-        ++dst;
-        det += detFirstDerivative;
-        detFirstDerivative += detSecondDerivative;
+    if (opacity == 255) {
+        for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) {
+            *dst = opAlphaBlend(_pixel(fill, sqrtf(det)), *dst, alpha(cmp));
+            det += detFirstDerivative;
+            detFirstDerivative += detSecondDerivative;
+        }
+    } else {
+        for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) {
+            *dst = opAlphaBlend(_pixel(fill, sqrtf(det)), *dst, _multiply(opacity, alpha(cmp)));
+            det += detFirstDerivative;
+            detFirstDerivative += detSecondDerivative;
+        }
+    }
+
+}
+
+
+void fillRasterRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a)    //blending ver.: writes len radial-gradient pixels to dst for the span starting at (x, y); op/a select how each color is combined with *dst
+{
+    auto rx = (x + 0.5f) * fill->radial.a11 + (y + 0.5f) * fill->radial.a12 + fill->radial.shiftX;    //(x + 0.5, y + 0.5) run through the fill's radial coefficients; the 0.5 offset presumably samples the pixel center
+    auto ry = (x + 0.5f) * fill->radial.a21 + (y + 0.5f) * fill->radial.a22 + fill->radial.shiftY;
+
+    // detSecondDerivative = d(detFirstDerivative)/dx = d( d(det)/dx )/dx
+    auto detSecondDerivative = fill->radial.detSecDeriv;
+    // detFirstDerivative = d(det)/dx
+    auto detFirstDerivative = 2.0f * (fill->radial.a11 * rx + fill->radial.a21 * ry) + 0.5f * detSecondDerivative;
+    auto det = rx * rx + ry * ry;    //squared length of (rx, ry); sqrtf(det) is the position handed to _pixel()
+
+    if (op) {    //an operator was supplied: combine every gradient color with the existing destination pixel
+        for (uint32_t i = 0 ; i < len ; ++i, ++dst) {
+            *dst = op(_pixel(fill, sqrtf(det)), *dst, a);
+            det += detFirstDerivative;    //advance det to the next x incrementally instead of recomputing rx/ry
+            detFirstDerivative += detSecondDerivative;
+        }
+    } else {    //no operator (the declared default is nullptr): overwrite the destination with the gradient color
+        for (uint32_t i = 0 ; i < len ; ++i, ++dst) {
+            *dst = _pixel(fill, sqrtf(det));
+            det += detFirstDerivative;
+            detFirstDerivative += detSecondDerivative;
+        }
     }
 }
 
 
-void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len)
+void fillRasterLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity)    //masking ver.: blends len linear-gradient pixels into dst; alpha(cmp) supplies the per-pixel blend alpha, cmp advances csize bytes per pixel
+{
+    //Rotation
+    float rx = x + 0.5f;    //the 0.5 offset presumably samples the pixel center
+    float ry = y + 0.5f;
+    float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1);    //gradient position of the first pixel, scaled by (GRADIENT_STOP_SIZE - 1)
+    float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1);    //change of t for each one-pixel step along x
+
+    if (opacity == 255) {    //opacity is 255: alpha(cmp) is used directly, skipping the _multiply() step of the else branch
+        if (mathZero(inc)) {    //t does not change along x: fetch one color and reuse it for the whole span
+            auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE));
+            for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) {
+                *dst = opAlphaBlend(color, *dst, alpha(cmp));
+            }
+            return;
+        }
+
+        auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+        auto vMin = -vMax;
+        auto v = t + (inc * len);    //value t reaches at the end of the span; used for the fixed-point range check below
+
+        //we can use fixed point math
+        if (v < vMax && v > vMin) {
+            auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
+            auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
+            for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) {
+                *dst = opAlphaBlend(_fixedPixel(fill, t2), *dst, alpha(cmp));
+                t2 += inc2;
+            }
+        //we have to fallback to float math
+        } else {
+            uint32_t counter = 0;
+            while (counter++ < len) {
+                *dst = opAlphaBlend(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, alpha(cmp));
+                ++dst;
+                t += inc;
+                cmp += csize;
+            }
+        }
+    } else {    //opacity below 255: same three paths as above, with alpha(cmp) scaled by opacity through _multiply()
+        if (mathZero(inc)) {
+            auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE));
+            for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) {
+                *dst = opAlphaBlend(color, *dst, _multiply(alpha(cmp), opacity));
+            }
+            return;
+        }
+
+        auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1));
+        auto vMin = -vMax;
+        auto v = t + (inc * len);
+
+        //we can use fixed point math
+        if (v < vMax && v > vMin) {
+            auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
+            auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
+            for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) {
+                *dst = opAlphaBlend(_fixedPixel(fill, t2), *dst, _multiply(alpha(cmp), opacity));
+                t2 += inc2;
+            }
+        //we have to fallback to float math
+        } else {
+            uint32_t counter = 0;
+            while (counter++ < len) {
+                *dst = opAlphaBlend(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, _multiply(opacity, alpha(cmp)));
+                ++dst;
+                t += inc;
+                cmp += csize;
+            }
+        }
+    }
+}
+
+
+void fillRasterLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlendOp op, uint8_t a)
 {
     //Rotation
     float rx = x + 0.5f;
@@ -263,7 +372,13 @@ void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x,
 
     if (mathZero(inc)) {
         auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE));
-        rasterRGBA32(dst, color, 0, len);
+        if (op) {
+            for (uint32_t i = 0; i < len; ++i, ++dst) {
+                *dst = op(color, *dst, a);
+            }
+        } else {
+            rasterRGBA32(dst, color, 0, len);
+        }
         return;
     }
 
@@ -271,22 +386,41 @@ void fillFetchLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x,
     auto vMin = -vMax;
     auto v = t + (inc * len);
 
-    //we can use fixed point math
-    if (v < vMax && v > vMin) {
-        auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
-        auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
-        for (uint32_t j = 0; j < len; ++j) {
-            *dst = _fixedPixel(fill, t2);
-            ++dst;
-            t2 += inc2;
+    if (op) {
+        //we can use fixed point math
+        if (v < vMax && v > vMin) {
+            auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
+            auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
+            for (uint32_t j = 0; j < len; ++j, ++dst) {
+                *dst = op(_fixedPixel(fill, t2), *dst, a);
+                t2 += inc2;
+            }
+        //we have to fallback to float math
+        } else {
+            uint32_t counter = 0;
+            while (counter++ < len) {
+                *dst = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, a);
+                ++dst;
+                t += inc;
+            }
         }
-    //we have to fallback to float math
     } else {
-        uint32_t counter = 0;
-        while (counter++ < len) {
-            *dst = _pixel(fill, t / GRADIENT_STOP_SIZE);
-            ++dst;
-            t += inc;
+        //we can use fixed point math
+        if (v < vMax && v > vMin) {
+            auto t2 = static_cast<int32_t>(t * FIXPT_SIZE);
+            auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE);
+            for (uint32_t j = 0; j < len; ++j, ++dst) {
+                *dst = _fixedPixel(fill, t2);
+                t2 += inc2;
+            }
+        //we have to fallback to float math
+        } else {
+            uint32_t counter = 0;
+            while (counter++ < len) {
+                *dst = _pixel(fill, t / GRADIENT_STOP_SIZE);
+                ++dst;
+                t += inc;
+            }
         }
     }
 }
diff --git a/src/lib/sw_engine/tvgSwRaster.cpp b/src/lib/sw_engine/tvgSwRaster.cpp
index c69cbc34..c8ba4764 100644
--- a/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/src/lib/sw_engine/tvgSwRaster.cpp
@@ -37,17 +37,6 @@
 /************************************************************************/
 constexpr auto DOWN_SCALE_TOLERANCE = 0.5f;
 
-static inline uint8_t _multiply(uint8_t c, uint8_t a)
-{
-    return ((c * a + 0xff) >> 8);
-}
-
-
-static inline uint32_t _ialpha(uint32_t c)
-{
-    return (~c >> 24);
-}
-
 
 static inline uint8_t _alpha(uint8_t* a)
 {
@@ -926,18 +915,8 @@ static bool _rasterLinearGradientMaskedRect(SwSurface* surface, const SwBBox& re
     auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
     auto alpha = surface->blender.alpha(surface->compositor->method);
 
-    auto sbuffer = static_cast<uint32_t*>(alloca(w * sizeof(uint32_t)));
-    if (!sbuffer) return false;
-
     for (uint32_t y = 0; y < h; ++y) {
-        fillFetchLinear(fill, sbuffer, region.min.y + y, region.min.x, w);
-        auto dst = buffer;
-        auto cmp = cbuffer;
-        auto src = sbuffer;
-        for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) {
-            auto tmp = ALPHA_BLEND(*src, alpha(cmp));
-            *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-        }
+        fillRasterLinear(fill, buffer, region.min.y + y, region.min.x, w, cbuffer, alpha, csize, 255);
         buffer += surface->stride;
         cbuffer += surface->stride * csize;
     }
@@ -953,15 +932,9 @@ static bool _rasterTranslucentLinearGradientRect(SwSurface* surface, const SwBBo
     auto h = static_cast<uint32_t>(region.max.y - region.min.y);
     auto w = static_cast<uint32_t>(region.max.x - region.min.x);
 
-    auto sbuffer = static_cast<uint32_t*>(alloca(w * sizeof(uint32_t)));
-    if (!sbuffer) return false;
-
     for (uint32_t y = 0; y < h; ++y) {
         auto dst = buffer;
-        fillFetchLinear(fill, sbuffer, region.min.y + y, region.min.x, w);
-        for (uint32_t x = 0; x < w; ++x, ++dst) {
-            *dst = sbuffer[x] + ALPHA_BLEND(*dst, _ialpha(sbuffer[x]));
-        }
+        fillRasterLinear(fill, dst, region.min.y + y, region.min.x, w, opBlend);
         buffer += surface->stride;
     }
     return true;
@@ -977,7 +950,7 @@ static bool _rasterSolidLinearGradientRect(SwSurface* surface, const SwBBox& reg
     auto h = static_cast<uint32_t>(region.max.y - region.min.y);
 
     for (uint32_t y = 0; y < h; ++y) {
-        fillFetchLinear(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w);
+        fillRasterLinear(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w);
     }
     return true;
 }
@@ -988,6 +961,7 @@ static bool _rasterLinearGradientRect(SwSurface* surface, const SwBBox& region,
     if (_compositing(surface)) {
         return _rasterLinearGradientMaskedRect(surface, region, fill);
     } else {
+        //OPTIMIZE_ME: Unify branches.
         if (fill->translucent) return _rasterTranslucentLinearGradientRect(surface, region, fill);
-        else _rasterSolidLinearGradientRect(surface, region, fill);
+        else return _rasterSolidLinearGradientRect(surface, region, fill);    //propagate the solid rasterizer's result instead of discarding it
     }
@@ -1003,30 +977,15 @@ static bool _rasterLinearGradientMaskedRle(SwSurface* surface, const SwRleData*
 {
     if (fill->linear.len < FLT_EPSILON) return false;
 
-    auto buffer = static_cast<uint32_t*>(alloca(surface->w * sizeof(uint32_t)));
-    if (!buffer) return false;
-
     auto span = rle->spans;
     auto csize = surface->compositor->image.channelSize;
     auto cbuffer = surface->compositor->image.buf8;
     auto alpha = surface->blender.alpha(surface->compositor->method);
 
     for (uint32_t i = 0; i < rle->size; ++i, ++span) {
-        fillFetchLinear(fill, buffer, span->y, span->x, span->len);
         auto dst = &surface->buf32[span->y * surface->stride + span->x];
         auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
-        auto src = buffer;
-        if (span->coverage == 255) {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
-                auto tmp = ALPHA_BLEND(*src, alpha(cmp));
-                *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-            }
-        } else {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
-                auto tmp = ALPHA_BLEND(*src, _multiply(span->coverage, alpha(cmp)));
-                *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-            }
-        }
+        fillRasterLinear(fill, dst, span->y, span->x, span->len, cmp, alpha, csize, span->coverage);
     }
     return true;
 }
@@ -1037,22 +996,11 @@ static bool _rasterTranslucentLinearGradientRle(SwSurface* surface, const SwRleD
     if (fill->linear.len < FLT_EPSILON) return false;
 
     auto span = rle->spans;
-    auto buffer = static_cast<uint32_t*>(alloca(surface->w * sizeof(uint32_t)));
-    if (!buffer) return false;
 
     for (uint32_t i = 0; i < rle->size; ++i, ++span) {
         auto dst = &surface->buf32[span->y * surface->stride + span->x];
-        fillFetchLinear(fill, buffer, span->y, span->x, span->len);
-        if (span->coverage == 255) {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
-                *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
-            }
-        } else {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
-                auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
-                *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-            }
-        }
+        if (span->coverage == 255) fillRasterLinear(fill, dst, span->y, span->x, span->len, opBlend);
+        else fillRasterLinear(fill, dst, span->y, span->x, span->len, opAlphaBlend, span->coverage);
     }
     return true;
 }
@@ -1062,21 +1010,12 @@ static bool _rasterSolidLinearGradientRle(SwSurface* surface, const SwRleData* r
 {
     if (fill->linear.len < FLT_EPSILON) return false;
 
-    auto buf = static_cast<uint32_t*>(alloca(surface->w * sizeof(uint32_t)));
-    if (!buf) return false;
-
     auto span = rle->spans;
 
     for (uint32_t i = 0; i < rle->size; ++i, ++span) {
-        if (span->coverage == 255) {
-            fillFetchLinear(fill, surface->buf32 + span->y * surface->stride + span->x, span->y, span->x, span->len);
-        } else {
-            fillFetchLinear(fill, buf, span->y, span->x, span->len);
-            auto dst = &surface->buf32[span->y * surface->stride + span->x];
-            for (uint32_t x = 0; x < span->len; ++x) {
-                dst[x] = INTERPOLATE(buf[x], dst[x], span->coverage);
-            }
-        }
+        auto dst = &surface->buf32[span->y * surface->stride + span->x];
+        if (span->coverage == 255) fillRasterLinear(fill, dst, span->y, span->x, span->len);
+        else fillRasterLinear(fill, dst, span->y, span->x, span->len, opInterpolate, span->coverage);
     }
     return true;
 }
@@ -1089,6 +1028,7 @@ static bool _rasterLinearGradientRle(SwSurface* surface, const SwRleData* rle, c
     if (_compositing(surface)) {
         return _rasterLinearGradientMaskedRle(surface, rle, fill);
     } else {
+        //OPTIMIZE_ME: Unify branches
         if (fill->translucent) return _rasterTranslucentLinearGradientRle(surface, rle, fill);
         else return _rasterSolidLinearGradientRle(surface, rle, fill);
     }
@@ -1111,18 +1051,8 @@ static bool _rasterRadialGradientMaskedRect(SwSurface* surface, const SwBBox& re
     auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize;
     auto alpha = surface->blender.alpha(surface->compositor->method);
 
-    auto sbuffer = static_cast<uint32_t*>(alloca(w * sizeof(uint32_t)));
-    if (!sbuffer) return false;
-
     for (uint32_t y = 0; y < h; ++y) {
-        fillFetchRadial(fill, sbuffer, region.min.y + y, region.min.x, w);
-        auto dst = buffer;
-        auto cmp = cbuffer;
-        auto src = sbuffer;
-        for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) {
-             auto tmp = ALPHA_BLEND(*src, alpha(cmp));
-             *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-        }
+        fillRasterRadial(fill, buffer, region.min.y + y, region.min.x, w, cbuffer, alpha, csize, 255);
         buffer += surface->stride;
         cbuffer += surface->stride * csize;
     }
@@ -1138,15 +1068,9 @@ static bool _rasterTranslucentRadialGradientRect(SwSurface* surface, const SwBBo
     auto h = static_cast<uint32_t>(region.max.y - region.min.y);
     auto w = static_cast<uint32_t>(region.max.x - region.min.x);
 
-    auto sbuffer = static_cast<uint32_t*>(alloca(w * sizeof(uint32_t)));
-    if (!sbuffer) return false;
-
     for (uint32_t y = 0; y < h; ++y) {
         auto dst = buffer;
-        fillFetchRadial(fill, sbuffer, region.min.y + y, region.min.x, w);
-        for (uint32_t x = 0; x < w; ++x, ++dst) {
-            *dst = sbuffer[x] + ALPHA_BLEND(*dst, _ialpha(sbuffer[x]));
-        }
+        fillRasterRadial(fill, dst, region.min.y + y, region.min.x, w, opBlend);
         buffer += surface->stride;
     }
     return true;
@@ -1162,8 +1086,7 @@ static bool _rasterSolidRadialGradientRect(SwSurface* surface, const SwBBox& reg
     auto w = static_cast<uint32_t>(region.max.x - region.min.x);
 
     for (uint32_t y = 0; y < h; ++y) {
-        auto dst = &buffer[y * surface->stride];
-        fillFetchRadial(fill, dst, region.min.y + y, region.min.x, w);
+        fillRasterRadial(fill, &buffer[y * surface->stride], region.min.y + y, region.min.x, w);
     }
     return true;
 }
@@ -1174,6 +1097,7 @@ static bool _rasterRadialGradientRect(SwSurface* surface, const SwBBox& region,
     if (_compositing(surface)) {
         return _rasterRadialGradientMaskedRect(surface, region, fill);
     } else {
+        //OPTIMIZE_ME: Unify branches.
         if (fill->translucent) return _rasterTranslucentRadialGradientRect(surface, region, fill);
         else return _rasterSolidRadialGradientRect(surface, region, fill);
     }
@@ -1189,30 +1113,15 @@ static bool _rasterRadialGradientMaskedRle(SwSurface* surface, const SwRleData*
 {
     if (fill->radial.a < FLT_EPSILON) return false;
 
-    auto buffer = static_cast<uint32_t*>(alloca(surface->w * sizeof(uint32_t)));
-    if (!buffer) return false;
-
     auto span = rle->spans;
     auto csize = surface->compositor->image.channelSize;
     auto cbuffer = surface->compositor->image.buf8;
     auto alpha = surface->blender.alpha(surface->compositor->method);
 
     for (uint32_t i = 0; i < rle->size; ++i, ++span) {
-        fillFetchRadial(fill, buffer, span->y, span->x, span->len);
         auto dst = &surface->buf32[span->y * surface->stride + span->x];
         auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize];
-        auto src = buffer;
-        if (span->coverage == 255) {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
-                auto tmp = ALPHA_BLEND(*src, alpha(cmp));
-                *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-            }
-        } else {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst, ++src, cmp += csize) {
-                auto tmp = ALPHA_BLEND(*src, _multiply(span->coverage, alpha(cmp)));
-                *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-            }
-        }
+        fillRasterRadial(fill, dst, span->y, span->x, span->len, cmp, alpha, csize, span->coverage);
     }
     return true;
 }
@@ -1223,22 +1132,11 @@ static bool _rasterTranslucentRadialGradientRle(SwSurface* surface, const SwRleD
     if (fill->radial.a < FLT_EPSILON) return false;
 
     auto span = rle->spans;
-    auto buffer = static_cast<uint32_t*>(alloca(surface->w * sizeof(uint32_t)));
-    if (!buffer) return false;
 
     for (uint32_t i = 0; i < rle->size; ++i, ++span) {
         auto dst = &surface->buf32[span->y * surface->stride + span->x];
-        fillFetchRadial(fill, buffer, span->y, span->x, span->len);
-        if (span->coverage == 255) {
-            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
-                *dst = buffer[x] + ALPHA_BLEND(*dst, _ialpha(buffer[x]));
-            }
-        } else {
-           for (uint32_t x = 0; x < span->len; ++x, ++dst) {
-                auto tmp = ALPHA_BLEND(buffer[x], span->coverage);
-                *dst = tmp + ALPHA_BLEND(*dst, _ialpha(tmp));
-            }
-        }
+        if (span->coverage == 255) fillRasterRadial(fill, dst, span->y, span->x, span->len, opBlend);
+        else fillRasterRadial(fill, dst, span->y, span->x, span->len, opAlphaBlend, span->coverage);
     }
     return true;
 }
@@ -1248,21 +1146,12 @@ static bool _rasterSolidRadialGradientRle(SwSurface* surface, const SwRleData* r
 {
     if (fill->radial.a < FLT_EPSILON) return false;
 
-    auto buf = static_cast<uint32_t*>(alloca(surface->w * sizeof(uint32_t)));
-    if (!buf) return false;
-
     auto span = rle->spans;
 
     for (uint32_t i = 0; i < rle->size; ++i, ++span) {
         auto dst = &surface->buf32[span->y * surface->stride + span->x];
-        if (span->coverage == 255) {
-            fillFetchRadial(fill, dst, span->y, span->x, span->len);
-        } else {
-            fillFetchRadial(fill, buf, span->y, span->x, span->len);
-            for (uint32_t x = 0; x < span->len; ++x, ++dst) {
-                *dst = INTERPOLATE(buf[x], *dst, span->coverage);
-            }
-        }
+        if (span->coverage == 255) fillRasterRadial(fill, dst, span->y, span->x, span->len);
+        else fillRasterRadial(fill, dst, span->y, span->x, span->len, opInterpolate, span->coverage);
     }
     return true;
 }
@@ -1275,6 +1164,7 @@ static bool _rasterRadialGradientRle(SwSurface* surface, const SwRleData* rle, c
     if (_compositing(surface)) {
         return _rasterRadialGradientMaskedRle(surface, rle, fill);
     } else {
-        if (fill->translucent) _rasterTranslucentRadialGradientRle(surface, rle, fill);
+        //OPTIMIZE_ME: Unify branches.
+        if (fill->translucent) return _rasterTranslucentRadialGradientRle(surface, rle, fill);    //propagate the translucent rasterizer's result instead of discarding it
         else return _rasterSolidRadialGradientRle(surface, rle, fill);
     }