sw_engine: optimize 64-bit rasterization.

Write pixels in 64-bit units instead of one pixel at a time. In the best case this can double the fill performance, and the gain is visually noticeable when the resolution is sufficiently large. @Issue: https://github.com/thorvg/thorvg/issues/1471
2025-06-14 12:04:29 +00:00 · 2023-06-12 15:06:56 +09:00 · 2023-06-12 15:06:56 +09:00 · 8305383989
commit 8305383989
parent a597d3bb49
2 changed files with 65 additions and 20 deletions
--- a/src/lib/sw_engine/tvgSwRaster.cpp
+++ b/src/lib/sw_engine/tvgSwRaster.cpp
@ -259,7 +259,7 @@ static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t
 }


-void _rasterGrayscale8(uint8_t *dst, uint32_t val, uint32_t offset, int32_t len)
+void _rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len)
 {
    cRasterPixels(dst, val, offset, len);
 }
@ -391,12 +391,11 @@ static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r
            rasterRGBA32(buffer + y * surface->stride, color, region.min.x, w);
        }
        return true;
-    //8bits grayscale
    }
+    //8bits grayscale
    if (surface->channelSize == sizeof(uint8_t)) {
-        auto buffer = surface->buf8 + (region.min.y * surface->stride);
        for (uint32_t y = 0; y < h; ++y) {
-            _rasterGrayscale8(buffer + y * surface->stride, 255, region.min.x, w);
+            _rasterGrayscale8(surface->buf8, 255, (region.min.y + y) * surface->stride + region.min.x, w);  //one row per iteration
        }
        return true;
    }
@ -565,7 +564,7 @@ static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r,
    //8bit grayscale
    } else if (surface->channelSize == sizeof(uint8_t)) {
        for (uint32_t i = 0; i < rle->size; ++i, ++span) {
-            _rasterGrayscale8(surface->buf8 + span->y * surface->stride, span->coverage, span->x, span->len);
+            _rasterGrayscale8(surface->buf8, span->coverage, span->y * surface->stride + span->x, span->len);
        }
    }
    return true;
@ -1637,7 +1636,7 @@ void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
 #elif defined(THORVG_NEON_VECTOR_SUPPORT)
    neonRasterRGBA32(dst, val, offset, len);
 #else
-    cRasterPixels<uint32_t>(dst, val, offset, len);
+    cRasterPixels(dst, val, offset, len);
 #endif
 }

@ -1672,24 +1671,22 @@ bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_
    if (surface->channelSize == sizeof(uint32_t)) {
        //full clear
        if (w == surface->stride) {
-            rasterRGBA32(surface->buf32 + (surface->stride * y), 0x00000000, 0, w * h);
+            rasterRGBA32(surface->buf32, 0x00000000, surface->stride * y, w * h);
        //partial clear
        } else {
-            auto buffer = surface->buf32 + (surface->stride * y + x);
            for (uint32_t i = 0; i < h; i++) {
-                rasterRGBA32(buffer + (surface->stride * i), 0x00000000, 0, w);
+                rasterRGBA32(surface->buf32, 0x00000000, (surface->stride * y + x) + (surface->stride * i), w);
            }
        }
    //8 bits
    } else if (surface->channelSize == sizeof(uint8_t)) {
        //full clear
        if (w == surface->stride) {
-            _rasterGrayscale8(surface->buf8 + (surface->stride * y), 0x00, 0, w * h);
+            _rasterGrayscale8(surface->buf8, 0x00, surface->stride * y, w * h);
        //partial clear
        } else {
-            auto buffer = surface->buf8 + (surface->stride * y + x);
            for (uint32_t i = 0; i < h; i++) {
-                _rasterGrayscale8(buffer + (surface->stride * i), 0x00, 0, w);
+                _rasterGrayscale8(surface->buf8, 0x00, (surface->stride * y + x) + (surface->stride * i), w);
            }
        }
    }
--- a/src/lib/sw_engine/tvgSwRasterC.h
+++ b/src/lib/sw_engine/tvgSwRasterC.h
@ -21,9 +21,43 @@
 */

 template<typename PIXEL_T>
-static void inline cRasterPixels(PIXEL_T* dst, uint32_t val, uint32_t offset, int32_t len)
+static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T val, uint32_t offset, int32_t len)
 {
    dst += offset;
+
+    //fix the misaligned memory: the 64-bit stores below need an 8-byte aligned address, which depends on dst itself and not only on the offset
+    auto alignOffset = 0;
+    auto misaligned = reinterpret_cast<uintptr_t>(dst) % 8;
+    if (sizeof(PIXEL_T) == 4) alignOffset = (misaligned > 0) ? 1 : 0;
+    else if (sizeof(PIXEL_T) == 1) {
+        if (misaligned > 0) alignOffset = int(8 - misaligned);
+    }
+
+    while (alignOffset > 0 && len > 0) {
+        *dst++ = val;
+        --len;
+        --alignOffset;
+    }
+
+    //64bits faster clear
+    if (sizeof(PIXEL_T) == 4) {
+        auto val64 = (uint64_t(val) << 32) | uint64_t(val);
+        while (len > 1) {
+            *reinterpret_cast<uint64_t*>(dst) = val64;
+            len -= 2;
+            dst += 2;
+        }
+    } else if (sizeof(PIXEL_T) == 1) {
+        auto val32 = (uint32_t(val) << 24) | (uint32_t(val) << 16) | (uint32_t(val) << 8) | uint32_t(val);
+        auto val64 = (uint64_t(val32) << 32) | val32;
+        while (len > 7) {
+            *reinterpret_cast<uint64_t*>(dst) = val64;
+            len -= 8;
+            dst += 8;
+        }
+    }
+
+    //leftovers
    while (len--) *dst++ = val;
 }

@ -97,13 +131,27 @@ static bool inline cRasterABGRtoARGB(Surface* surface)
 {
    TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h);

-    auto buffer = surface->buf32;
-    for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) {
-        auto dst = buffer;
-        for (uint32_t x = 0; x < surface->w; ++x, ++dst) {
-            auto c = *dst;
-            //flip Blue, Red channels
-            *dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16);
+    //64bits faster converting: two pixels per step, so both the width and the row stride must be even
+    if (surface->w % 2 == 0 && surface->stride % 2 == 0) {
+        auto buffer = reinterpret_cast<uint64_t*>(surface->buf32);
+        for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride / 2) {
+            auto dst = buffer;
+            for (uint32_t x = 0; x < surface->w / 2; ++x, ++dst) {
+                auto c = *dst;
+                //flip Blue, Red channels
+                *dst = (c & 0xff000000ff000000) + ((c & 0x00ff000000ff0000) >> 16) + (c & 0x0000ff000000ff00) + ((c & 0x000000ff000000ff) << 16);
+            }
+        }
+    //default converting
+    } else {
+        auto buffer = surface->buf32;
+        for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) {
+            auto dst = buffer;
+            for (uint32_t x = 0; x < surface->w; ++x, ++dst) {
+                auto c = *dst;
+                //flip Blue, Red channels
+                *dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16);
+            }
        }
    }
    return true;