sw_engine/neon: extend the neonRasterPixel32 function to support aarch64

Improved the fill speed through NEON processing: on aarch64 the function now stores 16 pixels per loop iteration. Improvement rates: Lottie: (0.026321/0.026779) = +1.8%; Performance: (0.015411/0.015732) = +2.1%. Issue: https://github.com/thorvg/thorvg/issues/30
2025-06-14 12:04:29 +00:00 · 2024-05-07 15:33:58 +09:00 · 2024-05-07 15:33:58 +09:00 · 477714778c
commit 477714778c
parent 8c04b9d65e
1 changed files with 14 additions and 5 deletions
--- a/src/renderer/sw_engine/tvgSwRasterNeon.h
+++ b/src/renderer/sw_engine/tvgSwRasterNeon.h
@ -64,17 +64,26 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int

 // Fills `len` 32-bit pixels with `val`, starting `offset` elements past `dst`.
 //
 // The value is first broadcast into a 4-lane NEON vector. The bulk of the span is
 // then written with vector stores, and whatever does not fit a full vector step
 // (len - neonFilled pixels) is written one pixel at a time by the trailing loop.
 //   - TVG_AARCH64 set:   16 pixels per iteration via vst4q_u32 on four copies of
 //     the vector. vst4q_u32 is an interleaving store, but since all four source
 //     vectors hold the same value the result is still a uniform fill.
 //   - otherwise:         4 pixels per iteration via vst1q_u32.
 //
 // NOTE(review): `len` is signed but is divided into an unsigned iteration count
 // with no guard against negative values — presumably callers always pass
 // len >= 0; confirm against the call sites.
 static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
 {
+    dst += offset;
+
+    uint32x4_t vectorVal = vdupq_n_u32(val);
+
+#if TVG_AARCH64
+    uint32_t iterations = len / 16;
+    uint32_t neonFilled = iterations * 16;
+    uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
+    for (uint32_t i = 0; i < iterations; ++i) {
+        vst4q_u32(dst, valQuad);
+        dst += 16;
+    }
+#else
    uint32_t iterations = len / 4;
    uint32_t neonFilled = iterations * 4;
-
-    dst += offset;
-    uint32x4_t vectorVal = {val, val, val, val};
-
    for (uint32_t i = 0; i < iterations; ++i) {
        vst1q_u32(dst, vectorVal);
        dst += 4;
    }
-
+#endif
    int32_t leftovers = len - neonFilled;
    while (leftovers--) *dst++ = val;
 }