From 477714778cf3ee5c42c9ecc80a6bc024677a81d7 Mon Sep 17 00:00:00 2001
From: rinechran <rinechran@gmail.com>
Date: Tue, 7 May 2024 15:33:58 +0900
Subject: [PATCH] sw_engine/neon: arm neonRasterPixel32 function to support
 aarch64

Improved the speed through neon processing.

Improvements Rate:
Lottie: (0.026321/0.026779) = +1.8%
Performance: (0.015411/0.015732) = +2.1%

issue: https://github.com/thorvg/thorvg/issues/30
---
 src/renderer/sw_engine/tvgSwRasterNeon.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/renderer/sw_engine/tvgSwRasterNeon.h b/src/renderer/sw_engine/tvgSwRasterNeon.h
index 35b82899..1ea6cd96 100644
--- a/src/renderer/sw_engine/tvgSwRasterNeon.h
+++ b/src/renderer/sw_engine/tvgSwRasterNeon.h
@@ -64,17 +64,26 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int
 
 static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
 {
+    dst += offset;
+
+    uint32x4_t vectorVal = vdupq_n_u32(val);
+
+#if TVG_AARCH64
+    uint32_t iterations = len / 16;
+    uint32_t neonFilled = iterations * 16;
+    uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
+    for (uint32_t i = 0; i < iterations; ++i) {
+        vst4q_u32(dst, valQuad);
+        dst += 16;
+    }
+#else
     uint32_t iterations = len / 4;
     uint32_t neonFilled = iterations * 4;
-
-    dst += offset;
-    uint32x4_t vectorVal = {val, val, val, val};
-
     for (uint32_t i = 0; i < iterations; ++i) {
         vst1q_u32(dst, vectorVal);
         dst += 4;
     }
-
+#endif
     int32_t leftovers = len - neonFilled;
     while (leftovers--) *dst++ = val;
 }