From 477714778cf3ee5c42c9ecc80a6bc024677a81d7 Mon Sep 17 00:00:00 2001
From: rinechran
Date: Tue, 7 May 2024 15:33:58 +0900
Subject: [PATCH] sw_engine/neon: arm neonRasterPixel32 function to support aarch64

Improve the fill speed on AArch64 through NEON processing: the AArch64
path stores 16 pixels per loop iteration with vst4q_u32 instead of 4
pixels with vst1q_u32.

Improvement rate:
Lottie: (0.026321/0.026779) = +1.8%
Performance: (0.015411/0.015732) = +2.1%

Issue: https://github.com/thorvg/thorvg/issues/30
---
 src/renderer/sw_engine/tvgSwRasterNeon.h | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/renderer/sw_engine/tvgSwRasterNeon.h b/src/renderer/sw_engine/tvgSwRasterNeon.h
index 35b82899..1ea6cd96 100644
--- a/src/renderer/sw_engine/tvgSwRasterNeon.h
+++ b/src/renderer/sw_engine/tvgSwRasterNeon.h
@@ -64,17 +64,26 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int
 
 static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
 {
+    dst += offset;
+
+    uint32x4_t vectorVal = vdupq_n_u32(val);
+
+#if TVG_AARCH64
+    uint32_t iterations = len / 16;
+    uint32_t neonFilled = iterations * 16;
+    uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
+    for (uint32_t i = 0; i < iterations; ++i) {
+        vst4q_u32(dst, valQuad);
+        dst += 16;
+    }
+#else
     uint32_t iterations = len / 4;
     uint32_t neonFilled = iterations * 4;
-
-    dst += offset;
-    uint32x4_t vectorVal = {val, val, val, val};
-
     for (uint32_t i = 0; i < iterations; ++i) {
         vst1q_u32(dst, vectorVal);
         dst += 4;
     }
-
+#endif
     int32_t leftovers = len - neonFilled;
     while (leftovers--) *dst++ = val;
 }