From c1394668ef16ba7d9c5819a9560e738fc03ef732 Mon Sep 17 00:00:00 2001 From: rinechran Date: Fri, 3 May 2024 01:01:17 +0900 Subject: [PATCH] sw_engine: add an AArch64 fast path to neonRasterGrayscale8 Improve fill speed on AArch64 by storing four NEON vectors (64 bytes) per iteration with vst1q_u8_x4. Improvement rate: Lottie: (0.025986/0.026201) = +4.7% Performance: (0.014163/0.014785) = +4.3% Issue: https://github.com/thorvg/thorvg/issues/30 --- src/renderer/sw_engine/tvgSwRasterNeon.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/renderer/sw_engine/tvgSwRasterNeon.h b/src/renderer/sw_engine/tvgSwRasterNeon.h index 6b631fdf..35b82899 100644 --- a/src/renderer/sw_engine/tvgSwRasterNeon.h +++ b/src/renderer/sw_engine/tvgSwRasterNeon.h @@ -24,6 +24,15 @@ #include <arm_neon.h> +//TODO : need to support windows ARM + +#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64) +#define TVG_AARCH64 1 +#else +#define TVG_AARCH64 0 +#endif + + static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a) { uint16x8_t t = vmull_u8(c, a); @@ -36,12 +45,17 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int dst += offset; int32_t i = 0; - uint8x16_t valVec = vdupq_n_u8(val); - + const uint8x16_t valVec = vdupq_n_u8(val); +#if TVG_AARCH64 + uint8x16x4_t valQuad = {valVec, valVec, valVec, valVec}; + for (; i <= len - 16 * 4; i += 16 * 4) { + vst1q_u8_x4(dst + i, valQuad); + } +#else for (; i <= len - 16; i += 16) { vst1q_u8(dst + i, valVec); } - +#endif for (; i < len; i++) { dst[i] = val; }