mirror of
https://github.com/thorvg/thorvg.git
synced 2025-06-10 06:34:01 +00:00
sw_engine: add aarch64 support to the ARM NEON neonRasterGrayscale8 function
Improved speed through NEON processing. Improvement rate: Lottie: (0.025986/0.026201) = +4.7%; Performance: (0.014163/0.014785) = +4.3%. Issue: https://github.com/thorvg/thorvg/issues/30
This commit is contained in:
parent
c8551d4856
commit
c1394668ef
1 changed file with 17 additions and 3 deletions
|
@ -24,6 +24,15 @@
|
|||
|
||||
#include <arm_neon.h>
|
||||
|
||||
//TODO : need to support windows ARM
|
||||
|
||||
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
|
||||
#define TVG_AARCH64 1
|
||||
#else
|
||||
#define TVG_AARCH64 0
|
||||
#endif
|
||||
|
||||
|
||||
static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
|
||||
{
|
||||
uint16x8_t t = vmull_u8(c, a);
|
||||
|
@ -36,12 +45,17 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int
|
|||
dst += offset;
|
||||
|
||||
int32_t i = 0;
|
||||
uint8x16_t valVec = vdupq_n_u8(val);
|
||||
|
||||
const uint8x16_t valVec = vdupq_n_u8(val);
|
||||
#if TVG_AARCH64
|
||||
uint8x16x4_t valQuad = {valVec, valVec, valVec, valVec};
|
||||
for (; i <= len - 16 * 4; i += 16 * 4) {
|
||||
vst1q_u8_x4(dst + i, valQuad);
|
||||
}
|
||||
#else
|
||||
for (; i <= len - 16; i += 16) {
|
||||
vst1q_u8(dst + i, valVec);
|
||||
}
|
||||
|
||||
#endif
|
||||
for (; i < len; i++) {
|
||||
dst[i] = val;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue