mirror of
https://github.com/thorvg/thorvg.git
synced 2025-06-13 11:36:25 +00:00
sw_engine: add AArch64 support to the ARM NEON neonRasterGrayscale8 function
Improved the speed through NEON processing. Improvement rate: Lottie: (0.025986/0.026201) = +4.7%; Performance: (0.014163/0.014785) = +4.3%. Issue: https://github.com/thorvg/thorvg/issues/30
This commit is contained in:
parent
c8551d4856
commit
c1394668ef
1 changed file with 17 additions and 3 deletions
|
@ -24,6 +24,15 @@
|
||||||
|
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
//TODO: need to support Windows on ARM
|
||||||
|
|
||||||
|
#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
|
||||||
|
#define TVG_AARCH64 1
|
||||||
|
#else
|
||||||
|
#define TVG_AARCH64 0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
|
static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
|
||||||
{
|
{
|
||||||
uint16x8_t t = vmull_u8(c, a);
|
uint16x8_t t = vmull_u8(c, a);
|
||||||
|
@ -36,12 +45,17 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int
|
||||||
dst += offset;
|
dst += offset;
|
||||||
|
|
||||||
int32_t i = 0;
|
int32_t i = 0;
|
||||||
uint8x16_t valVec = vdupq_n_u8(val);
|
const uint8x16_t valVec = vdupq_n_u8(val);
|
||||||
|
#if TVG_AARCH64
|
||||||
|
uint8x16x4_t valQuad = {valVec, valVec, valVec, valVec};
|
||||||
|
for (; i <= len - 16 * 4; i += 16 * 4) {
|
||||||
|
vst1q_u8_x4(dst + i, valQuad);
|
||||||
|
}
|
||||||
|
#else
|
||||||
for (; i <= len - 16; i += 16) {
|
for (; i <= len - 16; i += 16) {
|
||||||
vst1q_u8(dst + i, valVec);
|
vst1q_u8(dst + i, valVec);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
for (; i < len; i++) {
|
for (; i < len; i++) {
|
||||||
dst[i] = val;
|
dst[i] = val;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue