mirror of
https://github.com/thorvg/thorvg.git
synced 2025-06-14 12:04:29 +00:00
sw_engine/neon: arm neonRasterPixel32 function to support aarch64
Improved the speed through neon processing. Improvements Rate: Lottie: (0.026321/0.026779) = +1.8% Performance: (0.015411/0.015732) = +2.1% issue: https://github.com/thorvg/thorvg/issues/30
This commit is contained in:
parent
8c04b9d65e
commit
477714778c
1 changed files with 14 additions and 5 deletions
|
@ -64,17 +64,26 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int
|
|||
|
||||
static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
|
||||
{
|
||||
dst += offset;
|
||||
|
||||
uint32x4_t vectorVal = vdupq_n_u32(val);
|
||||
|
||||
#if TVG_AARCH64
|
||||
uint32_t iterations = len / 16;
|
||||
uint32_t neonFilled = iterations * 16;
|
||||
uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
|
||||
for (uint32_t i = 0; i < iterations; ++i) {
|
||||
vst4q_u32(dst, valQuad);
|
||||
dst += 16;
|
||||
}
|
||||
#else
|
||||
uint32_t iterations = len / 4;
|
||||
uint32_t neonFilled = iterations * 4;
|
||||
|
||||
dst += offset;
|
||||
uint32x4_t vectorVal = {val, val, val, val};
|
||||
|
||||
for (uint32_t i = 0; i < iterations; ++i) {
|
||||
vst1q_u32(dst, vectorVal);
|
||||
dst += 4;
|
||||
}
|
||||
|
||||
#endif
|
||||
int32_t leftovers = len - neonFilled;
|
||||
while (leftovers--) *dst++ = val;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue