mirror of
https://github.com/thorvg/thorvg.git
synced 2025-06-14 12:04:29 +00:00
sw_engine/neon: add aarch64 support to the ARM neonRasterPixel32 function
Improved the speed through NEON processing. Improvement rate: Lottie: (0.026321/0.026779) = +1.8%; Performance: (0.015411/0.015732) = +2.1%. Issue: https://github.com/thorvg/thorvg/issues/30
This commit is contained in:
parent
8c04b9d65e
commit
477714778c
1 changed file with 14 additions and 5 deletions
|
@@ -64,17 +64,26 @@ static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int
|
||||||
|
|
||||||
static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
|
static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
|
||||||
{
|
{
|
||||||
|
dst += offset;
|
||||||
|
|
||||||
|
uint32x4_t vectorVal = vdupq_n_u32(val);
|
||||||
|
|
||||||
|
#if TVG_AARCH64
|
||||||
|
uint32_t iterations = len / 16;
|
||||||
|
uint32_t neonFilled = iterations * 16;
|
||||||
|
uint32x4x4_t valQuad = {vectorVal, vectorVal, vectorVal, vectorVal};
|
||||||
|
for (uint32_t i = 0; i < iterations; ++i) {
|
||||||
|
vst4q_u32(dst, valQuad);
|
||||||
|
dst += 16;
|
||||||
|
}
|
||||||
|
#else
|
||||||
uint32_t iterations = len / 4;
|
uint32_t iterations = len / 4;
|
||||||
uint32_t neonFilled = iterations * 4;
|
uint32_t neonFilled = iterations * 4;
|
||||||
|
|
||||||
dst += offset;
|
|
||||||
uint32x4_t vectorVal = {val, val, val, val};
|
|
||||||
|
|
||||||
for (uint32_t i = 0; i < iterations; ++i) {
|
for (uint32_t i = 0; i < iterations; ++i) {
|
||||||
vst1q_u32(dst, vectorVal);
|
vst1q_u32(dst, vectorVal);
|
||||||
dst += 4;
|
dst += 4;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
int32_t leftovers = len - neonFilled;
|
int32_t leftovers = len - neonFilled;
|
||||||
while (leftovers--) *dst++ = val;
|
while (leftovers--) *dst++ = val;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue