sw_engine common: Added neon version of rasterRGBA32 API.

Changes:
- Added 'neon' vector option in build system
- Introduced neon version of rasterRGBA32 API, which improves
the speed of the function on ARM CPUs by around 35%
This commit is contained in:
Michal Szczecinski 2021-07-27 13:46:57 +02:00 committed by Hermet Park
parent 386888bd11
commit 1acc25b5ac
4 changed files with 30 additions and 4 deletions

View file

@ -42,6 +42,10 @@ if get_option('vectors').contains('avx') == true
config_h.set10('THORVG_AVX_VECTOR_SUPPORT', true)
endif
if get_option('vectors').contains('neon') == true
config_h.set10('THORVG_NEON_VECTOR_SUPPORT', true)
endif
if get_option('bindings').contains('capi') == true
config_h.set10('THORVG_CAPI_BINDING_SUPPORT', true)
endif

View file

@ -17,9 +17,9 @@ option('savers',
description: 'Enable File Savers in thorvg')
option('vectors',
type: 'array',
choices: ['', 'avx'],
value: [''],
type: 'combo',
choices: ['', 'avx', 'neon'],
value: '',
description: 'Enable CPU Vectorization(SIMD) in thorvg')
option('bindings',

View file

@ -29,6 +29,10 @@
#include <immintrin.h>
#endif
#ifdef THORVG_NEON_VECTOR_SUPPORT
#include <arm_neon.h>
#endif
#if 0
#include <sys/time.h>
static double timeStamp()
@ -362,7 +366,7 @@ bool rasterClear(SwSurface* surface);
static inline void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
{
#ifdef THORVG_AVX_VECTOR_SUPPORT
#if defined(THORVG_AVX_VECTOR_SUPPORT)
//1. calculate how many iterations we need to cover length
uint32_t iterations = len / 8;
uint32_t avxFilled = iterations * 8;
@ -382,6 +386,21 @@ static inline void rasterRGBA32(uint32_t *dst, uint32_t val, uint32_t offset, in
leftovers = len - avxFilled;
dst+= avxFilled;
while (leftovers--) *dst++ = val;
#elif defined(THORVG_NEON_VECTOR_SUPPORT)
uint32_t iterations = len / 4;
uint32_t neonFilled = iterations * 4;
int32_t leftovers = 0;
dst+=offset;
uint32x4_t vectorVal = { val, val, val, val };
for (uint32_t i = 0; i < iterations; ++i) {
vst1q_u32(dst, vectorVal);
dst += 4;
}
leftovers = len - neonFilled;
while (leftovers--) *dst++ = val;
#else
dst += offset;

View file

@ -5,6 +5,9 @@ if (cc.get_id() != 'msvc')
if get_option('vectors').contains('avx')
compiler_flags += ['-mavx']
endif
if get_option('vectors').contains('neon')
compiler_flags += ['-mfpu=neon-vfpv4']
endif
if get_option('b_sanitize') == 'none'
compiler_flags += ['-fno-exceptions', '-fno-rtti',
'-fno-unwind-tables' , '-fno-asynchronous-unwind-tables',