sw_engine: Implement AVX and NEON optimizations for RasterGrayscale

This commit is contained in:
rinechran 2024-04-07 14:43:23 +09:00 committed by Hermet Park
parent 387d82a80e
commit 06b4b2c586
3 changed files with 41 additions and 2 deletions

View file

@ -1755,8 +1755,13 @@ static bool _rasterRadialGradientRle(SwSurface* surface, const SwRleData* rle, c
void rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len) void rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len)
{ {
//OPTIMIZE_ME: Support SIMD #if defined(THORVG_AVX_VECTOR_SUPPORT)
avxRasterGrayscale8(dst, val, offset, len);
#elif defined(THORVG_NEON_VECTOR_SUPPORT)
neonRasterGrayscale8(dst, val, offset, len);
#else
cRasterPixels(dst, val, offset, len); cRasterPixels(dst, val, offset, len);
#endif
} }

View file

@ -1,4 +1,4 @@
/* /*
* Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved. * Copyright (c) 2021 - 2024 the ThorVG project. All rights reserved.
* Permission is hereby granted, free of charge, to any person obtaining a copy * Permission is hereby granted, free of charge, to any person obtaining a copy
@ -62,6 +62,23 @@ static inline __m128i ALPHA_BLEND(__m128i c, __m128i a)
} }
static void avxRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int32_t len)
{
dst += offset;
__m256i vecVal = _mm256_set1_epi8(val);
int32_t i = 0;
for (; i <= len - 32; i += 32) {
_mm256_storeu_si256((__m256i*)(dst + i), vecVal);
}
for (; i < len; ++i) {
dst[i] = val;
}
}
static void avxRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) static void avxRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
{ {
//1. calculate how many iterations we need to cover the length //1. calculate how many iterations we need to cover the length

View file

@ -31,6 +31,23 @@ static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a)
} }
static void neonRasterGrayscale8(uint8_t* dst, uint8_t val, uint32_t offset, int32_t len)
{
dst += offset;
int32_t i = 0;
uint8x16_t valVec = vdupq_n_u8(val);
for (; i <= len - 16; i += 16) {
vst1q_u8(dst + i, valVec);
}
for (; i < len; i++) {
dst[i] = val;
}
}
static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len)
{ {
uint32_t iterations = len / 4; uint32_t iterations = len / 4;