diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/include/pg_config.h.in | 3 | ||||
-rw-r--r-- | src/include/port/pg_bitutils.h | 17 | ||||
-rw-r--r-- | src/port/pg_popcount_aarch64.c | 281 |
3 files changed, 295 insertions, 6 deletions
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c6f055b3905..92f0616c400 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -712,6 +712,9 @@ /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */ #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to build with systemd support. (--with-systemd) */ #undef USE_SYSTEMD diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index a387f77c2c0..c7901bf8ddc 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +#elif POPCNT_AARCH64 +/* Use the Neon version of pg_popcount{32,64} without function pointer. */ +extern int pg_popcount32(uint32 word); +extern int pg_popcount64(uint64 word); + +/* + * We can try to use an SVE-optimized pg_popcount() on some systems. For that, + * we do use a function pointer. + */ +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); +#else +extern uint64 pg_popcount_optimized(const char *buf, int bytes); +extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); +#endif + +#else /* Use a portable implementation -- no need for a function pointer. 
*/ extern int pg_popcount32(uint32 word); diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index 29b8c0640fe..bed0f7ab242 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -18,6 +18,275 @@ #include <arm_neon.h> +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +#include <arm_sve.h> + +#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) +#include <sys/auxv.h> +#endif +#endif + +/* + * The Neon versions are built regardless of whether we are building the SVE + * versions. + */ +static uint64 pg_popcount_neon(const char *buf, int bytes); +static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask); + +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + +/* + * These are the SVE implementations of the popcount functions. + */ +static uint64 pg_popcount_sve(const char *buf, int bytes); +static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); + +/* + * The function pointers are initially set to "choose" functions. These + * functions will first set the pointers to the right implementations (based on + * what the current CPU supports) and then will call the pointer to fulfill the + * caller's request. 
+ */ +static uint64 pg_popcount_choose(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); +uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; +uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; + +static inline bool +pg_popcount_sve_available(void) +{ +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_SVE) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +#else + return false; +#endif +} + +static inline void +choose_popcount_functions(void) +{ + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } + else + { + pg_popcount_optimized = pg_popcount_neon; + pg_popcount_masked_optimized = pg_popcount_masked_neon; + } +} + +static uint64 +pg_popcount_choose(const char *buf, int bytes) +{ + choose_popcount_functions(); + return pg_popcount_optimized(buf, bytes); +} + +static uint64 +pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions(); + return pg_popcount_masked_optimized(buf, bytes, mask); +} + +/* + * pg_popcount_sve + * Returns number of 1 bits in buf + */ +pg_attribute_target("arch=armv8-a+sve") +static uint64 +pg_popcount_sve(const char *buf, int bytes) +{ + svbool_t pred = svptrue_b64(); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0), + accum3 = svdup_u64(0), + accum4 = svdup_u64(0); + uint32 vec_len = svcntb(), + bytes_per_iteration = 4 * vec_len; + uint64 popcnt = 0; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. 
+ */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + svuint64_t vec; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec)); + buf += vec_len; + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. + */ + bytes_per_iteration = 2 * vec_len; + if (bytes >= bytes_per_iteration) + { + svuint64_t vec; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2)); + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4)); + + /* + * Process any remaining data. 
+ */ + for (; bytes > 0; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8_s32(0, bytes); + vec = svld1_u8(pred, (const uint8 *) buf); + popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec)); + buf += vec_len; + } + + return popcnt; +} + +/* + * pg_popcount_masked_sve + * Returns number of 1 bits in buf after applying the mask to each byte + */ +pg_attribute_target("arch=armv8-a+sve") +static uint64 +pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask) +{ + svbool_t pred = svptrue_b64(); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0), + accum3 = svdup_u64(0), + accum4 = svdup_u64(0); + uint32 vec_len = svcntb(), + bytes_per_iteration = 4 * vec_len; + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. + */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + svuint64_t vec; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec)); + buf += vec_len; + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. 
+ */ + bytes_per_iteration = 2 * vec_len; + if (bytes >= bytes_per_iteration) + { + svuint64_t vec; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2)); + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4)); + + /* + * Process any remaining data. + */ + for (; bytes > 0; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8_s32(0, bytes); + vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask); + popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec)); + buf += vec_len; + } + + return popcnt; +} + +#else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ + +/* + * When the SVE version isn't available, there's no point in using function + * pointers to vary the implementation. We instead just make these actual + * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined. + * The compiler should be able to inline the Neon versions here. + */ +uint64 +pg_popcount_optimized(const char *buf, int bytes) +{ + return pg_popcount_neon(buf, bytes); +} + +uint64 +pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +{ + return pg_popcount_masked_neon(buf, bytes, mask); +} + +#endif /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ + /* * pg_popcount32 * Return number of 1 bits in word @@ -44,11 +313,11 @@ pg_popcount64(uint64 word) } /* - * pg_popcount_optimized + * pg_popcount_neon * Returns number of 1 bits in buf */ -uint64 -pg_popcount_optimized(const char *buf, int bytes) +static uint64 +pg_popcount_neon(const char *buf, int bytes) { uint8x16_t vec; uint64x2_t accum1 = vdupq_n_u64(0), @@ -124,11 +393,11 @@ pg_popcount_optimized(const char *buf, int bytes) } /* - * pg_popcount_masked_optimized + * pg_popcount_masked_neon * Returns number of 1 bits in buf after applying the mask to each byte */ -uint64 -pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +static uint64 +pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) { uint8x16_t vec, maskv = vdupq_n_u8(mask); |