diff options
Diffstat (limited to 'src/port')
| -rw-r--r-- | src/port/Makefile | 11 | ||||
| -rw-r--r-- | src/port/meson.build | 6 | ||||
| -rw-r--r-- | src/port/pg_bitutils.c | 5 | ||||
| -rw-r--r-- | src/port/pg_popcount_avx512.c | 81 | ||||
| -rw-r--r-- | src/port/pg_popcount_avx512_choose.c | 88 |
5 files changed, 189 insertions, 2 deletions
diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e682..db7c02117b0 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS) OBJS = \ $(LIBOBJS) \ $(PG_CRC32C_OBJS) \ + $(PG_POPCNT_OBJS) \ bsearch_arg.o \ chklocale.o \ inet_net_ntop.o \ @@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE +pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE) + +# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT +pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT) + # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 92b593e6ef3..fd9ee199d1b 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -84,6 +84,8 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'], + ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -98,8 +100,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc} -pgport_sources_cflags = {'crc': []} +pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave} +pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 6271acea600..411be90f734 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -163,6 +163,11 @@ choose_popcount_functions(void) pg_popcount64 = pg_popcount64_slow; pg_popcount_optimized = pg_popcount_slow; } + +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK + if (pg_popcount_avx512_available()) + pg_popcount_optimized = pg_popcount_avx512; +#endif } static int diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c new file mode 100644 index 00000000000..908817617ac --- /dev/null +++ b/src/port/pg_popcount_avx512.c @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512.c + * Holds the AVX-512 pg_popcount() implementation. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include <immintrin.h> + +#include "port/pg_bitutils.h" + +/* + * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on + * the function pointers that are only used when TRY_POPCNT_FAST is set. + */ +#ifdef TRY_POPCNT_FAST + +/* + * pg_popcount_avx512 + * Returns the number of 1-bits in buf + */ +uint64 +pg_popcount_avx512(const char *buf, int bytes) +{ + __m512i val, + cnt; + __m512i accum = _mm512_setzero_si512(); + const char *final; + int tail_idx; + __mmask64 mask = ~UINT64CONST(0); + + /* + * Align buffer down to avoid double load overhead from unaligned access. + * Calculate a mask to ignore preceding bytes. Find start offset of final + * iteration and ensure it is not empty. + */ + mask <<= ((uintptr_t) buf) % sizeof(__m512i); + tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1; + final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1); + buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf); + + /* + * Iterate through all but the final iteration. Starting from the second + * iteration, the mask is ignored. + */ + if (buf < final) + { + val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + + buf += sizeof(__m512i); + mask = ~UINT64CONST(0); + + for (; buf < final; buf += sizeof(__m512i)) + { + val = _mm512_load_si512((const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + } + } + + /* Final iteration needs to ignore bytes that are not within the length */ + mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx)); + + val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + + return _mm512_reduce_add_epi64(accum); +} + +#endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c new file mode 100644 index 00000000000..ae3fa3d3067 --- /dev/null +++ b/src/port/pg_popcount_avx512_choose.c @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512_choose.c + * Test whether we can use the AVX-512 pg_popcount() implementation. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512_choose.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) +#include <cpuid.h> +#endif + +#ifdef HAVE_XSAVE_INTRINSICS +#include <immintrin.h> +#endif + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#include <intrin.h> +#endif + +#include "port/pg_bitutils.h" + +/* + * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on + * the function pointers that are only used when TRY_POPCNT_FAST is set. + */ +#ifdef TRY_POPCNT_FAST + +/* + * Returns true if the CPU supports the instructions required for the AVX-512 + * pg_popcount() implementation. + */ +bool +pg_popcount_avx512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Does CPUID say there's support for AVX-512 popcount instructions? */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + if ((exx[2] & (1 << 14)) == 0) /* avx512-vpopcntdq */ + return false; + + /* Does CPUID say there's support for AVX-512 byte and word instructions? */ + memset(exx, 0, sizeof(exx)); +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + if ((exx[1] & (1 << 30)) == 0) /* avx512-bw */ + return false; + + /* Does CPUID say there's support for XSAVE instructions? */ + memset(exx, 0, sizeof(exx)); +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + if ((exx[2] & (1 << 26)) == 0) /* xsave */ + return false; + + /* Does XGETBV say the ZMM registers are enabled? */ +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe0) != 0; +#else + return false; +#endif +} + +#endif /* TRY_POPCNT_FAST */ |
