diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/Makefile.global.in | 5 | ||||
| -rw-r--r-- | src/include/pg_config.h.in | 12 | ||||
| -rw-r--r-- | src/include/port/pg_bitutils.h | 11 | ||||
| -rw-r--r-- | src/makefiles/meson.build | 4 | ||||
| -rw-r--r-- | src/port/Makefile | 11 | ||||
| -rw-r--r-- | src/port/meson.build | 6 | ||||
| -rw-r--r-- | src/port/pg_bitutils.c | 5 | ||||
| -rw-r--r-- | src/port/pg_popcount_avx512.c | 81 | ||||
| -rw-r--r-- | src/port/pg_popcount_avx512_choose.c | 88 | ||||
| -rw-r--r-- | src/test/regress/expected/bit.out | 24 | ||||
| -rw-r--r-- | src/test/regress/sql/bit.sql | 4 |
11 files changed, 248 insertions, 3 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in index 8b3f8c24e08..36d880d2251 100644 --- a/src/Makefile.global.in +++ b/src/Makefile.global.in @@ -262,7 +262,9 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@ CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@ CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@ CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@ +CFLAGS_POPCNT = @CFLAGS_POPCNT@ CFLAGS_CRC = @CFLAGS_CRC@ +CFLAGS_XSAVE = @CFLAGS_XSAVE@ PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@ CXXFLAGS = @CXXFLAGS@ @@ -758,6 +760,9 @@ LIBOBJS = @LIBOBJS@ # files needed for the chosen CRC-32C implementation PG_CRC32C_OBJS = @PG_CRC32C_OBJS@ +# files needed for the chosen popcount implementation +PG_POPCNT_OBJS = @PG_POPCNT_OBJS@ + LIBS := -lpgcommon -lpgport $(LIBS) # to make ws2_32.lib the last library diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 591e1ca3df6..f8d3e3b6b84 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -513,6 +513,9 @@ /* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */ #undef HAVE_X86_64_POPCNTQ +/* Define to 1 if you have XSAVE intrinsics. */ +#undef HAVE_XSAVE_INTRINSICS + /* Define to 1 if the system has the type `_Bool'. */ #undef HAVE__BOOL @@ -555,9 +558,15 @@ /* Define to 1 if you have __cpuid. */ #undef HAVE__CPUID +/* Define to 1 if you have __cpuidex. */ +#undef HAVE__CPUIDEX + /* Define to 1 if you have __get_cpuid. */ #undef HAVE__GET_CPUID +/* Define to 1 if you have __get_cpuid_count. */ +#undef HAVE__GET_CPUID_COUNT + /* Define to 1 if your compiler understands _Static_assert. */ #undef HAVE__STATIC_ASSERT @@ -680,6 +689,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ +#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK + /* Define to 1 to build with Bonjour support. (--with-bonjour) */ #undef USE_BONJOUR diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index de480da71e1..b453f84d8f1 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -304,6 +304,17 @@ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +/* + * We can also try to use the AVX-512 popcount instruction on some systems. + * The implementation of that is located in its own file because it may + * require special compiler flags that we don't want to apply to any other + * files. + */ +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK +extern bool pg_popcount_avx512_available(void); +extern uint64 pg_popcount_avx512(const char *buf, int bytes); +#endif + #else /* Use a portable implementation -- no need for a function pointer. */ extern int pg_popcount32(uint32 word); diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build index b0f4178b3d9..5618050b306 100644 --- a/src/makefiles/meson.build +++ b/src/makefiles/meson.build @@ -100,8 +100,10 @@ pgxs_kv = { ' '.join(cflags_no_decl_after_statement), 'CFLAGS_CRC': ' '.join(cflags_crc), + 'CFLAGS_POPCNT': ' '.join(cflags_popcnt), 'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags), 'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags), + 'CFLAGS_XSAVE': ' '.join(cflags_xsave), 'LDFLAGS': var_ldflags, 'LDFLAGS_EX': var_ldflags_ex, @@ -177,7 +179,7 @@ pgxs_empty = [ 'WANTED_LANGUAGES', # Not needed because we don't build the server / PLs with the generated makefile - 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS', + 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS', 'DTRACEFLAGS', # only server has dtrace probes 'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp', diff --git a/src/port/Makefile b/src/port/Makefile index dcc8737e682..db7c02117b0 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS) OBJS = \ $(LIBOBJS) \ $(PG_CRC32C_OBJS) \ + $(PG_POPCNT_OBJS) \ bsearch_arg.o \ chklocale.o \ inet_net_ntop.o \ @@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE +pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE) +pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE) + +# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT +pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT) +pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT) + # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 92b593e6ef3..fd9ee199d1b 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -84,6 +84,8 @@ replace_funcs_pos = [ ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'], + ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'], # arm / aarch64 ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'], @@ -98,8 +100,8 @@ replace_funcs_pos = [ ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] -pgport_cflags = {'crc': cflags_crc} -pgport_sources_cflags = {'crc': []} +pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave} +pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []} foreach f : replace_funcs_neg func = f.get(0) diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 6271acea600..411be90f734 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -163,6 +163,11 @@ choose_popcount_functions(void) pg_popcount64 = pg_popcount64_slow; pg_popcount_optimized = pg_popcount_slow; } + +#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK + if (pg_popcount_avx512_available()) + pg_popcount_optimized = pg_popcount_avx512; +#endif } static int diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c new file mode 100644 index 00000000000..908817617ac --- /dev/null +++ b/src/port/pg_popcount_avx512.c @@ -0,0 +1,81 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512.c + * Holds the AVX-512 pg_popcount() implementation. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include <immintrin.h> + +#include "port/pg_bitutils.h" + +/* + * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on + * the function pointers that are only used when TRY_POPCNT_FAST is set. + */ +#ifdef TRY_POPCNT_FAST + +/* + * pg_popcount_avx512 + * Returns the number of 1-bits in buf + */ +uint64 +pg_popcount_avx512(const char *buf, int bytes) +{ + __m512i val, + cnt; + __m512i accum = _mm512_setzero_si512(); + const char *final; + int tail_idx; + __mmask64 mask = ~UINT64CONST(0); + + /* + * Align buffer down to avoid double load overhead from unaligned access. + * Calculate a mask to ignore preceding bytes. Find start offset of final + * iteration and ensure it is not empty. + */ + mask <<= ((uintptr_t) buf) % sizeof(__m512i); + tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1; + final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1); + buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf); + + /* + * Iterate through all but the final iteration. Starting from the second + * iteration, the mask is ignored. + */ + if (buf < final) + { + val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + + buf += sizeof(__m512i); + mask = ~UINT64CONST(0); + + for (; buf < final; buf += sizeof(__m512i)) + { + val = _mm512_load_si512((const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + } + } + + /* Final iteration needs to ignore bytes that are not within the length */ + mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx)); + + val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); + cnt = _mm512_popcnt_epi64(val); + accum = _mm512_add_epi64(accum, cnt); + + return _mm512_reduce_add_epi64(accum); +} + +#endif /* TRY_POPCNT_FAST */ diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c new file mode 100644 index 00000000000..ae3fa3d3067 --- /dev/null +++ b/src/port/pg_popcount_avx512_choose.c @@ -0,0 +1,88 @@ +/*------------------------------------------------------------------------- + * + * pg_popcount_avx512_choose.c + * Test whether we can use the AVX-512 pg_popcount() implementation. + * + * Copyright (c) 2024, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/port/pg_popcount_avx512_choose.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) +#include <cpuid.h> +#endif + +#ifdef HAVE_XSAVE_INTRINSICS +#include <immintrin.h> +#endif + +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) +#include <intrin.h> +#endif + +#include "port/pg_bitutils.h" + +/* + * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to + * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on + * the function pointers that are only used when TRY_POPCNT_FAST is set. + */ +#ifdef TRY_POPCNT_FAST + +/* + * Returns true if the CPU supports the instructions required for the AVX-512 + * pg_popcount() implementation. + */ +bool +pg_popcount_avx512_available(void) +{ + unsigned int exx[4] = {0, 0, 0, 0}; + + /* Does CPUID say there's support for AVX-512 popcount instructions? */ +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + if ((exx[2] & (1 << 14)) == 0) /* avx512-vpopcntdq */ + return false; + + /* Does CPUID say there's support for AVX-512 byte and word instructions? */ + memset(exx, 0, sizeof(exx)); +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#else +#error cpuid instruction not available +#endif + if ((exx[1] & (1 << 30)) == 0) /* avx512-bw */ + return false; + + /* Does CPUID say there's support for XSAVE instructions? */ + memset(exx, 0, sizeof(exx)); +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + if ((exx[2] & (1 << 26)) == 0) /* xsave */ + return false; + + /* Does XGETBV say the ZMM registers are enabled? */ +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe0) != 0; +#else + return false; +#endif +} + +#endif /* TRY_POPCNT_FAST */ diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out index e17cbf42ca2..6a436288bb1 100644 --- a/src/test/regress/expected/bit.out +++ b/src/test/regress/expected/bit.out @@ -740,6 +740,30 @@ SELECT bit_count(B'1111111111'::bit(10)); 10 (1 row) +SELECT bit_count(repeat('0', 100)::bit(100)); + bit_count +----------- + 0 +(1 row) + +SELECT bit_count(repeat('1', 100)::bit(100)); + bit_count +----------- + 100 +(1 row) + +SELECT bit_count(repeat('01', 500)::bit(1000)); + bit_count +----------- + 500 +(1 row) + +SELECT bit_count(repeat('10101', 200)::bit(1000)); + bit_count +----------- + 600 +(1 row) + -- This table is intentionally left around to exercise pg_dump/pg_upgrade CREATE TABLE bit_defaults( b1 bit(4) DEFAULT '1001', diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql index 34230b99fba..8ba6facd032 100644 --- a/src/test/regress/sql/bit.sql +++ b/src/test/regress/sql/bit.sql @@ -223,6 +223,10 @@ SELECT overlay(B'0101011100' placing '001' from 20); -- bit_count SELECT bit_count(B'0101011100'::bit(10)); SELECT bit_count(B'1111111111'::bit(10)); +SELECT bit_count(repeat('0', 100)::bit(100)); +SELECT bit_count(repeat('1', 100)::bit(100)); +SELECT bit_count(repeat('01', 500)::bit(1000)); +SELECT bit_count(repeat('10101', 200)::bit(1000)); -- This table is intentionally left around to exercise pg_dump/pg_upgrade CREATE TABLE bit_defaults( |
