summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.global.in5
-rw-r--r--src/include/pg_config.h.in12
-rw-r--r--src/include/port/pg_bitutils.h11
-rw-r--r--src/makefiles/meson.build4
-rw-r--r--src/port/Makefile11
-rw-r--r--src/port/meson.build6
-rw-r--r--src/port/pg_bitutils.c5
-rw-r--r--src/port/pg_popcount_avx512.c81
-rw-r--r--src/port/pg_popcount_avx512_choose.c88
-rw-r--r--src/test/regress/expected/bit.out24
-rw-r--r--src/test/regress/sql/bit.sql4
11 files changed, 248 insertions, 3 deletions
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 8b3f8c24e08..36d880d2251 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -262,7 +262,9 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
+CFLAGS_POPCNT = @CFLAGS_POPCNT@
CFLAGS_CRC = @CFLAGS_CRC@
+CFLAGS_XSAVE = @CFLAGS_XSAVE@
PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
CXXFLAGS = @CXXFLAGS@
@@ -758,6 +760,9 @@ LIBOBJS = @LIBOBJS@
# files needed for the chosen CRC-32C implementation
PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
+# files needed for the chosen popcount implementation
+PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
+
LIBS := -lpgcommon -lpgport $(LIBS)
# to make ws2_32.lib the last library
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 591e1ca3df6..f8d3e3b6b84 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -513,6 +513,9 @@
/* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */
#undef HAVE_X86_64_POPCNTQ
+/* Define to 1 if you have XSAVE intrinsics. */
+#undef HAVE_XSAVE_INTRINSICS
+
/* Define to 1 if the system has the type `_Bool'. */
#undef HAVE__BOOL
@@ -555,9 +558,15 @@
/* Define to 1 if you have __cpuid. */
#undef HAVE__CPUID
+/* Define to 1 if you have __cpuidex. */
+#undef HAVE__CPUIDEX
+
/* Define to 1 if you have __get_cpuid. */
#undef HAVE__GET_CPUID
+/* Define to 1 if you have __get_cpuid_count. */
+#undef HAVE__GET_CPUID_COUNT
+
/* Define to 1 if your compiler understands _Static_assert. */
#undef HAVE__STATIC_ASSERT
@@ -680,6 +689,9 @@
/* Define to 1 to build with assertion checks. (--enable-cassert) */
#undef USE_ASSERT_CHECKING
+/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
+#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
/* Define to 1 to build with Bonjour support. (--with-bonjour) */
#undef USE_BONJOUR
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index de480da71e1..b453f84d8f1 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -304,6 +304,17 @@ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
+/*
+ * We can also try to use the AVX-512 popcount instruction on some systems.
+ * The implementation of that is located in its own file because it may
+ * require special compiler flags that we don't want to apply to any other
+ * files.
+ */
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+extern bool pg_popcount_avx512_available(void);
+extern uint64 pg_popcount_avx512(const char *buf, int bytes);
+#endif
+
#else
/* Use a portable implementation -- no need for a function pointer. */
extern int pg_popcount32(uint32 word);
diff --git a/src/makefiles/meson.build b/src/makefiles/meson.build
index b0f4178b3d9..5618050b306 100644
--- a/src/makefiles/meson.build
+++ b/src/makefiles/meson.build
@@ -100,8 +100,10 @@ pgxs_kv = {
' '.join(cflags_no_decl_after_statement),
'CFLAGS_CRC': ' '.join(cflags_crc),
+ 'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
+ 'CFLAGS_XSAVE': ' '.join(cflags_xsave),
'LDFLAGS': var_ldflags,
'LDFLAGS_EX': var_ldflags_ex,
@@ -177,7 +179,7 @@ pgxs_empty = [
'WANTED_LANGUAGES',
# Not needed because we don't build the server / PLs with the generated makefile
- 'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
+ 'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
'DTRACEFLAGS', # only server has dtrace probes
'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp',
diff --git a/src/port/Makefile b/src/port/Makefile
index dcc8737e682..db7c02117b0 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
OBJS = \
$(LIBOBJS) \
$(PG_CRC32C_OBJS) \
+ $(PG_POPCNT_OBJS) \
bsearch_arg.o \
chklocale.o \
inet_net_ntop.o \
@@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
+pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
+pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
+
+# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
+pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
+pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
+
#
# Shared library versions of object files
#
diff --git a/src/port/meson.build b/src/port/meson.build
index 92b593e6ef3..fd9ee199d1b 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -84,6 +84,8 @@ replace_funcs_pos = [
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+ ['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
+ ['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
# arm / aarch64
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
@@ -98,8 +100,8 @@ replace_funcs_pos = [
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
]
-pgport_cflags = {'crc': cflags_crc}
-pgport_sources_cflags = {'crc': []}
+pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave}
+pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
foreach f : replace_funcs_neg
func = f.get(0)
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 6271acea600..411be90f734 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -163,6 +163,11 @@ choose_popcount_functions(void)
pg_popcount64 = pg_popcount64_slow;
pg_popcount_optimized = pg_popcount_slow;
}
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+ if (pg_popcount_avx512_available())
+ pg_popcount_optimized = pg_popcount_avx512;
+#endif
}
static int
diff --git a/src/port/pg_popcount_avx512.c b/src/port/pg_popcount_avx512.c
new file mode 100644
index 00000000000..908817617ac
--- /dev/null
+++ b/src/port/pg_popcount_avx512.c
@@ -0,0 +1,81 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512.c
+ * Holds the AVX-512 pg_popcount() implementation.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_popcount_avx512.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include <immintrin.h>
+
+#include "port/pg_bitutils.h"
+
+/*
+ * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
+ * the function pointers that are only used when TRY_POPCNT_FAST is set.
+ */
+#ifdef TRY_POPCNT_FAST
+
+/*
+ * pg_popcount_avx512
+ * Returns the number of 1-bits in buf
+ */
+uint64
+pg_popcount_avx512(const char *buf, int bytes)
+{
+ __m512i val,
+ cnt;
+ __m512i accum = _mm512_setzero_si512();
+ const char *final;
+ int tail_idx;
+ __mmask64 mask = ~UINT64CONST(0);
+
+ /*
+ * Align buffer down to avoid double load overhead from unaligned access.
+ * Calculate a mask to ignore preceding bytes. Find start offset of final
+ * iteration and ensure it is not empty.
+ */
+ mask <<= ((uintptr_t) buf) % sizeof(__m512i);
+ tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
+ final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
+ buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
+
+ /*
+ * Iterate through all but the final iteration. Starting from the second
+ * iteration, the mask is ignored.
+ */
+ if (buf < final)
+ {
+ val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
+ cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+
+ buf += sizeof(__m512i);
+ mask = ~UINT64CONST(0);
+
+ for (; buf < final; buf += sizeof(__m512i))
+ {
+ val = _mm512_load_si512((const __m512i *) buf);
+ cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+ }
+ }
+
+ /* Final iteration needs to ignore bytes that are not within the length */
+ mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
+
+ val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
+ cnt = _mm512_popcnt_epi64(val);
+ accum = _mm512_add_epi64(accum, cnt);
+
+ return _mm512_reduce_add_epi64(accum);
+}
+
+#endif /* TRY_POPCNT_FAST */
diff --git a/src/port/pg_popcount_avx512_choose.c b/src/port/pg_popcount_avx512_choose.c
new file mode 100644
index 00000000000..ae3fa3d3067
--- /dev/null
+++ b/src/port/pg_popcount_avx512_choose.c
@@ -0,0 +1,88 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_popcount_avx512_choose.c
+ * Test whether we can use the AVX-512 pg_popcount() implementation.
+ *
+ * Copyright (c) 2024, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/port/pg_popcount_avx512_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
+#include <intrin.h>
+#endif
+
+#include "port/pg_bitutils.h"
+
+/*
+ * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
+ * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
+ * the function pointers that are only used when TRY_POPCNT_FAST is set.
+ */
+#ifdef TRY_POPCNT_FAST
+
+/*
+ * Returns true if the CPU supports the instructions required for the AVX-512
+ * pg_popcount() implementation.
+ */
+bool
+pg_popcount_avx512_available(void)
+{
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+ /* Does CPUID say there's support for AVX-512 popcount instructions? */
+#if defined(HAVE__GET_CPUID_COUNT)
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+ __cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+ if ((exx[2] & (1 << 14)) == 0) /* avx512-vpopcntdq */
+ return false;
+
+ /* Does CPUID say there's support for AVX-512 byte and word instructions? */
+ memset(exx, 0, sizeof(exx));
+#if defined(HAVE__GET_CPUID_COUNT)
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+ __cpuidex(exx, 7, 0);
+#else
+#error cpuid instruction not available
+#endif
+ if ((exx[1] & (1 << 30)) == 0) /* avx512-bw */
+ return false;
+
+ /* Does CPUID say there's support for XSAVE instructions? */
+ memset(exx, 0, sizeof(exx));
+#if defined(HAVE__GET_CPUID)
+ __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+ __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+ if ((exx[2] & (1 << 26)) == 0) /* xsave */
+ return false;
+
+ /* Does XGETBV say the ZMM registers are enabled? */
+#ifdef HAVE_XSAVE_INTRINSICS
+ return (_xgetbv(0) & 0xe0) != 0;
+#else
+ return false;
+#endif
+}
+
+#endif /* TRY_POPCNT_FAST */
diff --git a/src/test/regress/expected/bit.out b/src/test/regress/expected/bit.out
index e17cbf42ca2..6a436288bb1 100644
--- a/src/test/regress/expected/bit.out
+++ b/src/test/regress/expected/bit.out
@@ -740,6 +740,30 @@ SELECT bit_count(B'1111111111'::bit(10));
10
(1 row)
+SELECT bit_count(repeat('0', 100)::bit(100));
+ bit_count
+-----------
+ 0
+(1 row)
+
+SELECT bit_count(repeat('1', 100)::bit(100));
+ bit_count
+-----------
+ 100
+(1 row)
+
+SELECT bit_count(repeat('01', 500)::bit(1000));
+ bit_count
+-----------
+ 500
+(1 row)
+
+SELECT bit_count(repeat('10101', 200)::bit(1000));
+ bit_count
+-----------
+ 600
+(1 row)
+
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(
b1 bit(4) DEFAULT '1001',
diff --git a/src/test/regress/sql/bit.sql b/src/test/regress/sql/bit.sql
index 34230b99fba..8ba6facd032 100644
--- a/src/test/regress/sql/bit.sql
+++ b/src/test/regress/sql/bit.sql
@@ -223,6 +223,10 @@ SELECT overlay(B'0101011100' placing '001' from 20);
-- bit_count
SELECT bit_count(B'0101011100'::bit(10));
SELECT bit_count(B'1111111111'::bit(10));
+SELECT bit_count(repeat('0', 100)::bit(100));
+SELECT bit_count(repeat('1', 100)::bit(100));
+SELECT bit_count(repeat('01', 500)::bit(1000));
+SELECT bit_count(repeat('10101', 200)::bit(1000));
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
CREATE TABLE bit_defaults(