summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/include/pg_config.h.in3
-rw-r--r--src/include/port/pg_crc32c.h39
-rw-r--r--src/port/meson.build1
-rw-r--r--src/port/pg_crc32c_sse42.c94
-rw-r--r--src/port/pg_crc32c_sse42_choose.c75
-rw-r--r--src/test/regress/expected/strings.out24
-rw-r--r--src/test/regress/sql/strings.sql5
7 files changed, 215 insertions, 26 deletions
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c2f1241b234..9891b9b05c3 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -665,6 +665,9 @@
/* Define to 1 to build with assertion checks. (--enable-cassert) */
#undef USE_ASSERT_CHECKING
+/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */
+#undef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 9376d223fef..82313bb7fcf 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -42,7 +42,10 @@ typedef uint32 pg_crc32c;
#define EQ_CRC32C(c1, c2) ((c1) == (c2))
#if defined(USE_SSE42_CRC32C)
-/* Use Intel SSE4.2 instructions. */
+/*
+ * Use either Intel SSE 4.2 or AVX-512 instructions. We don't need a runtime check
+ * for SSE 4.2, so we can inline those in some cases.
+ */
#include <nmmintrin.h>
@@ -50,7 +53,11 @@ typedef uint32 pg_crc32c;
((crc) = pg_comp_crc32c_dispatch((crc), (data), (len)))
#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+#endif
/*
* We can only get here if the host compiler targets SSE 4.2, but on some
@@ -82,9 +89,27 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len)
return crc;
}
else
- return pg_comp_crc32c_sse42(crc, data, len);
+ /* Otherwise, use a runtime check for AVX-512 instructions. */
+ return pg_comp_crc32c(crc, data, len);
}
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+
+/*
+ * Use Intel SSE 4.2 or AVX-512 instructions, but perform a runtime check first
+ * to check that they are available.
+ */
+#define COMP_CRC32C(crc, data, len) \
+ ((crc) = pg_comp_crc32c((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
+extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
+extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len);
+#endif
+
#elif defined(USE_ARMV8_CRC32C)
/* Use ARMv8 CRC Extension instructions. */
@@ -103,10 +128,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le
extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
-#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
+#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
/*
- * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first
+ * Use ARMv8 instructions, but perform a runtime check first
* to check that they are available.
*/
#define COMP_CRC32C(crc, data, len) \
@@ -115,13 +140,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
-
-#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
-extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len);
-#endif
-#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
-#endif
#else
/*
diff --git a/src/port/meson.build b/src/port/meson.build
index 51041e75609..48d2dfb7cf3 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -86,6 +86,7 @@ replace_funcs_pos = [
# x86/x64
['pg_crc32c_sse42', 'USE_SSE42_CRC32C'],
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
+ ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'],
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index 22c2137df31..db60bb3c32c 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* pg_crc32c_sse42.c
- * Compute CRC-32C checksum using Intel SSE 4.2 instructions.
+ * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions.
*
* Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -15,6 +15,9 @@
#include "c.h"
#include <nmmintrin.h>
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+#include <immintrin.h>
+#endif
#include "port/pg_crc32c.h"
@@ -68,3 +71,92 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
return crc;
}
+
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ * - match our function declaration
+ * - match whitespace to our project style
+ * - add a threshold for the alignment stanza
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */
+/* MIT licensed */
+
+#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0))
+#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17))
+
+pg_attribute_target("vpclmulqdq,avx512vl")
+pg_crc32c
+pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length)
+{
+ /* adjust names to match generated code */
+ pg_crc32c crc0 = crc;
+ size_t len = length;
+ const char *buf = data;
+
+ /* Align on cacheline boundary. The threshold is somewhat arbitrary. */
+ if (unlikely(len > 256))
+ {
+ for (; len && ((uintptr_t) buf & 7); --len)
+ crc0 = _mm_crc32_u8(crc0, *buf++);
+ while (((uintptr_t) buf & 56) && len >= 8)
+ {
+ crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf);
+ buf += 8;
+ len -= 8;
+ }
+ }
+
+ if (len >= 64)
+ {
+ const char *end = buf + len;
+ const char *limit = buf + len - 64;
+ __m128i z0;
+
+ /* First vector chunk. */
+ __m512i x0 = _mm512_loadu_si512((const void *) buf),
+ y0;
+ __m512i k;
+
+ k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0));
+ x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0);
+ buf += 64;
+
+ /* Main loop. */
+ while (buf <= limit)
+ {
+ y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k);
+ x0 = _mm512_ternarylogic_epi64(x0, y0,
+ _mm512_loadu_si512((const void *) buf),
+ 0x96);
+ buf += 64;
+ }
+
+ /* Reduce 512 bits to 128 bits. */
+ k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0,
+ 0x3da6d0cb, 0, 0xba4fc28e, 0,
+ 0xf20c0dfe, 0, 0x493c7d27, 0,
+ 0, 0, 0, 0);
+ y0 = clmul_lo(x0, k), k = clmul_hi(x0, k);
+ y0 = _mm512_xor_si512(y0, k);
+ z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0),
+ _mm512_extracti32x4_epi32(y0, 1),
+ _mm512_extracti32x4_epi32(y0, 2),
+ 0x96);
+ z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3));
+
+ /* Reduce 128 bits to 32 bits, and multiply by x^32. */
+ crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0));
+ crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1));
+ len = end - buf;
+ }
+
+ return pg_comp_crc32c_sse42(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c
index 65dbc4d4249..74d2421ba2b 100644
--- a/src/port/pg_crc32c_sse42_choose.c
+++ b/src/port/pg_crc32c_sse42_choose.c
@@ -20,30 +20,37 @@
#include "c.h"
-#ifdef HAVE__GET_CPUID
+#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
#include <cpuid.h>
#endif
-#ifdef HAVE__CPUID
+#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
#include <intrin.h>
#endif
+#ifdef HAVE_XSAVE_INTRINSICS
+#include <immintrin.h>
+#endif
+
#include "port/pg_crc32c.h"
+/*
+ * Does XGETBV say the ZMM registers are enabled?
+ *
+ * NB: Caller is responsible for verifying that osxsave is available
+ * before calling this.
+ */
+#ifdef HAVE_XSAVE_INTRINSICS
+pg_attribute_target("xsave")
+#endif
static bool
-pg_crc32c_sse42_available(void)
+zmm_regs_available(void)
{
- unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
- __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
- __cpuid(exx, 1);
+#ifdef HAVE_XSAVE_INTRINSICS
+ return (_xgetbv(0) & 0xe6) == 0xe6;
#else
-#error cpuid instruction not available
+ return false;
#endif
-
- return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */
}
/*
@@ -53,10 +60,48 @@ pg_crc32c_sse42_available(void)
static pg_crc32c
pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
{
- if (pg_crc32c_sse42_available())
+ unsigned int exx[4] = {0, 0, 0, 0};
+
+ /*
+ * Set fallback. We must guard since slicing-by-8 is not visible
+ * everywhere.
+ */
+#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK
+ pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
+#if defined(HAVE__GET_CPUID)
+ __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+ __cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+ if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */
+ {
pg_comp_crc32c = pg_comp_crc32c_sse42;
- else
- pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+ if (exx[2] & (1 << 27) && /* OSXSAVE */
+ zmm_regs_available())
+ {
+ /* second cpuid call on leaf 7 to check extended AVX-512 support */
+
+ memset(exx, 0, 4 * sizeof(exx[0]));
+
+#if defined(HAVE__GET_CPUID_COUNT)
+ __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUIDEX)
+ __cpuidex(exx, 7, 0);
+#endif
+
+#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK
+ if (exx[2] & (1 << 10) && /* VPCLMULQDQ */
+ exx[1] & (1 << 31)) /* AVX512-VL */
+ pg_comp_crc32c = pg_comp_crc32c_avx512;
+#endif
+ }
+ }
return pg_comp_crc32c(crc, data, len);
}
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index dc485735aa4..174f0a68331 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -2330,6 +2330,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.');
419469235
(1 row)
+SELECT crc32c(repeat('A', 127)::bytea);
+ crc32c
+-----------
+ 291820082
+(1 row)
+
+SELECT crc32c(repeat('A', 128)::bytea);
+ crc32c
+-----------
+ 816091258
+(1 row)
+
+SELECT crc32c(repeat('A', 129)::bytea);
+ crc32c
+------------
+ 4213642571
+(1 row)
+
+SELECT crc32c(repeat('A', 800)::bytea);
+ crc32c
+------------
+ 3134039419
+(1 row)
+
--
-- encode/decode
--
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index aeba798dac1..f7b325baadf 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -738,6 +738,11 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.');
SELECT crc32c('');
SELECT crc32c('The quick brown fox jumps over the lazy dog.');
+SELECT crc32c(repeat('A', 127)::bytea);
+SELECT crc32c(repeat('A', 128)::bytea);
+SELECT crc32c(repeat('A', 129)::bytea);
+SELECT crc32c(repeat('A', 800)::bytea);
+
--
-- encode/decode
--