diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/include/pg_config.h.in | 3 | ||||
-rw-r--r-- | src/include/port/pg_bitutils.h | 17 | ||||
-rw-r--r-- | src/port/pg_popcount_aarch64.c | 281 |
3 files changed, 295 insertions, 6 deletions
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c6f055b3905..92f0616c400 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -712,6 +712,9 @@ /* Define to 1 to use Intel SSE 4.2 CRC instructions with a runtime check. */ #undef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK +/* Define to 1 to use SVE popcount instructions with a runtime check. */ +#undef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + /* Define to build with systemd support. (--with-systemd) */ #undef USE_SYSTEMD diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index a387f77c2c0..c7901bf8ddc 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -324,6 +324,23 @@ extern uint64 pg_popcount_avx512(const char *buf, int bytes); extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); #endif +#elif POPCNT_AARCH64 +/* Use the Neon version of pg_popcount{32,64} without function pointer. */ +extern int pg_popcount32(uint32 word); +extern int pg_popcount64(uint64 word); + +/* + * We can try to use an SVE-optimized pg_popcount() on some systems. For that, + * we do use a function pointer. + */ +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); +extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); +#else +extern uint64 pg_popcount_optimized(const char *buf, int bytes); +extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); +#endif + +#else /* Use a portable implementation -- no need for a function pointer. 
*/ extern int pg_popcount32(uint32 word); diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index 29b8c0640fe..bed0f7ab242 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -18,6 +18,275 @@ #include <arm_neon.h> +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK +#include <arm_sve.h> + +#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL) +#include <sys/auxv.h> +#endif +#endif + +/* + * The Neon versions are built regardless of whether we are building the SVE + * versions. + */ +static uint64 pg_popcount_neon(const char *buf, int bytes); +static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask); + +#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK + +/* + * These are the SVE implementations of the popcount functions. + */ +static uint64 pg_popcount_sve(const char *buf, int bytes); +static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask); + +/* + * The function pointers are initially set to "choose" functions. These + * functions will first set the pointers to the right implementations (based on + * what the current CPU supports) and then will call the pointer to fulfill the + * caller's request. 
+ */ +static uint64 pg_popcount_choose(const char *buf, int bytes); +static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); +uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; +uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; + +static inline bool +pg_popcount_sve_available(void) +{ +#ifdef HAVE_ELF_AUX_INFO + unsigned long value; + + return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 && + (value & HWCAP_SVE) != 0; +#elif defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0; +#else + return false; +#endif +} + +static inline void +choose_popcount_functions(void) +{ + if (pg_popcount_sve_available()) + { + pg_popcount_optimized = pg_popcount_sve; + pg_popcount_masked_optimized = pg_popcount_masked_sve; + } + else + { + pg_popcount_optimized = pg_popcount_neon; + pg_popcount_masked_optimized = pg_popcount_masked_neon; + } +} + +static uint64 +pg_popcount_choose(const char *buf, int bytes) +{ + choose_popcount_functions(); + return pg_popcount_optimized(buf, bytes); +} + +static uint64 +pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask) +{ + choose_popcount_functions(); + return pg_popcount_masked_optimized(buf, bytes, mask); +} + +/* + * pg_popcount_sve + * Returns number of 1 bits in buf + */ +pg_attribute_target("arch=armv8-a+sve") +static uint64 +pg_popcount_sve(const char *buf, int bytes) +{ + svbool_t pred = svptrue_b64(); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0), + accum3 = svdup_u64(0), + accum4 = svdup_u64(0); + uint32 vec_len = svcntb(), + bytes_per_iteration = 4 * vec_len; + uint64 popcnt = 0; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. 
+ */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + svuint64_t vec; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec)); + buf += vec_len; + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. + */ + bytes_per_iteration = 2 * vec_len; + if (bytes >= bytes_per_iteration) + { + svuint64_t vec; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svld1_u64(pred, (const uint64 *) buf); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2)); + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4)); + + /* + * Process any remaining data. 
+ */ + for (; bytes > 0; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8_s32(0, bytes); + vec = svld1_u8(pred, (const uint8 *) buf); + popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec)); + buf += vec_len; + } + + return popcnt; +} + +/* + * pg_popcount_masked_sve + * Returns number of 1 bits in buf after applying the mask to each byte + */ +pg_attribute_target("arch=armv8-a+sve") +static uint64 +pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask) +{ + svbool_t pred = svptrue_b64(); + svuint64_t accum1 = svdup_u64(0), + accum2 = svdup_u64(0), + accum3 = svdup_u64(0), + accum4 = svdup_u64(0); + uint32 vec_len = svcntb(), + bytes_per_iteration = 4 * vec_len; + uint64 popcnt = 0, + mask64 = ~UINT64CONST(0) / 0xFF * mask; + + /* + * For better instruction-level parallelism, each loop iteration operates + * on a block of four registers. + */ + for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration) + { + svuint64_t vec; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec)); + buf += vec_len; + } + + /* + * If enough data remains, do another iteration on a block of two + * registers. 
+ */ + bytes_per_iteration = 2 * vec_len; + if (bytes >= bytes_per_iteration) + { + svuint64_t vec; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec)); + buf += vec_len; + + vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64); + accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec)); + buf += vec_len; + + bytes -= bytes_per_iteration; + } + + /* + * Add the accumulators. + */ + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2)); + popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4)); + + /* + * Process any remaining data. + */ + for (; bytes > 0; bytes -= vec_len) + { + svuint8_t vec; + + pred = svwhilelt_b8_s32(0, bytes); + vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask); + popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec)); + buf += vec_len; + } + + return popcnt; +} + +#else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ + +/* + * When the SVE version isn't available, there's no point in using function + * pointers to vary the implementation. We instead just make these actual + * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined. + * The compiler should be able to inline the Neon versions here. + */ +uint64 +pg_popcount_optimized(const char *buf, int bytes) +{ + return pg_popcount_neon(buf, bytes); +} + +uint64 +pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +{ + return pg_popcount_masked_neon(buf, bytes, mask); +} + +#endif /* ! 
USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ + /* * pg_popcount32 * Return number of 1 bits in word @@ -44,11 +313,11 @@ pg_popcount64(uint64 word) } /* - * pg_popcount_optimized + * pg_popcount_neon * Returns number of 1 bits in buf */ -uint64 -pg_popcount_optimized(const char *buf, int bytes) +static uint64 +pg_popcount_neon(const char *buf, int bytes) { uint8x16_t vec; uint64x2_t accum1 = vdupq_n_u64(0), @@ -124,11 +393,11 @@ pg_popcount_optimized(const char *buf, int bytes) } /* - * pg_popcount_masked_optimized + * pg_popcount_masked_neon * Returns number of 1 bits in buf after applying the mask to each byte */ -uint64 -pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) +static uint64 +pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) { uint8x16_t vec, maskv = vdupq_n_u8(mask); |