fi
undefine([Ac_cachevar])dnl
])# PGAC_AVX512_POPCNT_INTRINSICS
+
+# PGAC_SVE_POPCNT_INTRINSICS
+# --------------------------
+# Check if the compiler supports the SVE popcount instructions using the
+# svptrue_b64, svdup_u64, svcntb, svld1_u64, svld1_u8, svadd_u64_x,
+# svcnt_u64_x, svcnt_u8_x, svaddv_u64, svaddv_u8, svwhilelt_b8_s32,
+# svand_n_u64_x, and svand_n_u8_x intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_sve_popcnt_intrinsics.
+AC_DEFUN([PGAC_SVE_POPCNT_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_sve_popcnt_intrinsics])])dnl
+AC_CACHE_CHECK([for svcnt_x], [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <arm_sve.h>
+
+ char buf[128];
+
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("arch=armv8-a+sve")))
+ #endif
+ static int popcount_test(void)
+ {
+ svbool_t pred = svptrue_b64();
+ svuint8_t vec8;
+ svuint64_t accum1 = svdup_u64(0),
+ accum2 = svdup_u64(0),
+ vec64;
+ char *p = buf;
+ uint64_t popcnt,
+ mask = 0x5555555555555555;
+
+ vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
+ p += svcntb();
+
+ vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
+ p += svcntb();
+
+ popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
+
+ pred = svwhilelt_b8_s32(0, sizeof(buf));
+ vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
+ return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
+ }]],
+ [return popcount_test();])],
+ [Ac_cachevar=yes],
+ [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+ pgac_sve_popcnt_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_SVE_POPCNT_INTRINSICS
fi
fi
+# Check for SVE popcount intrinsics
+#
+if test x"$host_cpu" = x"aarch64"; then
+ { $as_echo "$as_me:${as_lineno-$LINENO}: checking for svcnt_x" >&5
+$as_echo_n "checking for svcnt_x... " >&6; }
+if ${pgac_cv_sve_popcnt_intrinsics+:} false; then :
+ $as_echo_n "(cached) " >&6
+else
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h. */
+#include <arm_sve.h>
+
+ char buf[128];
+
+ #if defined(__has_attribute) && __has_attribute (target)
+ __attribute__((target("arch=armv8-a+sve")))
+ #endif
+ static int popcount_test(void)
+ {
+ svbool_t pred = svptrue_b64();
+ svuint8_t vec8;
+ svuint64_t accum1 = svdup_u64(0),
+ accum2 = svdup_u64(0),
+ vec64;
+ char *p = buf;
+ uint64_t popcnt,
+ mask = 0x5555555555555555;
+
+ vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec64));
+ p += svcntb();
+
+ vec64 = svand_n_u64_x(pred, svld1_u64(pred, (const uint64_t *) p), mask);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec64));
+ p += svcntb();
+
+ popcnt = svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
+
+ pred = svwhilelt_b8_s32(0, sizeof(buf));
+ vec8 = svand_n_u8_x(pred, svld1_u8(pred, (const uint8_t *) p), 0x55);
+ return (int) (popcnt + svaddv_u8(pred, svcnt_u8_x(pred, vec8)));
+ }
+int
+main ()
+{
+return popcount_test();
+ ;
+ return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+ pgac_cv_sve_popcnt_intrinsics=yes
+else
+ pgac_cv_sve_popcnt_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+ conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_sve_popcnt_intrinsics" >&5
+$as_echo "$pgac_cv_sve_popcnt_intrinsics" >&6; }
+if test x"$pgac_cv_sve_popcnt_intrinsics" = x"yes"; then
+ pgac_sve_popcnt_intrinsics=yes
+fi
+
+ if test x"$pgac_sve_popcnt_intrinsics" = x"yes"; then
+
+$as_echo "#define USE_SVE_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+ fi
+fi
+
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
#
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm_crc32_u8 and _mm_crc32_u32" >&5
#include <arm_neon.h>
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+#include <arm_sve.h>
+
+#if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
+#include <sys/auxv.h>
+#endif
+#endif
+
+/*
+ * The Neon versions are built regardless of whether we are building the SVE
+ * versions.
+ */
+static uint64 pg_popcount_neon(const char *buf, int bytes);
+static uint64 pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask);
+
+#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK
+
+/*
+ * These are the SVE implementations of the popcount functions.
+ */
+static uint64 pg_popcount_sve(const char *buf, int bytes);
+static uint64 pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask);
+
+/*
+ * The function pointers are initially set to "choose" functions. These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
+uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
+
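+/*
+ * Check the AT_HWCAP auxiliary vector for the SVE capability bit, via
+ * elf_aux_info() (FreeBSD) or getauxval() (Linux).
+ */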
+static inline bool
+pg_popcount_sve_available(void)
+{
+#ifdef HAVE_ELF_AUX_INFO
+ unsigned long value;
+
+ return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+ (value & HWCAP_SVE) != 0;
+#elif defined(HAVE_GETAUXVAL)
+ return (getauxval(AT_HWCAP) & HWCAP_SVE) != 0;
+#else
+ return false;
+#endif
+}
+
+static inline void
+choose_popcount_functions(void)
+{
+ if (pg_popcount_sve_available())
+ {
+ pg_popcount_optimized = pg_popcount_sve;
+ pg_popcount_masked_optimized = pg_popcount_masked_sve;
+ }
+ else
+ {
+ pg_popcount_optimized = pg_popcount_neon;
+ pg_popcount_masked_optimized = pg_popcount_masked_neon;
+ }
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+ choose_popcount_functions();
+ return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+ choose_popcount_functions();
+ return pg_popcount_masked_optimized(buf, bytes, mask);
+}
+
+/*
+ * pg_popcount_sve
+ * Returns number of 1 bits in buf
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_sve(const char *buf, int bytes)
+{
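+	/* svcntb() gives the SVE vector length in bytes (a run-time value) */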
+ svbool_t pred = svptrue_b64();
+ svuint64_t accum1 = svdup_u64(0),
+ accum2 = svdup_u64(0),
+ accum3 = svdup_u64(0),
+ accum4 = svdup_u64(0);
+ uint32 vec_len = svcntb(),
+ bytes_per_iteration = 4 * vec_len;
+ uint64 popcnt = 0;
+
+ /*
+ * For better instruction-level parallelism, each loop iteration operates
+ * on a block of four registers.
+ */
+ for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+ {
+ svuint64_t vec;
+
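+		/* count the 1 bits in each 64-bit lane and accumulate per lane */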
+ vec = svld1_u64(pred, (const uint64 *) buf);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svld1_u64(pred, (const uint64 *) buf);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svld1_u64(pred, (const uint64 *) buf);
+ accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svld1_u64(pred, (const uint64 *) buf);
+ accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+ }
+
+ /*
+ * If enough data remains, do another iteration on a block of two
+ * registers.
+ */
+ bytes_per_iteration = 2 * vec_len;
+ if (bytes >= bytes_per_iteration)
+ {
+ svuint64_t vec;
+
+ vec = svld1_u64(pred, (const uint64 *) buf);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svld1_u64(pred, (const uint64 *) buf);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ bytes -= bytes_per_iteration;
+ }
+
+ /*
+ * Add the accumulators.
+ */
+ popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
+ popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));
+
+ /*
+ * Process any remaining data.
+ */
+ for (; bytes > 0; bytes -= vec_len)
+ {
+ svuint8_t vec;
+
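+		/* build a predicate covering only the remaining bytes */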
+ pred = svwhilelt_b8_s32(0, bytes);
+ vec = svld1_u8(pred, (const uint8 *) buf);
+ popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
+ buf += vec_len;
+ }
+
+ return popcnt;
+}
+
+/*
+ * pg_popcount_masked_sve
+ * Returns number of 1 bits in buf after applying the mask to each byte
+ */
+pg_attribute_target("arch=armv8-a+sve")
+static uint64
+pg_popcount_masked_sve(const char *buf, int bytes, bits8 mask)
+{
+ svbool_t pred = svptrue_b64();
+ svuint64_t accum1 = svdup_u64(0),
+ accum2 = svdup_u64(0),
+ accum3 = svdup_u64(0),
+ accum4 = svdup_u64(0);
+ uint32 vec_len = svcntb(),
+ bytes_per_iteration = 4 * vec_len;
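+	/* ~0 / 0xFF is 0x0101...01, so this repeats the mask in every byte */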
+ uint64 popcnt = 0,
+ mask64 = ~UINT64CONST(0) / 0xFF * mask;
+
+ /*
+ * For better instruction-level parallelism, each loop iteration operates
+ * on a block of four registers.
+ */
+ for (; bytes >= bytes_per_iteration; bytes -= bytes_per_iteration)
+ {
+ svuint64_t vec;
+
+ vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
+ accum3 = svadd_u64_x(pred, accum3, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
+ accum4 = svadd_u64_x(pred, accum4, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+ }
+
+ /*
+ * If enough data remains, do another iteration on a block of two
+ * registers.
+ */
+ bytes_per_iteration = 2 * vec_len;
+ if (bytes >= bytes_per_iteration)
+ {
+ svuint64_t vec;
+
+ vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
+ accum1 = svadd_u64_x(pred, accum1, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ vec = svand_n_u64_x(pred, svld1_u64(pred, (const uint64 *) buf), mask64);
+ accum2 = svadd_u64_x(pred, accum2, svcnt_u64_x(pred, vec));
+ buf += vec_len;
+
+ bytes -= bytes_per_iteration;
+ }
+
+ /*
+ * Add the accumulators.
+ */
+ popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum1, accum2));
+ popcnt += svaddv_u64(pred, svadd_u64_x(pred, accum3, accum4));
+
+ /*
+ * Process any remaining data.
+ */
+ for (; bytes > 0; bytes -= vec_len)
+ {
+ svuint8_t vec;
+
+ pred = svwhilelt_b8_s32(0, bytes);
+ vec = svand_n_u8_x(pred, svld1_u8(pred, (const uint8 *) buf), mask);
+ popcnt += svaddv_u8(pred, svcnt_u8_x(pred, vec));
+ buf += vec_len;
+ }
+
+ return popcnt;
+}
+
+#else /* USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * When the SVE version isn't available, there's no point in using function
+ * pointers to vary the implementation. We instead just make these actual
+ * external functions when USE_SVE_POPCNT_WITH_RUNTIME_CHECK is not defined.
+ * The compiler should be able to inline the Neon versions here.
+ */
+uint64
+pg_popcount_optimized(const char *buf, int bytes)
+{
+ return pg_popcount_neon(buf, bytes);
+}
+
+uint64
+pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+{
+ return pg_popcount_masked_neon(buf, bytes, mask);
+}
+
+#endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */
+
/*
* pg_popcount32
* Return number of 1 bits in word
}
/*
- * pg_popcount_optimized
+ * pg_popcount_neon
* Returns number of 1 bits in buf
*/
-uint64
-pg_popcount_optimized(const char *buf, int bytes)
+static uint64
+pg_popcount_neon(const char *buf, int bytes)
{
uint8x16_t vec;
uint64x2_t accum1 = vdupq_n_u64(0),
}
/*
- * pg_popcount_masked_optimized
+ * pg_popcount_masked_neon
* Returns number of 1 bits in buf after applying the mask to each byte
*/
-uint64
-pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask)
+static uint64
+pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask)
{
uint8x16_t vec,
maskv = vdupq_n_u8(mask);