Use native CRC instructions on 64-bit LoongArch

author John Naylor <john.naylor@postgresql.org>

Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)

committer John Naylor <john.naylor@postgresql.org>

Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
author John Naylor <john.naylor@postgresql.org>
Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
committer John Naylor <john.naylor@postgresql.org>
Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
diff --git a/config/c-compiler.m4 b/config/c-compiler.m4

index 5be8f0f08dcb53ae50bd6b9abe22cb42210680ed..5db02b2ab75de609ccfb888f96be0dc68e40a808 100644 (file)
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,36 @@ if test x"$Ac_cachevar" = x"yes"; then
  fi
  undefine([Ac_cachevar])dnl
  ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.  This is a link test; no extra flags are added.
+#
+# We test for the 8-byte variant since the LoongArch platforms capable of
+# running Postgres are 64-bit only (as of PG17), and we know CRC
+# instructions are available there without a runtime check.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics=yes.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
+  [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure

index 963fbbcf1e75ff06712b3ab5648591a3fbaecca3..86ffccb1ee18bdd20838a5d41bdef7010175fc55 100755 (executable)
--- a/configure
+++ b/configure
@@ -18047,6 +18047,47 @@ fi
  
  fi
  
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
  
  
  # Select CRC-32C implementation.
@@ -18063,9 +18104,12 @@ fi
  # we're not targeting such a processor, but can nevertheless produce code that
  # uses the CRC instructions, compile both, and select at runtime.
  #
-# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
+# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
  # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+#
+# If we are targeting a LoongArch processor, CRC instructions are
+# always available (at least on 64 bit), so no runtime check is needed.
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
    # Use Intel SSE 4.2 if available.
    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
      USE_SSE42_CRC32C=1
@@ -18083,10 +18127,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
          if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
            USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
          else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-   fi
+          # LoongArch CRCC instructions.
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
        fi
      fi
    fi
@@ -18127,12 +18176,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
          { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
  $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
        else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
  
  $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
  
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
  $as_echo "slicing-by-8" >&6; }
+        fi
        fi
      fi
    fi
diff --git a/configure.ac b/configure.ac

index 5153b8b3fdc25ee3eaf90141952176b153abe093..116c5a1f68b078e48cff860e1fd78d45f025fad6 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
    PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
  fi
  
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS()
+
  AC_SUBST(CFLAGS_CRC)
  
  # Select CRC-32C implementation.
@@ -2115,9 +2121,12 @@ AC_SUBST(CFLAGS_CRC)
  # we're not targeting such a processor, but can nevertheless produce code that
  # uses the CRC instructions, compile both, and select at runtime.
  #
-# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
+# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
  # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+#
+# If we are targeting a LoongArch processor, CRC instructions are
+# always available (at least on 64 bit), so no runtime check is needed.
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
    # Use Intel SSE 4.2 if available.
    if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
      USE_SSE42_CRC32C=1
@@ -2135,10 +2144,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
          if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
            USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
          else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-   fi
+          # LoongArch CRCC instructions.
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
        fi
      fi
    fi
@@ -2166,9 +2180,15 @@ else
          PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
          AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
        else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
        fi
      fi
    fi
diff --git a/meson.build b/meson.build

index 0a11efc97a1ac740c3ec1b67418389b14b1b769d..2acb2040037f5f7c0cbfb8ec358136f479c606d2 100644 (file)
--- a/meson.build
+++ b/meson.build
@@ -2065,6 +2065,30 @@ int main(void)
      cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
      have_optimized_crc = true
    endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
  endif
  
  if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in

index ee209d6d7023148d8f69a40f5e14a9f494fbd525..d8a2985567fbf19c0a4a46c19946c6369b143887 100644 (file)
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -714,6 +714,9 @@
  /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
  #undef USE_LLVM
  
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
  /* Define to 1 to build with LZ4 support. (--with-lz4) */
  #undef USE_LZ4
  
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h

index 7f8779261c3c98949413c1a9ad7679f666bb8e52..d085f1dc00b578b4d7201a1ff399bb81a132e645 100644 (file)
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
  
  extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
  
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)                            \
+   ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
  #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
  
  /*
diff --git a/src/port/meson.build b/src/port/meson.build

index 9d0cd93c438c7457ee713634cfdcacb2093933b5..deb354418dbb48df0dee5b4c52ae0ac170338a73 100644 (file)
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
    ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
    ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
  
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
    # generic fallback
    ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
  ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c

new file mode 100644 (file)

index 0000000..db9da80
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,85 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *   Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *   src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+/*
+ * pg_comp_crc32c_loongarch
+ *   Fold 'len' bytes at 'data' into the running CRC-32C value 'crc' using
+ *   the LoongArch CRCC instructions, and return the updated value.
+ *
+ * No initial or final inversion of the CRC is applied here.  A zero 'len'
+ * returns 'crc' unchanged.
+ */
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+   const unsigned char *p = data;
+   const unsigned char *pend = p + len;
+
+   /*
+    * LoongArch doesn't require alignment, but aligned memory access is
+    * significantly faster. Process leading bytes so that the loop below
+    * starts with a pointer aligned to eight bytes.
+    *
+    * Remaining length is tested as "pend - p" rather than by comparing
+    * "p + N" with pend, so that no pointer beyond one past the end of the
+    * buffer is ever formed.
+    */
+   if (!PointerIsAligned(p, uint16) &&
+       pend - p >= 1)
+   {
+       crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+       p += 1;
+   }
+   if (!PointerIsAligned(p, uint32) &&
+       pend - p >= 2)
+   {
+       crc = __builtin_loongarch_crcc_w_h_w(*(const uint16 *) p, crc);
+       p += 2;
+   }
+   if (!PointerIsAligned(p, uint64) &&
+       pend - p >= 4)
+   {
+       crc = __builtin_loongarch_crcc_w_w_w(*(const uint32 *) p, crc);
+       p += 4;
+   }
+
+   /* Process eight bytes at a time, as far as we can. */
+   while (pend - p >= 8)
+   {
+       crc = __builtin_loongarch_crcc_w_d_w(*(const uint64 *) p, crc);
+       p += 8;
+   }
+
+   /* Process remaining 0-7 bytes. */
+   if (pend - p >= 4)
+   {
+       crc = __builtin_loongarch_crcc_w_w_w(*(const uint32 *) p, crc);
+       p += 4;
+   }
+   if (pend - p >= 2)
+   {
+       crc = __builtin_loongarch_crcc_w_h_w(*(const uint16 *) p, crc);
+       p += 2;
+   }
+   if (p < pend)
+   {
+       crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+   }
+
+   return crc;
+}
author	John Naylor <john.naylor@postgresql.org>
	Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
committer	John Naylor <john.naylor@postgresql.org>
	Thu, 10 Aug 2023 04:36:15 +0000 (11:36 +0700)
config/c-compiler.m4		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
meson.build		patch \| blob \| blame \| history
src/include/pg_config.h.in		patch \| blob \| blame \| history
src/include/port/pg_crc32c.h		patch \| blob \| blame \| history
src/port/meson.build		patch \| blob \| blame \| history
src/port/pg_crc32c_loongarch.c	[new file with mode: 0644]	patch \| blob